# 1. Data Processing and Transformation

In [1]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Import preprocessing libraries
from sklearn.preprocessing import StandardScaler

#Show every column and every row when a dataframe is displayed (None removes the limit)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
#Load the cleaned dataset from cleaned_data.csv (the path is relative to the notebook's working directory)
file_path = '../data/raw/cleaned_data.csv'

#Read the CSV file into the dataframe `df` that the following cells work on
df = pd.read_csv(file_path)

#Display the first 5 rows of the dataset
df.head()

In [None]:
#Check the shape of the dataset as a (rows, columns) tuple
print("Shape of the dataset: ", df.shape)

#Check the columns of the dataset; printing the Index shows the exact column labels,
#including any stray whitespace in them
print("\nColumns in the dataset:\n", df.columns)

- We observe that the column names carry an extra leading space (for example `' Liability-Assets Flag'`). TODO: strip it; until then, these columns must be referenced with the leading space.

In [None]:
#Check the datatypes of the columns (df.dtypes lists one dtype per column)
print(f"Datatypes of the columns:\n{df.dtypes}")

- All the columns have dtype "float64" except for " Liability-Assets Flag" and "Bankrupt?" (the target variable).

In [None]:
#Printing the unique values in the column " Liability-Assets Flag"
#(the leading space is part of the column label used to select it)
print(f"Unique values in the column ' Liability-Assets Flag':\n{df[' Liability-Assets Flag'].unique()}")

In [None]:
#Check for missing values in the dataset: number of null entries in each column
print(f"Missing values in the dataset:\n{df.isnull().sum()}")

In [None]:
#Check for duplicate rows in the dataset: number of rows that repeat an earlier row in every column
print(f"Duplicate rows in the dataset: {df.duplicated().sum()}")

In [None]:
#Summary statistics of the dataset, persisted to csv so they can be inspected outside the notebook
summary_statistics = df.describe()
summary_statistics.to_csv('../data/raw/summary_statistics.csv')

#Display the summary statistics from csv file.
#index_col=0 restores the statistic names (count, mean, std, ...) as the index; without it
#they come back as an anonymous 'Unnamed: 0' data column and the frame no longer matches df.describe()
summary_statistics = pd.read_csv('../data/raw/summary_statistics.csv', index_col=0)
print(summary_statistics)

In [None]:
#Inspect how the two classes of the target variable 'Bankrupt?' are distributed

#Absolute number of companies in each class
bankrupt = df['Bankrupt?'].value_counts()
print(f"Number of bankrupt and non-bankrupt companies:\n{bankrupt}")

#Relative share of each class (fractions of the total)
bankrupt_normalized = df['Bankrupt?'].value_counts(normalize=True)
print(f"\nNormalized value of bankrupt and non-bankrupt companies:\n{bankrupt_normalized}")

#Bar chart of the class counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Bankrupt?', data=df, ax=ax)
ax.set_title('Distribution of the target variable')
plt.show()

- Only 3.2% of the values are '1' (bankrupt companies); the remaining 96.8% are '0' (non-bankrupt companies).
- We need to keep this imbalance of the target variable in mind during modeling phase.

In [None]:
#Count outliers column by column with the 1.5 * IQR rule (numeric check, no plots)
#NOTE: df still contains the target 'Bankrupt?', so it is counted like any other column
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
outliers = (
    df.lt(Q1 - 1.5 * IQR, axis='columns')
    | df.gt(Q3 + 1.5 * IQR, axis='columns')
).sum()

#Number of values outside the fences, per column
print(f"Number of outliers in each column:\n{outliers}")

#Same counts expressed as a percentage of the number of rows
percentage_outliers = outliers / df.shape[0] * 100
print(f"\nPercentage of outliers in each column:\n{percentage_outliers}")

In [None]:
#Checking the distribution of the features using histograms
#df.hist is called on the whole dataframe, so the grid is not limited to a hand-picked subset of columns
df.hist(figsize=(20, 20))
#Single title for the whole grid of histograms
plt.suptitle('Distribution of the features')
plt.show()

In [None]:
#Keep only the columns whose share of outliers exceeds 5%
features_above_5pct_outliers = percentage_outliers.loc[percentage_outliers.gt(5)]
print(f"Features with more than 5% outliers:\n{features_above_5pct_outliers}")

#How many columns exceed the 5% threshold
print(f"\nNumber of features with more than 5% outliers: {len(features_above_5pct_outliers)}")

In [13]:
#Removing the outliers from the dataset (rows with at least one feature outside the 1.5 * IQR fences)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

#Apply the fences to the feature columns only. 'Bankrupt?' is the target, not a measurement:
#the notebook reports only ~3.2% positives, so its Q1 and Q3 are both 0, its IQR is 0, and
#every bankrupt row would be flagged as an "outlier" purely by construction.
feature_columns = df.columns.drop('Bankrupt?')
is_outlier = (
    (df[feature_columns] < lower_bound[feature_columns])
    | (df[feature_columns] > upper_bound[feature_columns])
)

df_no_outliers = df[~is_outlier.any(axis=1)]

In [None]:
#Display the shape of the dataset after removing the outliers, to see how many rows survived the filter
print(f"Shape of the dataset after removing outliers: {df_no_outliers.shape}")

In [None]:
#Check the distribution of the target variable after removing the outliers
#(class counts among the rows that were kept)
print(f"Distribution of the target variable after removing the outliers:\n{df_no_outliers['Bankrupt?'].value_counts()}")

- After removing outliers all the records with target variable '1' are removed. This means the records are not outliers but real life 'Bankruptcy' cases.
- Hence we need to consider all the records in the dataset for our modelling.

In [16]:
#Separate the features (every column except the target) from the target column
X = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']

#Standardize the features; set_output keeps the result a dataframe with the original column names
#NOTE(review): the scaler is fit on the full dataset. If a train/test split is made later,
#confirm that fitting on all rows is intended (it lets test rows influence the scaling).
scaler = StandardScaler()
scaler.set_output(transform="pandas")
X_scaled = scaler.fit_transform(X)

In [None]:
#Display the shape of X_scaled and y (their row counts should match)
print(f"Shape of X_scaled: {X_scaled.shape}")
print(f"Shape of y: {y.shape}")

In [None]:
#Display the first 5 rows of the scaled feature variables as a sanity check on the standardization
print(f"First 5 rows of the scaled features: \n{X_scaled.head()}")

In [19]:
#Save the scaled features to a csv file (index=False: the row index is not written to the file)
X_scaled.to_csv('../data/raw/scaled_features.csv', index=False)

# 2. Feature Engineering and Selection

In [None]:
#Expand the scaled features with pairwise interaction terms
from sklearn.preprocessing import PolynomialFeatures

#degree=2 with interaction_only=True requests products of distinct features (no squared terms);
#include_bias=False leaves out the all-ones bias column
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
poly.set_output(transform='pandas')
X_poly = poly.fit_transform(X_scaled)

#Size of the expanded feature matrix
print(f"Shape of the dataset after creating polynomial features: {X_poly.shape}")

#Preview of the expanded feature matrix
print(f"First 5 rows of the dataset after creating polynomial features: \n{X_poly.head()}")

In [21]:
#Save the polynomial features to a csv file (index=False: the row index is not written to the file)
X_poly.to_csv('../data/raw/polynomial_features.csv', index=False)

In [None]:
#Import the statsmodels API (used here for add_constant)
import statsmodels.api as sm

#Add a constant to the dataset
#NOTE: this rebinds X_poly, so every later cell that uses X_poly works on the frame returned by add_constant
X_poly = sm.add_constant(X_poly)

#Checking the indices of X_poly are aligned with the target y
print(f"Are the indices of X_poly aligned with target: {X_poly.index.equals(y.index)}")

## 2A. VIF

In [None]:
#Variance Inflation Factor (VIF) check on the polynomial (interaction) features in X_poly
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Create a copy of X_poly so the elimination below does not modify X_poly itself
X_poly_copy = X_poly.copy()

#Add a constant to the dataset: the VIF regressions need an intercept in the design matrix
X_poly_copy = sm.add_constant(X_poly_copy)

#Compute the VIF of each column against all the other columns, highest first
vif = pd.DataFrame()
vif["Features"] = X_poly_copy.columns
vif["VIF"] = [variance_inflation_factor(X_poly_copy.values, i) for i in range(X_poly_copy.shape[1])]
vif = vif.sort_values(by='VIF', ascending=False)

#Display the features and their VIF values
print(f"Features and their VIF values:\n{vif}")

#Drop the features with VIF greater than 10.
#The 'const' column is only a helper for the VIF computation, not a feature. It is always
#removed, so whether it ends up in the saved feature set no longer depends on its own VIF.
#NOTE(review): all high-VIF features are dropped in a single pass; VIFs are not recomputed
#after each removal, which can discard more features than an iterative elimination would.
high_vif_features = vif.loc[vif['VIF'] > 10, 'Features']
X_poly_copy = X_poly_copy.drop(columns=high_vif_features).drop(columns='const', errors='ignore')

#Display the shape of the dataset after dropping features with VIF greater than 10
print(f"Shape of the dataset after dropping features with VIF greater than 10: {X_poly_copy.shape}")

In [None]:
#Display the features of X_poly_copy dataset after dropping features with VIF greater than 10
#(the column labels that survived the VIF filter)
print(f"Features of X_poly_copy dataset after dropping features with VIF greater than 10:\n{X_poly_copy.columns}")

In [27]:
#Save the dataset after dropping features with VIF greater than 10 to a csv file
#(index=False: the row index is not written to the file)
X_poly_copy.to_csv('../data/raw/vif_features.csv', index=False)

## 2B. PCA

In [None]:
#Dimensionality reduction of the polynomial feature set with PCA
from sklearn.decomposition import PCA

#A fractional n_components asks PCA to keep as many components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)
pca.set_output(transform="pandas")

#Fit on X_poly (the interaction features, as rebound by sm.add_constant in an earlier cell) and project it
X_pca = pca.fit_transform(X_poly)

#Size of the reduced dataset
print(f"Shape of the dataset after PCA: {X_pca.shape}")

#Preview of the principal-component scores
print(f"First 5 rows of the dataset after PCA: \n{X_pca.head()}")

In [29]:
#Save the dataset after PCA to a csv file (index=False: the row index is not written to the file)
X_pca.to_csv('../data/raw/pca_features.csv', index=False)

## 2C. RFE

In [30]:
#Recursive feature elimination with cross-validation (RFECV) around a logistic regression
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

#Base estimator, allowed up to 1000 solver iterations
logreg = LogisticRegression(max_iter=1000)

#Selector: removes one feature per step and scores each subset by recall over 5 stratified folds
rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(n_splits=5), scoring='recall')
rfecv.set_output(transform="pandas")

In [None]:
#Run RFECV on the scaled features, then keep only the selected columns
rfecv.fit(X_scaled, y)
X_rfecv_scaled = rfecv.transform(X_scaled)

#Number of features in the best-scoring subset
optimal_num_features_scaled = rfecv.n_features_
print(f"Optimal number of features: {optimal_num_features_scaled}")

#Boolean mask over the input columns (True = feature kept)
selected_features_scaled = rfecv.support_
print(f"Selected features: {selected_features_scaled}")

In [None]:
#Display the shape of the dataset after RFE
print(f"Shape of the dataset after RFE: {X_rfecv_scaled.shape}")

#Display the selected features after RFE (column labels of the reduced dataframe)
print(f"Selected features after RFE: \n{X_rfecv_scaled.columns}")

In [33]:
#Save the dataset after RFE to a csv file (index=False: the row index is not written to the file)
X_rfecv_scaled.to_csv('../data/raw/rfe_features.csv', index=False)

In [34]:
#Save target variable to a csv file (index=False: only the 'Bankrupt?' values are written, not the row index)
y.to_csv('../data/raw/target_variable.csv', index=False)

# 3. Feature Selection for scaled dataset without Polynomial features (interaction effect)

## 3A. VIF

In [None]:
#Variance Inflation Factor (VIF) check on the scaled features (no interaction terms)
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Create a copy of X_scaled so the elimination below does not modify X_scaled itself
X_scaled_copy = X_scaled.copy()

#Add a constant to the dataset: the VIF regressions need an intercept in the design matrix
X_scaled_copy = sm.add_constant(X_scaled_copy)

#Compute the VIF of each column against all the other columns, highest first
vif_new = pd.DataFrame()
vif_new["Features"] = X_scaled_copy.columns
vif_new["VIF"] = [variance_inflation_factor(X_scaled_copy.values, i) for i in range(X_scaled_copy.shape[1])]
vif_new = vif_new.sort_values(by='VIF', ascending=False)

#Display the features and their VIF values
print(f"Features and their VIF values:\n{vif_new}")

#Drop the features with VIF greater than 10.
#The 'const' column is only a helper for the VIF computation, not a feature. It is always
#removed so that an all-ones column is never written to the saved feature file.
#NOTE(review): all high-VIF features are dropped in a single pass; VIFs are not recomputed
#after each removal, which can discard more features than an iterative elimination would.
high_vif_features_new = vif_new.loc[vif_new['VIF'] > 10, 'Features']
X_scaled_copy = X_scaled_copy.drop(columns=high_vif_features_new).drop(columns='const', errors='ignore')

#Display the shape of the dataset after dropping features with VIF greater than 10
print(f"Shape of the dataset after dropping features with VIF greater than 10: {X_scaled_copy.shape}")

#Display the features of X_scaled_copy dataset after dropping features with VIF greater than 10
#(the message previously named X_poly_copy, which is a different dataframe)
print(f"Features of X_scaled_copy dataset after dropping features with VIF greater than 10:\n{X_scaled_copy.columns}")

#Save the dataset after dropping features with VIF greater than 10 to a csv file
X_scaled_copy.to_csv('../data/features_without_interactions/vif_features_new.csv', index=False)

## 3B. PCA

In [None]:
#Dimensionality reduction of the scaled features (no interaction terms) with PCA
from sklearn.decomposition import PCA

#A fractional n_components asks PCA to keep as many components as needed to explain 95% of the variance
pca_new = PCA(n_components=0.95)
pca_new.set_output(transform="pandas")

#Fit on the scaled features and project them onto the retained components
X_pca_new = pca_new.fit_transform(X_scaled)

#Size of the reduced dataset
print(f"Shape of the dataset after PCA: {X_pca_new.shape}")

#Preview of the principal-component scores
print(f"First 5 rows of the dataset after PCA: \n{X_pca_new.head()}")

#Persist the reduced dataset (row index not written)
X_pca_new.to_csv('../data/features_without_interactions/pca_features.csv', index=False)

## 3C. RFECV for RandomForest

In [None]:
#RFECV (recursive feature elimination with cross-validation) driven by a random forest
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

#Create a random forest classifier.
#A fixed random_state makes the forest, and therefore the selected feature set, reproducible across runs.
rfc = RandomForestClassifier(random_state=42)

#Create a RFECV object: one feature removed per step, subsets scored by recall over 5 stratified folds
rfecv_new = RFECV(estimator=rfc, 
              step=1, 
              cv=StratifiedKFold(5), 
              scoring='recall'
              ).set_output(transform="pandas")

#RFECV on X_scaled
X_rfecv_scaled_new = rfecv_new.fit_transform(X_scaled, y)

#Get the optimal number of features
optimal_num_features_scaled_new = rfecv_new.n_features_
print(f"Optimal number of features: {optimal_num_features_scaled_new}")

#Get the selected features (boolean mask over the input columns, True = kept)
selected_features_scaled_new = rfecv_new.support_
print(f"Selected features: {selected_features_scaled_new}")

#Display the shape of the dataset after RFECV
print(f"Shape of the dataset after RFECV: {X_rfecv_scaled_new.shape}")

#Display the selected features after RFECV
print(f"Selected features after RFECV: \n{X_rfecv_scaled_new.columns}")

#Save the dataset after RFECV to a csv file.
#This writes the random-forest selection computed in this cell; the cell previously wrote
#X_rfecv_scaled, the logistic-regression selection from section 2C, and discarded this result.
X_rfecv_scaled_new.to_csv('../data/features_without_interactions/rfe_features.csv', index=False)