# Data Exploration and Processing

The dataset has 95 features, which is far too many to reliably predict the target (whether or not the company is bankrupt). Therefore, some data exploration and processing must be done in order to narrow down the set of features.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer
import matplotlib.pyplot as plt
from scipy.stats import shapiro

Ideas for feature reduction:
- Remove low variance features
- Remove features with low correlation to target
- Transform the features toward a Gaussian distribution and remove those that are still non-Gaussian

In [35]:
# Load the training CSV, then split it into the label column and the predictors
df = pd.read_csv('train_data.csv')
y_train = df['Bankrupt?']
X_train = df.drop(columns=['Bankrupt?'])

In [36]:
# Remove features where the variance is less than 0.001.
# The low-variance columns are selected by *label* from the variance Series
# rather than by position, so the right columns are dropped even if `var`
# has fewer entries than X_train has columns (some pandas versions skip
# columns for which a variance cannot be computed).
var = X_train.var()
low_variance_names = var[var < 0.001].index

# Kept as a list of integer positions into X_train.columns, as before.
low_variance_cols = X_train.columns.get_indexer(low_variance_names).tolist()

high_variance_df = X_train.drop(columns=low_variance_names)

In [37]:
# Keep only the features most strongly correlated (in absolute value) with the
# target. The number kept equals the number of columns that survived the
# variance filter.
target_corr = df.corr()['Bankrupt?'].abs()
correlations = target_corr.sort_values(ascending=False).drop('Bankrupt?')

n_keep = high_variance_df.shape[1]
high_correlation_cols = correlations.index[:n_keep].tolist()
high_correlation_df = df[high_correlation_cols]

In [38]:
# Resulting features are the intersection of the high variance and high correlation features.
# A set is used only for membership tests; the final list is built by walking
# high_variance_df's columns in order. Iterating a set of strings directly
# would yield a different column order from one kernel session to the next
# (string hashing is randomized per process), making the output non-reproducible.
high_correlation_names = set(high_correlation_df.columns)
high_var_corr_cols = [
    col for col in high_variance_df.columns if col in high_correlation_names
]
high_var_corr_df = X_train.loc[:, high_var_corr_cols]

In [39]:
# Transform all features into Gaussian, then drop features that are still not Gaussian
qt = QuantileTransformer(output_distribution='normal', random_state=42)
df_qt = qt.fit_transform(high_var_corr_df)

# Rebuild the DataFrame with the original column labels AND the original row
# index, so the transformed rows stay aligned with y_train even when the
# index is not the default 0..n-1 range.
# NOTE: this overwrites the cell's own input, so re-running the cell re-fits
# the transformer on already-transformed data.
high_var_corr_df = pd.DataFrame(
    df_qt,
    columns=high_var_corr_df.columns,
    index=high_var_corr_df.index,
)

# Flag every column whose Shapiro-Wilk W statistic is below 0.9.
# The p-value returned by shapiro() is deliberately ignored.
cols_to_drop = []
for col in high_var_corr_df.columns:
    w_stat, _ = shapiro(high_var_corr_df[col])
    if w_stat < 0.9:
        cols_to_drop.append(col)

print(f'Columns being dropped: {cols_to_drop}')
high_var_corr_df = high_var_corr_df.drop(columns=cols_to_drop)

Columns being dropped: [' Liability-Assets Flag', ' Tax rate (A)']


  res = hypotest_fun_out(*samples, **kwds)


In [41]:
# Write the selected, transformed features to disk (row index is not written)
high_var_corr_df.to_csv(
    'processed_train_data.csv',
    index=False,
)