In [14]:
# Import Python Packages
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

In [19]:
# Folder that holds the CSV files read by this notebook
data_dir = './data/'

# Read the five CSV splits in a fixed order and bind each to its own name
old_train_df, train_df, train_ros_df, train_smote_df, test_df = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in (
        '1_clean_train.csv',
        '2_new_train.csv',
        '2_new_train_ros.csv',
        '2_new_train_smote.csv',
        '2_new_test.csv',
    )
)

# One-line summary of (rows, columns) for every frame that was loaded
print(
    f"Old train shape: {old_train_df.shape}, New train shape: {train_df.shape}, "
    f"Train ROS shape: {train_ros_df.shape}, Train SMOTE shape: {train_smote_df.shape}, "
    f"Test shape: {test_df.shape}"
)

Old train shape: (88872, 47), New train shape: (71097, 47), Train ROS shape: (112095, 47), Train SMOTE shape: (112095, 47), Test shape: (17775, 47)


In [16]:
def standardise_continuous_features(df, continuous_columns=None):
    """
    Return a copy of ``df`` with its continuous columns standardised.

    A new ``StandardScaler`` is fitted on the selected columns of ``df`` and
    used to transform those same columns; every other column is carried over
    unchanged. The input frame itself is left untouched, so re-running the
    calling cell does not re-scale already-scaled data held by another name.

    NOTE(review): because the scaler is fitted inside every call, each frame
    passed in (including a test split) is scaled with its own statistics
    rather than those of the training data - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``continuous_columns``.
    continuous_columns : list of str, optional
        Columns to standardise. When omitted, the 17 continuous feature
        names listed in the function body are used.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.

    Raises
    ------
    KeyError
        Raised by pandas if a requested column is missing from ``df``.
    """
    if continuous_columns is None:
        continuous_columns = ['age', 'annual_income', 'monthly_inhand_salary',
           'num_bank_accounts', 'num_credit_card', 'interest_rate', 'num_of_loan',
           'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
           'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
           'total_emi_per_month', 'amount_invested_monthly', 'monthly_balance',
           'credit_history_age_in_month']
    else:
        continuous_columns = list(continuous_columns)

    # Work on a copy so the caller's frame is never modified in place.
    result = df.copy()

    # Nothing to scale: skip the scaler, which cannot be fitted on zero columns.
    if not continuous_columns:
        return result

    scaled_cont_df = pd.DataFrame(StandardScaler().fit_transform(df[continuous_columns]),
                                  columns=continuous_columns,
                                  index=df.index)
    result[continuous_columns] = scaled_cont_df
    return result

def multicollinearity_check(df, threshold=12):
    """
    Report the columns of ``df`` whose variance inflation factor exceeds
    ``threshold``.

    The frame is handed to ``variance_inflation_factor`` as-is: every column
    of ``df`` takes part in the computation and no intercept column is added
    by this function.
    NOTE(review): confirm that any target/label column has been dropped and
    that omitting an intercept is intended before relying on the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all checked against each other.
    threshold : float, default 12
        Only features with a VIF strictly greater than this are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``VIF``, sorted by ``VIF`` in descending
        order and restricted to rows with ``VIF > threshold``. Empty when no
        feature exceeds the threshold.
    """
    # Materialise the array once instead of once per column inside the loop.
    design = df.values

    # Calculate VIF for every column position
    vif_scores = [variance_inflation_factor(design, col_idx)
                  for col_idx in range(design.shape[1])]
    vif_data = pd.DataFrame({
        "Feature": df.columns,
        # Explicit float dtype keeps the column numeric even when df has no columns.
        "VIF": np.asarray(vif_scores, dtype=float),
    })

    # Sort VIF data
    vif_data = vif_data.sort_values('VIF', ascending=False)

    # Filter features with VIF > threshold
    high_vif = vif_data[vif_data["VIF"] > threshold]

    return high_vif

In [17]:
# Standardise continuous features of every frame, in a fixed order.
# Each call fits its own scaler, so every frame is scaled independently.
old_train_df, train_df, train_ros_df, train_smote_df, test_df = (
    standardise_continuous_features(frame)
    for frame in (old_train_df, train_df, train_ros_df, train_smote_df, test_df)
)

# Check for multicollinearity in each standardised frame
(high_vif_old_train, high_vif_train, high_vif_train_ros,
 high_vif_train_smote, high_vif_test) = (
    multicollinearity_check(frame)
    for frame in (old_train_df, train_df, train_ros_df, train_smote_df, test_df)
)

# Print the results, one header line followed by the VIF table per frame
for frame_name, high_vif in (
    ("old_train_df", high_vif_old_train),
    ("train_df", high_vif_train),
    ("train_ros_df", high_vif_train_ros),
    ("train_smote_df", high_vif_train_smote),
    ("test_df", high_vif_test),
):
    print(f"High VIF features in {frame_name}:")
    print(high_vif)

High VIF features in old_train_df:
Empty DataFrame
Columns: [Feature, VIF]
Index: []
High VIF features in train_df:
Empty DataFrame
Columns: [Feature, VIF]
Index: []
High VIF features in train_ros_df:
Empty DataFrame
Columns: [Feature, VIF]
Index: []
High VIF features in train_smote_df:
Empty DataFrame
Columns: [Feature, VIF]
Index: []
High VIF features in test_df:
Empty DataFrame
Columns: [Feature, VIF]
Index: []


In [18]:
# Save the processed data: write each frame to its own CSV without the index
for out_name, frame in (
    ('3_old_train_processed.csv', old_train_df),
    ('3_train_processed.csv', train_df),
    ('3_train_ros_processed.csv', train_ros_df),
    ('3_train_smote_processed.csv', train_smote_df),
    ('3_test_processed.csv', test_df),
):
    frame.to_csv(data_dir + out_name, index=False)