# Big Data in Finance

In [None]:
# prepare package
import os

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
import threadpoolctl
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [None]:
# prepare data
## The CSV location can be overridden with the SFDATA_CSV environment variable so the
## notebook runs on other machines; the default is the original local path.
SFDATA_CSV_PATH = os.environ.get(
    "SFDATA_CSV",
    "/Users/chorkid/Desktop/Imperial/Modules/2024 Spring/Big Data/Coursework/sfdata.csv",
)
df_sf = pd.read_csv(SFDATA_CSV_PATH)
## discard the first column of the file (presumably a saved row counter -- TODO confirm)
## and index the rows by LOAN_ID; done without inplace so re-running the cell is safe
df_sf = df_sf.drop(columns=df_sf.columns[0:1]).set_index('LOAN_ID', drop=True)

## Data Cleaning and Exploratory Analysis

In [None]:
## drop columns after manual selection
## column_string holds the names of the columns to remove, separated by ", ";
## it is split into a list of individual names and those columns are dropped from df_sf
column_string='PPMT_FLG, IO, PRODUCT, SERVICER, MASTER_SERVICER, ISSUANCE_UPB, CURRENT_UPB, LOAN_AGE, REM_MONTHS, ADJ_REM_MONTHS, MATR_DT, CSCORE_C, ZIP, FIRST_PAY_IO, MNTHS_TO_AMTZ_IO, DLQ_STATUS, PMT_HISTORY, MOD_FLAG, MI_CANCEL_FLAG, RPRCH_DTE, CURR_SCHD_PRNCPL, TOT_SCHD_PRNCPL, UNSCHD_PRNCPL_CURR, LAST_PAID_INSTALLMENT_DATE, FORECLOSURE_DATE, DISPOSITION_DATE, ASSET_RECOVERY_COSTS, ORIGINAL_LIST_START_DATE, ORIGINAL_LIST_PRICE, CURRENT_LIST_START_DATE, CURRENT_LIST_PRICE, SERV_IND, CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT, CUMULATIVE_MODIFICATION_LOSS_AMOUNT, CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS, CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS, FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT, ZERO_BALANCE_CODE_CHANGE_DATE, LOAN_HOLDBACK_INDICATOR, LOAN_HOLDBACK_EFFECTIVE_DATE, DELINQUENT_ACCRUED_INTEREST, ARM_5_YR_INDICATOR, ARM_PRODUCT_TYPE, MONTHS_UNTIL_FIRST_PAYMENT_RESET, MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET, INTEREST_RATE_CHANGE_DATE, PAYMENT_CHANGE_DATE, ARM_INDEX, ARM_CAP_STRUCTURE, INITIAL_INTEREST_RATE_CAP, PERIODIC_INTEREST_RATE_CAP, LIFETIME_INTEREST_RATE_CAP, MARGIN, BALLOON_INDICATOR, PLAN_NUMBER, DEAL_NAME'
list_col_delete=column_string.split(", ")
## NOTE: drop() is called with its default error handling, so a name missing from df_sf raises KeyError
df_sf=df_sf.drop(list_col_delete,axis=1)

## convert datetime to be numerical: the earliest date found in any of the three
## date columns maps to 1, and every later calendar day adds 1
date_cols = ['ACT_PERIOD', 'ORIG_DATE', 'FIRST_PAY']
for date_col in date_cols:
    df_sf[date_col] = pd.to_datetime(df_sf[date_col])

## common reference date shared by all three columns
df_sd_min_date = df_sf[date_cols].min().min()

for date_col in date_cols:
    day_offset = (df_sf[date_col] - df_sd_min_date).dt.days
    df_sf[date_col] = day_offset + 1

In [None]:
## deal with NaN
### a missing current rate falls back to the origination rate of the same loan
df_sf['CURR_RATE'] = df_sf['CURR_RATE'].fillna(df_sf['ORIG_RATE'])
### collapse the borrower count into a flag: 1 when more than one borrower, else 0
### (a NaN count fails the `> 1` comparison and therefore also becomes 0)
df_sf['NUM_BO'] = df_sf['NUM_BO'].apply(lambda x: 1 if x > 1 else 0)
### missing borrower credit scores take the column median, rounded to a whole score
df_sf['CSCORE_B'] = df_sf['CSCORE_B'].fillna(df_sf['CSCORE_B'].median()).round()
### FIX: this line used to read from FORBEARANCE_INDICATOR, which silently replaced the
### whole HOMEREADY_PROGRAM_INDICATOR column with another column's values.
### It now normalises the numeric code 7 to the string '7' within the column itself.
df_sf['HOMEREADY_PROGRAM_INDICATOR'] = df_sf['HOMEREADY_PROGRAM_INDICATOR'].apply(lambda x: '7' if x == 7 else x)
df_sf["MI_PCT"] = df_sf['MI_PCT'].fillna(0).astype(float)

### cost / proceeds amounts: a missing value is treated as 0
zero_fill_cols = [
    'FORECLOSURE_COSTS',
    'PROPERTY_PRESERVATION_AND_REPAIR_COSTS',
    'MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS',
    'ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY',
    'NET_SALES_PROCEEDS',
    'CREDIT_ENHANCEMENT_PROCEEDS',
    'REPURCHASES_MAKE_WHOLE_PROCEEDS',
    'OTHER_FORECLOSURE_PROCEEDS',
    'NON_INTEREST_BEARING_UPB',
    'PRINCIPAL_FORGIVENESS_AMOUNT',
]
df_sf[zero_fill_cols] = df_sf[zero_fill_cols].fillna(0)

### mortgage-insurance type: missing -> 0, then stored as a string category
df_sf['MI_TYPE'] = df_sf['MI_TYPE'].fillna(0)
df_sf['MI_TYPE'] = df_sf['MI_TYPE'].astype(str)

### indicator columns: missing -> '0'; the numeric code 7 (int or float) -> '7'
df_sf['FORBEARANCE_INDICATOR'] = df_sf['FORBEARANCE_INDICATOR'].fillna('0')
df_sf['FORBEARANCE_INDICATOR'] = df_sf['FORBEARANCE_INDICATOR'].apply(lambda x: '7' if x == 7 else x)
df_sf['ADR_TYPE'] = df_sf['ADR_TYPE'].fillna('0')
df_sf['ADR_TYPE'] = df_sf['ADR_TYPE'].apply(lambda x: '7' if x == 7 else x)
df_sf['PROPERTY_INSPECTION_WAIVER_INDICATOR'] = df_sf['PROPERTY_INSPECTION_WAIVER_INDICATOR'].fillna('0')
df_sf['HOMEREADY_PROGRAM_INDICATOR'] = df_sf['HOMEREADY_PROGRAM_INDICATOR'].fillna('0')

## drop rows with NaN in column DTI and LAST_UPB
df_sf.dropna(subset=['DTI','LAST_UPB',],inplace=True)

## drop columns with more than 98% NaN
## (thresh is the minimum number of non-NaN values a column needs in order to be kept: 2% of the rows)
df_sf.dropna(thresh=round(len(df_sf)/50), axis=1,inplace=True)

# check if there is still NaN in the dataframe
#df_sf.isna().any()

In [None]:
# ANALYSIS OF DATA
## snapshot of the cleaned frame; the exploratory steps below work on this copy
df_sf_copy = df_sf.copy()

## divide original dataframe into defaulted and non-defaulted dataframe
## Within each subset, columns with fewer than 2% non-NaN values are removed.
## dropna is applied without inplace=True: mutating a boolean-filtered slice in place
## triggers pandas' SettingWithCopyWarning and is fragile under copy-on-write.
non_default_rows = df_sf_copy[df_sf_copy['loan_status'] == 'Fully Repaid']
default_rows = df_sf_copy[df_sf_copy['loan_status'] == 'Defaulted']
df_sf_non_default = non_default_rows.dropna(thresh=round(len(non_default_rows)/50), axis=1)
df_sf_default = default_rows.dropna(thresh=round(len(default_rows)/50), axis=1)

## columns that survive in exactly one of the two subsets
## (the original note states the defaulted frame contains every non-defaulted column -- verify on the data)
diff_def_nondef = set(df_sf_non_default.columns).symmetric_difference(set(df_sf_default.columns))

In [None]:
## save categorical dataframe and numerical dataframe respectively
## FIX: this step now runs first. numerical_cols is needed by the float conversion below
## but used to be defined only at the end of the cell, so Restart & Run All raised NameError.
### numerical
numerical_cols = df_sf_copy.select_dtypes(include=['number']).columns
df_sf_num=df_sf_copy[numerical_cols]
### categorical
categorical_cols = df_sf_copy.select_dtypes(include=['category', 'object', 'bool']).columns
df_sf_cat=df_sf_copy[categorical_cols]

## convert all numerical data to be float datatype
df_sf[numerical_cols]=df_sf[numerical_cols].astype(float)
## convert categorical data
## NOTE(review): if MSA / Zero_Bal_Code are numeric in df_sf_copy they are cast to float
## just above, so their string form carries a trailing ".0" -- confirm this is intended.
df_sf['MSA']=df_sf['MSA'].astype(str)
df_sf['Zero_Bal_Code']=df_sf['Zero_Bal_Code'].astype(str)

# identify and convert dummy variables
num_unique_values = df_sf.nunique()

## filter columns where the number of unique values is 2
binary_cols = num_unique_values[num_unique_values == 2].index
## the last binary column is excluded (presumably the loan_status target -- TODO confirm
## it is always the final two-valued column)
binary_cols = binary_cols[:-1]

## get the number of unique values in each of these binary columns
num_unique_binary_values = df_sf[binary_cols].nunique()

## create dummy variables for binary columns
dummy_df = pd.get_dummies(df_sf[binary_cols], drop_first=True)

## append the dummies, then drop the original binary columns
df_sf_dummies = pd.concat([df_sf, dummy_df], axis=1)
df_sf_dummies.drop(columns=binary_cols, inplace=True)

In [None]:
## correlation matrix of the numerical snapshot
df_sf_num_corr = df_sf_num.corr()

## drop high correlated variables (the listed columns are removed from the modelling frame)
high_corr_cols = [
    'FORECLOSURE_COSTS',
    'PROPERTY_PRESERVATION_AND_REPAIR_COSTS',
    'MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS',
    'ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY',
    'NET_SALES_PROCEEDS',
    'CREDIT_ENHANCEMENT_PROCEEDS',
    'REPURCHASES_MAKE_WHOLE_PROCEEDS',
    'OTHER_FORECLOSURE_PROCEEDS',
    'Zero_Bal_Code',
]
df_sf = df_sf.drop(columns=high_corr_cols)

## Sampling and Rebalancing

In [None]:
# divide train and test set (seeded so the split is reproducible across runs)
df_sf_train, df_sf_test = train_test_split(df_sf, test_size=0.3, random_state=42)
## explicit copy so the column assignment below writes to an independent frame
df_sf_test = df_sf_test.copy()

# balancing data

## reduce the number of Fully Repaid data randomly
### divide the training set into its two (unbalanced) classes
df_sf_train_fully_repaid=df_sf_train[df_sf_train['loan_status']=='Fully Repaid']
df_sf_train_default=df_sf_train[df_sf_train['loan_status']=='Defaulted']

### keep at most 300000 randomly chosen Fully Repaid rows
### (min() avoids the ValueError sample() raises when fewer rows are available)
n_fully_repaid_kept = min(300000, len(df_sf_train_fully_repaid))
df_sf_reduced_fully_repaid=df_sf_train_fully_repaid.sample(n=n_fully_repaid_kept, random_state=42)
df_sf_train = pd.concat([df_sf_reduced_fully_repaid, df_sf_train_default], axis=0, ignore_index=True)

# standardization
## FIX: numerical_cols is recomputed from the current training frame. The list built
## earlier from df_sf_copy can still name columns that were dropped from df_sf since
## then, which makes the column selection below raise KeyError.
numerical_cols = df_sf_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()
## statistics are learned on the training rows only and reused for the test rows
df_sf_train[numerical_cols] = scaler.fit_transform(df_sf_train[numerical_cols])
df_sf_test[numerical_cols] = scaler.transform(df_sf_test[numerical_cols])

## use SMOTE to oversampling Defaulted data
x_train=df_sf_train.drop(columns=['loan_status'])
y_train=df_sf_train['loan_status']
x_test=df_sf_test.drop(columns=['loan_status'])
y_test=df_sf_test['loan_status']

### oversampling for defaulted data
### SMOTENC needs the positional indices of the categorical (object) feature columns
categorical_cols = x_train.select_dtypes(include=['object']).columns

categorical_cols_index = [x_train.columns.get_loc(name) for name in categorical_cols]
smotenc =SMOTENC(categorical_features=categorical_cols_index, random_state=42)
x_res_train,y_res_train=smotenc.fit_resample(x_train,y_train)

## Tree-based Models

### Encoding

In [None]:
# Hot-encode each independent category as a binary variable using 0 or 1, didn't change y
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import scale
## encode test data
## FIX: the encoder is now fitted on the (resampled) training features rather than on the
## test set, so the test matrix gets exactly the dummy columns of the training matrix.
## handle_unknown="ignore" encodes a category seen only in the test set as all zeros.
## This replaces the hand-maintained list of test-only dummy columns that used to be
## dropped to align the two matrices, which broke whenever the random split changed.
ohe = OneHotEncoder(dtype="int", handle_unknown="ignore")
ohe.fit(x_res_train[categorical_cols])
df_sf_test_ohe = ohe.transform(df_sf_test[categorical_cols]).toarray()

transformed_ohe = pd.DataFrame(
    data=df_sf_test_ohe,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=df_sf_test.index,
)
## kept for the column comparison against the training dummies further down
ds1 = transformed_ohe

### merge dataframes: non-categorical test columns followed by the dummy columns
df_sf_test_encoded = pd.concat([df_sf_test.drop(categorical_cols,axis=1), transformed_ohe], axis=1)

x_res_test_encoded = df_sf_test_encoded.drop("loan_status", axis=1)
y_res_test = df_sf_test["loan_status"]

In [None]:
## encode training data
ohe = OneHotEncoder(dtype="int")
ohe.fit(x_res_train[categorical_cols])
x_res_train_ohe = ohe.transform(x_res_train[categorical_cols]).toarray()

## FIX: the row index must be x_res_train.index; the whole DataFrame used to be passed
## as `index`, which is not a valid 1-D row index.
transformed_ohe = pd.DataFrame(
    data=x_res_train_ohe,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=x_res_train.index,
)
## kept for the column comparison against the test dummies in the next cell
ds2 = transformed_ohe

## reset indices of the DataFrames so the concat below aligns rows by position
x_res_train.reset_index(drop=True, inplace=True)
transformed_ohe.reset_index(drop=True, inplace=True)

# concatenate the DataFrames: non-categorical columns followed by the dummy columns
x_res_train_encoded = pd.concat([x_res_train.drop(categorical_cols, axis=1), transformed_ohe], axis=1)

In [None]:
# collect the dummy-column names produced for the test set (ds1) and the training set (ds2)
columns_df1 = set(ds1.columns)
columns_df2 = set(ds2.columns)

# columns that exist on one side only
columns_only_in_df1 = columns_df1.difference(columns_df2)
columns_only_in_df2 = columns_df2.difference(columns_df1)

# print the differences for elimination
print("Columns only in ds1:", columns_only_in_df1)
print("Columns only in ds2", columns_only_in_df2)

### Basic decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# decision tree model with hyperparameters search
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# base estimator; seeded so tie-breaking inside the tree is repeatable
dt_classifier = DecisionTreeClassifier(random_state=42)

# candidate values tried exhaustively by the grid search
param_grid = dict(
    max_depth=[5, 10, 15, 20, 25, 30, 35],
    min_samples_split=[10, 20, 50, 100],
    min_samples_leaf=[3, 5, 10, 20],
    criterion=['gini', 'entropy'],
)

# 5-fold cross-validated grid search on the resampled, encoded training data
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5)
grid_search.fit(x_res_train_encoded, y_res_train)

# report the winning combination and its mean cross-validated score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

In [None]:
# take the best decision tree found by the grid search and predict on the test set
# (no training happens here: best_estimator_ is the model GridSearchCV has already refitted,
#  since the search above leaves refit at its default)
best_tree = grid_search.best_estimator_
y_pred_dt = best_tree.predict(x_res_test_encoded)

In [None]:
# plot the best tree
# plot_tree comes from sklearn.tree (imported in the package cell at the top).
# feature_names is passed as a plain list: some scikit-learn releases validate this
# argument as a list and reject a pandas Index.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(best_tree, filled=True, feature_names=list(x_res_train_encoded.columns), ax=ax)
plt.show()

In [None]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# share of test loans whose predicted status equals the true status
accuracy = accuracy_score(y_true=y_res_test, y_pred=y_pred_dt)
print("Accuracy:", accuracy)

# tick labels for the matrix, in display order
class_names = ['Defaulted', 'Fully Repaid']
disp_tree = ConfusionMatrixDisplay.from_estimator(
    estimator=best_tree,
    X=x_res_test_encoded,
    y=y_res_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)

In [None]:
# plot the roc curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# 'Defaulted' is treated as the positive class (1)
y_test_encoded = y_res_test.replace({'Defaulted': 1, 'Fully Repaid': 0})
# score = probability in column 0 of predict_proba
# (presumably the 'Defaulted' class, i.e. best_tree.classes_[0] -- verify)
y_pred_dt_encoded = best_tree.predict_proba(x_res_test_encoded)[:, 0]

fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test_encoded, y_pred_dt_encoded)
roc_auc_dt = auc(fpr_dt, tpr_dt)

fig, ax = plt.subplots()
ax.plot(fpr_dt, tpr_dt, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc_dt)
# diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# plot feature importance
feature_importances = best_tree.feature_importances_
feature_names = x_res_train_encoded.columns

# positions of the features ordered from most to least important
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# keep the top 10 features
top_n = 10
top_indices = sorted_indices[:top_n]
top_importances = sorted_importances[:top_n]
top_feature_names = sorted_feature_names[:top_n]

# bar chart of the top 10 feature importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(top_importances)), top_importances, tick_label=top_feature_names)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Top 10 Feature Importance')
fig.tight_layout()
plt.show()

### Gradient Boosting Tree

In [None]:
# train boosted tree
GDC_tree = GradientBoostingClassifier(random_state=42)

# candidate values tried exhaustively by the grid search
param_grid = dict(
    max_features=['sqrt', 'log2'],
    n_estimators=[5, 10, 30, 100],
    learning_rate=[0.01, 0.1, 1.0],
    subsample=[0.1, 0.3, 0.5],
    max_depth=[5, 10, 15],
)

# 5-fold search scored on accuracy, using every available core
grid_search = GridSearchCV(
    estimator=GDC_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_res_train_encoded, y_res_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

# best boosted model (already refitted by the search) and its test-set predictions
best_boosted = grid_search.best_estimator_
y_pred_boosted = best_boosted.predict(x_res_test_encoded)

In [None]:
# display results
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# share of test loans whose predicted status equals the true status
accuracy = accuracy_score(y_true=y_res_test, y_pred=y_pred_boosted)
print("Accuracy:", accuracy)

# tick labels for the matrix, in display order
class_names = ['Defaulted', 'Fully Repaid']
disp_boosted = ConfusionMatrixDisplay.from_estimator(
    estimator=best_boosted,
    X=x_res_test_encoded,
    y=y_res_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)

In [None]:
# plot roc curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# 'Defaulted' is treated as the positive class (1)
y_test_encoded = y_res_test.replace({'Defaulted': 1, 'Fully Repaid': 0})
# score = probability in column 0 of predict_proba
# (presumably the 'Defaulted' class, i.e. best_boosted.classes_[0] -- verify)
y_pred_boosted_encoded = best_boosted.predict_proba(x_res_test_encoded)[:, 0]

fpr_boosted, tpr_boosted, thresholds_boosted = roc_curve(y_test_encoded, y_pred_boosted_encoded)

# area under the ROC curve (AUC)
roc_auc_boosted = auc(fpr_boosted, tpr_boosted)

# draw the ROC curve against the random-classifier diagonal
fig, ax = plt.subplots()
ax.plot(fpr_boosted, tpr_boosted, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc_boosted)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# plot feature importance
feature_importances = best_boosted.feature_importances_
feature_names = x_res_train_encoded.columns

# positions of the features ordered from most to least important
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# keep the top 10 features
top_n = 10
top_indices = sorted_indices[:top_n]
top_importances = sorted_importances[:top_n]
top_feature_names = sorted_feature_names[:top_n]

# bar chart of the top 10 feature importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(top_importances)), top_importances, tick_label=top_feature_names)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Top 10 Feature Importance')
fig.tight_layout()
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# base random forest estimator with a fixed random_state; it is the estimator passed to
# the grid search in the next cell
rfc = RandomForestClassifier(random_state=42)

In [None]:
# train random forest

# candidate values tried exhaustively by the grid search
param_grid = dict(
    n_estimators=[5, 10, 30],
    max_features=['sqrt', 'log2', None],
    min_samples_leaf=[100, 200, 300, 400],
    min_samples_split=[200, 300, 400, 500],
    max_depth=[5, 10, 15],
)

# 5-fold grid search on every available core; training-fold scores are recorded as well
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(x_res_train_encoded, y_res_train)

# print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

# best forest (already refitted by the search) and its test-set predictions
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(x_res_test_encoded)

In [None]:
# display results
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# share of test loans whose predicted status equals the true status
accuracy = accuracy_score(y_true=y_res_test, y_pred=y_pred_rf)
print("Accuracy:", accuracy)

# tick labels for the matrix, in display order
class_names = ['Defaulted', 'Fully Repaid']
disp_rf = ConfusionMatrixDisplay.from_estimator(
    estimator=best_rf,
    X=x_res_test_encoded,
    y=y_res_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)

In [None]:
# plot roc curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# 'Defaulted' is treated as the positive class (1)
y_test_encoded = y_res_test.replace({'Defaulted': 1, 'Fully Repaid': 0})
# score = probability in column 0 of predict_proba
# (presumably the 'Defaulted' class, i.e. best_rf.classes_[0] -- verify)
y_pred_rf_encoded = best_rf.predict_proba(x_res_test_encoded)[:, 0]

fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test_encoded, y_pred_rf_encoded)

# area under the ROC curve (AUC)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# draw the ROC curve against the random-classifier diagonal
fig, ax = plt.subplots()
ax.plot(fpr_rf, tpr_rf, color='red', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc_rf)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# plot feature importance
feature_importances = best_rf.feature_importances_
feature_names = x_res_train_encoded.columns

# positions of the features ordered from most to least important
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# keep the top 10 features
top_n = 10
top_indices = sorted_indices[:top_n]
top_importances = sorted_importances[:top_n]
top_feature_names = sorted_feature_names[:top_n]

# bar chart of the top 10 feature importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(top_importances)), top_importances, tick_label=top_feature_names)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Top 10 Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# plot all ROC curve together (one line per tree-based model, plus the random-classifier diagonal)
fig, ax = plt.subplots()
ax.plot(fpr_dt, tpr_dt, color='darkorange', lw=2, label='Decision Tree (AUC = %0.2f)' % roc_auc_dt)
ax.plot(fpr_rf, tpr_rf, color='red', lw=2, label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
ax.plot(fpr_boosted, tpr_boosted, color='blue', lw=2, label='Boosted Tree (AUC = %0.2f)' % roc_auc_boosted)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

## Logistic Regression

### Encoding

In [None]:
# features / target for the logistic-regression pipeline
# FIX: x and y were never defined anywhere in the notebook, so this cell raised NameError
# on a fresh kernel. They are rebuilt from the cleaned frame df_sf.
# NOTE(review): assumes the cleaned, unscaled df_sf is the intended starting point -- confirm.
x = df_sf.drop(columns=['loan_status'])
y = df_sf['loan_status']

# y label (LabelEncoder assigns integer codes in sorted label order)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# change to dummy variable
# every object column is one-hot encoded except SELLER and STATE, which are label-encoded below
categorical_cols_class = x.select_dtypes(include=['object']).columns
categorical_cols_class=categorical_cols_class.drop(['SELLER','STATE'])
categorical_label = ["SELLER", "STATE"]
x_encoded = pd.get_dummies(x, columns=categorical_cols_class)
# NOTE: label_encoder is refitted on each column, so after this cell it no longer holds
# the loan_status mapping
x_encoded['SELLER']=label_encoder.fit_transform(x_encoded['SELLER'])
x_encoded['STATE']=label_encoder.fit_transform(x_encoded['STATE'])

In [None]:
# NOTE: this cell reuses the names x_train / x_test / y_train / y_test / df_sf_train from
# the tree-model section; from here on they refer to the logistic-regression data.
x_train, x_test, y_train, y_test = train_test_split(x_encoded,y_encoded, test_size=0.3, random_state=42)
## work on a copy so that adding the target column does not write into x_train itself
df_sf_train=x_train.copy()
df_sf_train['loan_status']=y_train

## divide two unbalancing data set
## (rows are split on the integer target code: the == 1 rows go to *_fully_repaid,
##  the == 0 rows to *_default)
df_sf_train_fully_repaid=df_sf_train[df_sf_train['loan_status']== 1]
df_sf_train_default=df_sf_train[df_sf_train['loan_status']== 0]

# resample data and scale
## keep at most 300000 fully-repaid rows (min() avoids the ValueError sample() raises when
## fewer rows exist); seeded so the subsample is reproducible
n_fully_repaid_kept = min(300000, len(df_sf_train_fully_repaid))
df_sf_reduced_fully_repaid=df_sf_train_fully_repaid.sample(n=n_fully_repaid_kept, random_state=42)
df_sf_train = pd.concat([df_sf_reduced_fully_repaid, df_sf_train_default], axis=0, ignore_index=True)
y_train=df_sf_train["loan_status"]
x_train=df_sf_train.drop("loan_status", axis=1)
## oversample the minority class; SMOTE is imported in the package cell, seeded for reproducibility
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)
## scaling statistics come from the resampled training data and are reused for the test data
scaler = StandardScaler()
x_resampled_scaled = scaler.fit_transform(x_resampled)
x_test_scaled = scaler.transform(x_test)

### Model Training

In [None]:
import gc
# run a full garbage-collection pass before the model-training cell that follows
gc.collect()

In [None]:
# set up cross validation scheme: 5 shuffled, stratified folds with a fixed seed
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# specify range of hyperparameters (values of C to try)
params = dict(C=[0.001, 0.01, 0.1, 1, 10])

## using Logistic regression for class imbalance
model = LogisticRegression(solver='liblinear', max_iter=10000) #class_weight='balanced'
grid_search_cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
grid_search_cv.fit(x_resampled_scaled, y_resampled)

## reviewing the results: per-candidate CV table, then the winning parameters as cell output
cv_results = pd.DataFrame(grid_search_cv.cv_results_)
grid_search_cv.best_params_

In [None]:
# result check: absolute coefficient of every feature in the best logistic-regression model
best_logistic_regression_model = grid_search_cv.best_estimator_
coefficients = np.abs(best_logistic_regression_model.coef_)
feature_names = x_train.columns
# coef_ is 2-D; row 0 is paired with the feature names
feature_coefficients = {name: value for name, value in zip(feature_names, coefficients[0])}

In [None]:
# plot importance features
# (name, |coefficient|) pairs, ordered from largest to smallest magnitude
feature_coefficients = list(zip(feature_names, coefficients[0]))
sorted_coefficients = sorted(feature_coefficients, key=lambda pair: abs(pair[1]), reverse=True)

top_10_coefficients = sorted_coefficients[:10]
top_10_variable_names = [name for name, _coef in top_10_coefficients]
top_10_variable_coefficients = [_coef for _name, _coef in top_10_coefficients]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_10_variable_names, top_10_variable_coefficients, color='skyblue')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.set_title('Top 10 Features by Coefficient Magnitude')
# largest coefficient at the top of the chart
ax.invert_yaxis()
plt.show()

In [None]:
# plot confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# predictions of the refitted best model on the scaled test features
y_pred = grid_search_cv.predict(x_test_scaled)

cm = confusion_matrix(y_test, y_pred)

# tick labels assume code 0 = Default and code 1 = Fully-repaid
tick_names = ['Default', 'Fully-repaid']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=tick_names, yticklabels=tick_names, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# plot ROC curve
from sklearn.metrics import roc_curve, auc

# score = predicted probability of the class encoded as 1
y_pred_proba = grid_search_cv.predict_proba(x_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

## SVM model

In [None]:
# scale the data
# FIX: train and test used to be standardised independently with scale(), so each was
# centred and scaled with its own mean / standard deviation and the same raw value could
# map to different inputs for the model. The statistics are now learned on the training
# matrix only and applied unchanged to the test matrix.
svm_scaler = StandardScaler()
x_res_train_encoded_scaled = svm_scaler.fit_transform(x_res_train_encoded)
x_res_test_encoded_scaled = svm_scaler.transform(x_res_test_encoded)

In [None]:
# Fit the initial SVM and plot Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# probability=True enables predict_proba, which the ROC cell below relies on
clf_svm = SVC(random_state=42, probability=True)
clf_svm.fit(x_res_train_encoded_scaled, y_res_train)

#calculate overall accuracy
y_pred = clf_svm.predict(x_res_test_encoded_scaled)
accuracy = accuracy_score(y_res_test, y_pred)
print(f'Accuracy: {accuracy:.2%}')

# FIX: display labels must follow the estimator's class order. scikit-learn sorts the
# string targets, so 'Defaulted' comes before 'Fully Repaid'; the previous list
# ['Did Not Default', 'Defaulted'] labelled every cell of the matrix with the wrong class.
class_names = ['Defaulted', 'Did Not Default']
disp = ConfusionMatrixDisplay.from_estimator(
        clf_svm,
        x_res_test_encoded_scaled,
        y_res_test,
        display_labels=class_names,
        cmap=plt.cm.Blues)

In [None]:
# plot top 10 features
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
import numpy as np

# permutation importance on the scaled test set; only 2 repeats, run on every core, to limit run time
perm_importance = permutation_importance(
    estimator=clf_svm,
    X=x_res_test_encoded_scaled,
    y=y_res_test,
    n_repeats=2,
    random_state=42,
    n_jobs=-1,
)

# mean importance per feature and the matching positional indices
feature_importances = perm_importance.importances_mean
feature_indices = np.arange(len(feature_importances))

# feature positions ordered from most to least important
sorted_indices = np.argsort(feature_importances)[::-1]

# print the top features by position
top_features = 10  # Choose the number of top features to display
for i in range(top_features):
    print(f"Feature {sorted_indices[i]}: Importance {feature_importances[sorted_indices[i]]}")

# translate positions into column names of x_res_train_encoded
feature_names = x_res_train_encoded.columns
top_positions = sorted_indices[:top_features]
top_feature_names = feature_names[top_positions]

# print the top feature names and their importances
for i in range(top_features):
    print(f"{top_feature_names[i]}: {feature_importances[sorted_indices[i]]}")

# bar chart of the top features, labelled with their names
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(top_features), feature_importances[top_positions], color='skyblue')
ax.set_xlabel('Feature')
ax.set_ylabel('Permutation Importance')
ax.set_title('Top 10 Features by Permutation Importance')
ax.set_xticks(range(top_features))
ax.set_xticklabels(feature_names[top_positions], rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# plot ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# predicted probability of the class stored at clf_svm.classes_[1]
y_pred_proba = clf_svm.predict_proba(x_res_test_encoded_scaled)[:, 1]

# FIX: y_res_test holds string labels, so roc_curve must be told which label is the
# positive one; without pos_label it raises ValueError. The positive label is the class
# whose probability column was selected above.
fpr, tpr, _ = roc_curve(y_res_test, y_pred_proba, pos_label=clf_svm.classes_[1])
roc_auc = auc(fpr, tpr)

# plot ROC curve against the random-classifier diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# optimize model with hyperparameter tuning
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# reduced grid actually searched; the wider ranges tried earlier are kept as trailing comments
param_grid = dict(
    C=[100], #[0.5,0.1,1,10,100,1000]
    gamma=[1,'scale'],#['scale', 1,0.1, 0.01,0.001,0.0001]
    kernel=['rbf'],
    probability=[True],
)

# 5-fold search scored on accuracy, with per-fit progress output
optimal_params = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=5, verbose=3)
optimal_params.fit(x_res_train_encoded_scaled, y_res_train)

## see "best" parameters
optimal_params.best_params_

In [None]:
## evaluate the tuned SVM (optimal_params predicts with the best estimator it refitted)
## FIX: predictions are made on the *scaled* test matrix. The model was fitted on
## x_res_train_encoded_scaled, so feeding it the raw x_res_test_encoded values produced
## meaningless predictions.
grid_predictions = optimal_params.predict(x_res_test_encoded_scaled)

## calculate overall accuracy
accuracy = accuracy_score(y_res_test, grid_predictions)
print(f'Accuracy: {accuracy:.2%}')

## plot confusion matrix
## rows / columns and tick labels both follow the estimator's own class order, so a label
## can never be attached to the wrong class
svm_class_labels = optimal_params.classes_
cm = confusion_matrix(y_res_test, grid_predictions, labels=svm_class_labels)

# plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svm_class_labels)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# plot ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

# predicted probability of the class stored at optimal_params.classes_[1]
y_pred_proba = optimal_params.predict_proba(x_res_test_encoded_scaled)[:, 1]

# calculate fpr and tpr for the ROC curve
# FIX: y_res_test holds string labels, so the positive label has to be named explicitly;
# without pos_label roc_curve raises ValueError.
fpr, tpr, thresholds = roc_curve(y_res_test, y_pred_proba, pos_label=optimal_params.classes_[1])

# calculate ROC AUC score
roc_auc = roc_auc_score(y_res_test, y_pred_proba)

# plot ROC curve against the random-classifier diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# applying PCA to reduce dimensionality
from sklearn.decomposition import PCA
# fit PCA on the scaled training matrix and keep the projected training data
pca = PCA()
x_train_pca = pca.fit_transform(x_res_train_encoded_scaled)

# percentage of variance explained by each component, rounded to one decimal
per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)
component_numbers = range(1, len(per_var)+1)
labels = list(map(str, component_numbers))

# scree plot (x tick marks and labels are hidden)
fig, ax = plt.subplots()
ax.bar(x=component_numbers, height=per_var)
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.set_ylabel("Explained variance (%)")
ax.set_xlabel('Principal Components')
ax.set_title('Scree Plot')
plt.show()

In [None]:
# keep only the first two principal components
train_pc1_coords, train_pc2_coords = x_train_pca[:, 0], x_train_pca[:, 1]

# Standardise the two component scores column-wise before fitting the SVM.
pca_train_scaled = scale(np.stack([train_pc1_coords, train_pc2_coords], axis=1))

# Single-point grid; the wider grids that were explored are kept for reference:
#   C:     [0.01, 0.1, 0.5, 1, 10, 100]
#   gamma: [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001]
param_grid = dict(C=[100], gamma=[1], kernel=['rbf'])

# 5-fold cross-validated search scored on accuracy.
# NOTE(review): this rebinds `optimal_params`, replacing the earlier search object.
optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3,
)
optimal_params.fit(pca_train_scaled, y_res_train)

optimal_params.best_params_

In [None]:
# plot PCA visualisation
# Fit the SVM on the standardised first two principal components.
clf_svm = SVC(random_state=42, C=100, gamma=1)
clf_svm.fit(pca_train_scaled, y_res_train)

# The classifier lives in the *standardised* PC space, so the points and the
# mesh it is evaluated on must use the same coordinates. Plotting raw
# pca.transform() output against a model trained on scaled scores draws the
# boundary in the wrong coordinate system.
# The names are kept for compatibility; these are the training observations.
test_pc1_coords = pca_train_scaled[:, 0]
test_pc2_coords = pca_train_scaled[:, 1]

# Plot window: data range padded by one unit on every side.
x_min = test_pc1_coords.min() - 1
x_max = test_pc1_coords.max() + 1
y_min = test_pc2_coords.min() - 1
y_max = test_pc2_coords.max() + 1

xx, yy = np.meshgrid(np.arange(start=x_min, stop=x_max, step=0.1),
                     np.arange(start=y_min, stop=y_max, step=0.1))

# Predicted class for every mesh node, reshaped back onto the grid.
Z = clf_svm.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

# visualizing the data
fig, ax = plt.subplots(figsize=(10, 10))
ax.contourf(xx, yy, Z, alpha=0.1)
cmap = colors.ListedColormap(['#e41a1c', '#4daf4a'])
scatter = ax.scatter(test_pc1_coords, test_pc2_coords, c=y_res_train, cmap=cmap,
                     s=100, edgecolors='k', alpha=0.7)

# Build the legend entries once instead of calling legend_elements() twice.
legend_handles, legend_labels = scatter.legend_elements()
legend = ax.legend(legend_handles, legend_labels, loc='upper right')
# NOTE(review): assumes y_res_train is numeric with the lower value meaning
# "defaulted" -- confirm against the label encoding used upstream.
legend.get_texts()[0].set_text('Defaulted')
legend.get_texts()[1].set_text('Did not Default')
ax.set_ylabel('PC2')
ax.set_xlabel('PC1')
ax.set_title('Visualizing the Decision Boundary Using Principal Components')
plt.show()

## Neural Network

### Encoding

In [None]:
# categorical variable encoder
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the current name first and fall back for older installations
# so the cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the resampled training set only; the test set reuses the fitted
# categories (unseen categories encode as all-zero rows via handle_unknown).
x_train_encoded = pd.DataFrame(encoder.fit_transform(x_res_train[categorical_cols]), index=x_res_train.index)
x_test_encoded = pd.DataFrame(encoder.transform(x_test[categorical_cols]), index=x_test.index)

# y LabelEncoder: classes are fitted on the training labels and reused for test
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_res_train)
y_test_encoded = label_encoder.transform(y_test)

# combine encoded categorical data and numerical data (aligned on the row index)
# NOTE(review): the numerical columns are concatenated as-is; confirm whether
# they were standardised upstream, since the network receives them unscaled here.
x_train_processed = pd.concat([x_train_encoded, x_res_train[numerical_cols]], axis=1)
x_test_processed = pd.concat([x_test_encoded, x_test[numerical_cols]], axis=1)

### Model Training

In [None]:
# Feed-forward binary classifier: 64 -> 32 ReLU units, single sigmoid output.
model = Sequential([
    Dense(64, activation='relu', input_dim=x_train_processed.shape[1]),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs in mini-batches of 32, with 20% of the training data
# held out for validation.
# NOTE(review): Keras takes validation_split from the final rows before
# shuffling -- check that the resampled training set is not ordered by class.
model.fit(
    x_train_processed,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Loss and accuracy on the test set.
loss, accuracy = model.evaluate(x_test_processed, y_test_encoded)
print('Test Accuracy:', accuracy)

# Sigmoid outputs thresholded at 0.5 give hard class predictions.
y_pred = model.predict(x_test_processed)
y_class_pred = (y_pred > 0.5).astype(int)

In [None]:
## confusion matrix of the neural network on the test set
# NOTE(review): label order assumes encoded class 0 is "Defaulted" -- confirm
# against label_encoder.classes_.
class_names = ['Defaulted', 'Fully Repaid']

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(np.array(y_test_encoded), y_class_pred)

disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)

plt.show()

### Compare ROC curves of all models

In [None]:
# Folder holding the score / label exports produced by the other models.
# NOTE(review): machine-specific absolute path -- point this at your own copy
# of the exported CSV files before running.
PRESENTATION_DIR = "C:/Users/12095/Desktop/Big Data I/Presentation"


def _load_presentation_column(filename, column):
    """Read `column` from the CSV `filename` in PRESENTATION_DIR as a NumPy array.

    The original cell called `reset_index(drop=True)` after several reads and
    discarded the result, which had no effect; a fresh `read_csv` frame
    already has a default index, so those calls are dropped.
    """
    return np.array(pd.read_csv(f"{PRESENTATION_DIR}/{filename}")[column])


# Logistic regression: the stored score is complemented and the labels are
# flipped (1 -> 0, everything else -> 1).
pred_log = 1 - _load_presentation_column("output_prob.csv", "0")
test_log = np.where(_load_presentation_column("test_log.csv", "0") == 1, 0, 1)

# Support vector machine: scores and labels are used as exported.
pred_svm = _load_presentation_column("pred_SVM(3).csv", "0")
test_svm = _load_presentation_column("test_SVM(3).csv", "loan_status")

# Boosted tree: scores and labels are used as exported.
pred_tree = _load_presentation_column("y_pred_proba_boosted.csv", "0")
test_tree = _load_presentation_column("test_tree.csv", "loan_status")

# Neural network: flip labels and hard predictions the same way as the
# logistic labels (1 -> 0, everything else -> 1).
test_nn = np.where(y_test_encoded == 1, 0, 1)
pred_nn = np.where(y_class_pred.flatten() == 1, 0, 1)

In [None]:
# calculate ROC value
# The network output is a column vector (single sigmoid unit), so flatten it
# to the 1-D score array roc_curve expects. It is complemented to match the
# flipped labels in test_nn.
fpr_nn, tpr_nn, thresholds_nn = roc_curve(test_nn, 1 - np.ravel(y_pred))
fpr_svm, tpr_svm, thresholds_svm = roc_curve(test_svm, pred_svm)
fpr_log, tpr_log, thresholds_log = roc_curve(test_log, pred_log)
fpr_tree, tpr_tree, thresholds_tree = roc_curve(test_tree, pred_tree)

# calculate AUC value
roc_auc_nn = auc(fpr_nn, tpr_nn)
roc_auc_svm = auc(fpr_svm, tpr_svm)
roc_auc_log = auc(fpr_log, tpr_log)
roc_auc_tree = auc(fpr_tree, tpr_tree)

# plot the ROC curve: one line per model, drawn in a fixed order and colour
roc_series = [
    (fpr_nn, tpr_nn, 'darkorange', 'Neural Network', roc_auc_nn),
    (fpr_svm, tpr_svm, 'red', 'Support Vector Machine', roc_auc_svm),
    (fpr_log, tpr_log, 'blue', 'Logistic', roc_auc_log),
    (fpr_tree, tpr_tree, 'green', 'Boosted Tree', roc_auc_tree),
]

plt.figure()
for curve_fpr, curve_tpr, curve_color, curve_name, curve_auc in roc_series:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2,
             label='%s (AUC = %0.2f)' % (curve_name, curve_auc))

# chance-level diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()