# Exploratory Data Analysis & Model Fitting

In [None]:
import os
import sys

# Make the project root (the parent of the notebook's working directory)
# importable, so that `from src import ...` in later cells resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard the append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import pandas as pd

# Lift the column limit so wide DataFrames are rendered without truncation.
pd.options.display.max_columns = None

## Data Preprocessing

### Load the dataset

In [None]:
from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

from src import preprocessing

# Loading is slow (2-4 min), so the first run caches a copy in `df_backup`;
# every later run of this cell restores `df` from that cache instead.
if "df_backup" in globals():
    df = df_backup.copy()  # type: ignore
else:
    df = preprocessing.load_data()
    df_backup = df.copy()

### Brief Inspection

In [None]:
# Print a concise summary of the loaded frame.
df.info()

In [None]:
# List the dtype of every column; the row limit is lifted only for this
# display so that no column is elided from the listing.
column_dtypes = df.dtypes
with pd.option_context('display.max_rows', None):
    display(column_dtypes)

In [None]:
# Each entry pairs a printed heading with a callable producing the view to
# display: a preview of the rows, per-column missing-value counts, and
# summary statistics over all columns.
inspection_views = (
    ("First few rows:", df.head),
    ("\nMissing values:", lambda: df.isnull().sum()),
    ("\nSummary statistics:", lambda: df.describe(include='all')),
)

for heading, build_view in inspection_views:
    print(heading)
    display(build_view())

### Data Cleaning

In [None]:
# Remove features that are independent of the target variable
# and those that could cause data leakage
df = preprocessing.drop_cols(df)

# A bare `df.head()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is rendered), so display it explicitly.
display(df.head())

# Remove features with too many missing values
df = preprocessing.drop_sparse_cols(df)

# Convert date columns to unix timestamps
df = preprocessing.convert_dates(df)

# Impute missing values
df = preprocessing.impute_missing_values(df)


In [None]:
# Preview the frame after the cleaning steps.
df.head()

### Categorical Encoding

In [None]:
from src.feature_engineering import encode_target, frequency_encoding, onehot_encoding

# Run the encoders in a fixed order: the target variable first, then the
# one-hot encoder, then the frequency encoder. Each step takes the frame and
# returns the frame that feeds the next step.
for encode_step in (encode_target, onehot_encoding, frequency_encoding):
    df = encode_step(df)

### Drop highly correlated feature pairs

In [None]:
from src import algebra, visualize
from src.feature_engineering import drop_high_corr

# Compute the correlation matrix of the encoded frame once; it is reused
# below for both the plot and the feature pruning.
corr = algebra.correlation_matrix(df)

# plot correlation matrix
visualize.correlation_matrix(corr)

# Prune features using `corr`. Per the section heading this targets highly
# correlated pairs; the threshold and which member of a pair is dropped are
# decided inside drop_high_corr (not visible here).
df = drop_high_corr(df, corr)

### Feature Engineering

In [None]:
# NOTE(review): `frequency_encoding` is imported again here but is not used
# in this cell.
from src.feature_engineering import frequency_encoding, new_features

# Add the engineered features defined in src.feature_engineering.new_features.
df = new_features(df)

### Scale feature values

In [None]:
from src.preprocessing import scale_features

# Rescale the feature values with the project helper.
# NOTE(review): this runs on the full frame, before the train/test split
# further down. If scale_features fits its statistics on the data it is
# given, test-set information leaks into training - confirm, or fit the
# scaler on the training split only.
df = scale_features(df)

### Take another peek

In [None]:
# Preview the frame after encoding, feature engineering and scaling.
df.head()

### Handle data imbalance

In [None]:
# Plot the class distribution to judge how imbalanced the data is
# (uses the `visualize` module imported in the correlation cell above).
visualize.class_dist(df)

In [None]:
from src import training

# Resample the frame (per the section heading, to counter class imbalance);
# the helper returns the features and the target separately.
# NOTE(review): resampling happens before the train/test split below, so
# resampled rows can end up in the test set - presumably inflating the test
# metrics. Consider splitting first and resampling only the training part.
X_resampled, y_resampled = training.resample(df)

### Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for testing; the fixed seed keeps the
# partition identical between runs.
split_options = {"test_size": 0.2, "random_state": 42}

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, **split_options
)

### Feature Selection using RFE

In [None]:
# Takes about 5.5 minutes to run

from sklearn.feature_selection import RFE
from src.model import create_model

# Recursive feature elimination: keep the 10 features ranked best by the
# project model, fitted on the training split only.
rfe_model = create_model(eval_metric='logloss')
rfe = RFE(estimator=rfe_model, n_features_to_select=10)
rfe.fit(X_train, y_train)

# Names of the surviving columns (boolean mask over the original columns).
cols = X_train.columns[rfe.support_]

# rfe.transform returns a plain array, so the frames are rebuilt around it.
# The original row index is passed back in explicitly: without it the frames
# get a fresh 0..n-1 index and no longer line up by label with y_train /
# y_test, which still carry the index produced by the split.
X_train = pd.DataFrame(rfe.transform(X_train), columns=cols, index=X_train.index) # type: ignore
X_test = pd.DataFrame(rfe.transform(X_test), columns=cols, index=X_test.index) # type: ignore

In [None]:
# Preview the test features after feature selection.
X_test.head()

## Training

### Train the model

In [None]:
# Fresh model instance (separate from the one used inside RFE), configured
# the same way.
model = create_model(eval_metric='logloss')

# Fit on the training split restricted to the RFE-selected features.
model.fit(X_train, y_train)

## Inference
### Make predictions

In [None]:
# Hard class labels for the held-out set.
y_pred = model.predict(X_test)

# Per-class probabilities; column 1 is kept as the score used for ROC-AUC
# (presumably the positive class - verify against the target encoding).
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

## Evaluation

### Accuracy

In [None]:
# Accuracy
from sklearn.metrics import accuracy_score

# Compare the predicted labels with the true test labels and report the
# score to four decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

### Classification Report

In [None]:
# Classification report
from sklearn.metrics import classification_report

# Print the heading on its own line. Passing the report as a second argument
# to print() puts a separator space in front of it, which shifts the first
# row of the table out of alignment with the rows below.
print("Classification Report:")
print(classification_report(y_test, y_pred))

### Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted test labels, drawn by the project's
# plotting helper.
visualize.confusion_matrix(y_test, y_pred)

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score

# ROC-AUC score
# (computed from the column-1 probabilities, not from the hard labels)
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Plot ROC curve
# (the score is passed along so the helper can use it in the plot)
visualize.roc_curve(y_test, y_proba, roc_auc)

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
# (5 folds over the training split, scored by ROC-AUC)
# NOTE(review): the features were selected by RFE on all of X_train and the
# data was resampled before splitting, so every fold has already influenced
# the inputs - these scores are likely optimistic.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')

print(f"Cross-Validation ROC-AUC Scores: {cv_scores}")
print(f"Mean Cross-Validation ROC-AUC Score: {cv_scores.mean():.4f}")

## Feature importance

In [None]:
# calculate correlations between the features in X_train and the target variable y_train
# The target is attached as a new `loan_status` *column*. (A plain
# pd.concat([X_train, y_train]) stacks along the rows, so no such column is
# produced.) It is attached by position rather than by index label, so the
# rows stay paired even when X_train and y_train do not share an index.
df_train = X_train.assign(loan_status=np.ravel(y_train))
correlations = df_train.corr()['loan_status'].sort_values().to_frame()

# plot the correlations
# (the target's correlation with itself is dropped; it is always 1)
visualize.correlations(correlations.drop('loan_status'))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the training split:
# 3 shuffles per feature, seeded for reproducibility.
result = permutation_importance(model, X_train, y_train, n_repeats=3, random_state=42)

# Mean importance per feature across the repeats.
importances = result.importances_mean

# One row per feature, most important first.
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Show the whole table, without row truncation.
with pd.option_context('display.max_rows', None):
    display(feature_importances)


## Hyperparameter Tuning

### HP search

In [None]:
import random

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats


# `subsample` is drawn from a normal centred on 0.85 (sd 0.05) truncated to
# [0.5, 1.0]. An unbounded normal occasionally draws values above 1.0, which
# is not a valid sampling fraction. The lower bound is 7 standard deviations
# out and is only there because truncnorm needs one.
SUBSAMPLE_MEAN, SUBSAMPLE_SD = 0.85, 0.05
SUBSAMPLE_MIN, SUBSAMPLE_MAX = 0.5, 1.0

param_dist = {
    # NOTE(review): randint's upper bound is exclusive, so this always yields
    # 2 - widen the range if max_depth is really meant to be searched.
    "max_depth": stats.randint(2, 3),
    "learning_rate": stats.uniform(loc=0.93, scale=0.07),  # [0.93, 1.00]
    "n_estimators": stats.randint(100, 1000),  # 100..999
    "subsample": stats.truncnorm(
        a=(SUBSAMPLE_MIN - SUBSAMPLE_MEAN) / SUBSAMPLE_SD,
        b=(SUBSAMPLE_MAX - SUBSAMPLE_MEAN) / SUBSAMPLE_SD,
        loc=SUBSAMPLE_MEAN,
        scale=SUBSAMPLE_SD,
    ),
    "colsample_bytree": stats.uniform(loc=0.98, scale=0.02),  # [0.98, 1.00]
}

# Setup the randomized search with 30 iterations.
# `scoring` is set explicitly: without it the search ranks candidates by the
# estimator's default score, which would not match the ROC-AUC label printed
# below nor the metric used in the cross-validation cell.
search = RandomizedSearchCV(
    estimator=create_model(eval_metric='logloss'),
    param_distributions=param_dist,
    n_iter=30, # ~9sec/iter
    scoring='roc_auc',
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
)

# Fit the model
search.fit(X_train, y_train)

# Best Model
print("Best Parameters:", search.best_params_)
print(f"Best ROC-AUC Score: {search.best_score_:.8f}")

### Retrain with these hyperparameters

In [None]:
# Rebuild the model from the winning hyperparameters and refit it on the
# whole training split. `eval_metric` is passed as in every other
# create_model call in this notebook, so the retrained model is configured
# like the estimator that was searched.
best_params = search.best_params_
model_best = create_model(eval_metric='logloss', **best_params)
model_best.fit(X_train, y_train)

### Evaluate the optimized model

In [None]:
# Predict on test data
y_pred_best = model_best.predict(X_test)
y_proba_best = model_best.predict_proba(X_test)[:,1]

# Accuracy
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Optimized Accuracy: {accuracy_best:.4f}")

# Print the heading on its own line. Passing the report as a second argument
# to print() puts a separator space in front of it, which shifts the first
# row of the table out of alignment with the rows below.
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_best))

In [None]:
# ROC-AUC score
# NOTE(review): this rebinds `roc_auc`, overwriting the baseline model's
# value computed in the earlier ROC cell.
roc_auc = roc_auc_score(y_test, y_proba_best)
print(f"Optimized ROC-AUC Score: {roc_auc:.4f}")

# Plot ROC curve
visualize.roc_curve(y_test, y_proba_best, roc_auc)