<a href="https://colab.research.google.com/github/amfei/Credit_Risk_Modeling/blob/main/Model_Validation_loan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import xgboost as xgb
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE


# Configure the root logger once for the whole script: report INFO and above,
# prefixing each record with its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the dataset
def load_data(filepath):
    """Read a CSV file and return it with incomplete rows removed.

    Args:
        filepath: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A DataFrame holding only the rows that have no missing values.
        The original row labels are kept, so the index may have gaps.

    Raises:
        Exception: Whatever reading or cleaning raised; the error is logged
            first and then re-raised unchanged.
    """
    try:
        frame = pd.read_csv(filepath)
        # Drop every row containing at least one missing value.
        frame.dropna(inplace=True)
    except Exception as err:
        logging.error(f"Error loading data: {err}")
        raise
    else:
        return frame

# Data preprocessing
def preprocess_data(data, feature_names):
    """Build the scaled design matrix and the target vector.

    The selected feature columns are one-hot encoded with ``OHE`` and then
    standardised with a freshly fitted ``StandardScaler``.

    Args:
        data: DataFrame containing the feature columns and ``loan_status``.
        feature_names: Names of the columns to use as model inputs.

    Returns:
        A tuple ``(X_scaled, y, scaler)``: the standardised feature array, the
        ``loan_status`` column, and the fitted scaler.

    NOTE(review): the scaler is fitted on every row passed in, so if the
    caller splits into train/test afterwards the test rows have influenced
    the scaling statistics.
    """
    selected = data[feature_names]
    target = data['loan_status']

    encoded = OHE(selected)

    scaler = StandardScaler()
    standardised = scaler.fit_transform(encoded)
    return standardised, target, scaler

# Model training
def train_model(X_train, y_train):
    """Fit a logistic-regression classifier on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator (``random_state=42``).
    """
    # Alternative estimator that was tried and left disabled:
    # xgb.XGBClassifier(scale_pos_weight=4, max_delta_step=1, subsample=0.8,
    #                   colsample_bytree=0.8, eval_metric='auc')
    classifier = LogisticRegression(random_state=42)
    classifier.fit(X_train, y_train)
    return classifier

# Model evaluation
def evaluate_model(model, X_test, y_test, X_train=None, y_train=None):
    """Print hold-out metrics and cross-validated AUC for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Hold-out feature matrix.
        y_test: Hold-out labels.
        X_train: Optional training features used for 5-fold cross-validation.
        y_train: Optional training labels used for 5-fold cross-validation.

    Returns:
        A tuple ``(y_pred_proba, y_pred)``: the predicted probability of the
        positive class (second ``predict_proba`` column) and the predicted
        labels for ``X_test``.

    Notes:
        Earlier versions read ``X_train`` / ``y_train`` straight from module
        scope. To stay compatible with callers that still rely on that, the
        module-level names are used when the arguments are omitted; if they do
        not exist either, cross-validation is skipped with a warning instead
        of failing with ``NameError``.
    """
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f'Test AUC Score: {auc_score}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}')
    print(f'Classification Report:\n {classification_report(y_test, y_pred)}')

    # Cross-validation
    if X_train is None or y_train is None:
        # Legacy fallback: pick up the training split defined at module level.
        module_scope = globals()
        X_train = module_scope.get('X_train')
        y_train = module_scope.get('y_train')

    if X_train is None or y_train is None:
        logging.warning('No training data available; skipping cross-validation.')
    else:
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
        print(f'Cross-Validation AUC Scores: {cv_scores}')
        print(f'Average Cross-Validation AUC Score: {np.mean(cv_scores)}')

    return y_pred_proba, y_pred


# ROC Curve
def plot_roc_curve(y_test, y_pred_proba):
    """Draw the ROC curve with its area and the diagonal reference line.

    Args:
        y_test: True labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred_proba)
    area = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots()
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (area = {area:.2f})',
    )
    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc='lower right')
    plt.show()

# Calibration Plot
def plot_calibration_curve(y_test, y_pred_proba):
    """Plot observed versus predicted probability over 10 bins.

    Args:
        y_test: True labels.
        y_pred_proba: Predicted probabilities for the positive class.
    """
    observed_rate, mean_predicted = calibration_curve(y_test, y_pred_proba, n_bins=10)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(mean_predicted, observed_rate, marker='o', label='Logistic Regression')
    # The diagonal is where predicted and observed probabilities coincide.
    ax.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability')
    ax.set_title('Calibration plot: how well model predicted probabilities reflect the actual outcomes')
    ax.legend()
    plt.show()

# Lift Chart
def plot_lift_chart(y_test, y_pred_proba, n_bins=10):
    """Plot the lift of each predicted-probability quantile bin.

    Lift of a bin is the number of positives in the bin divided by the number
    expected if the overall positive rate applied to it. Bins are ordered from
    the highest predicted probabilities to the lowest.

    Args:
        y_test: True labels.
        y_pred_proba: Predicted probabilities for the positive class.
        n_bins: Requested number of quantile bins. Fewer bins may result when
            quantile edges coincide, because duplicate edges are dropped.
    """
    data = pd.DataFrame({'y_true': y_test, 'y_prob': y_pred_proba})
    data['bin'] = pd.qcut(data['y_prob'], n_bins, duplicates='drop')

    # observed=False is spelled out so every bin produced by qcut is kept
    # regardless of the pandas default for categorical groupers.
    lift_df = data.groupby('bin', observed=False).agg({'y_true': ['sum', 'count']})
    lift_df.columns = ['sum', 'count']

    overall_rate = data['y_true'].sum() / len(data)
    lift_df['lift'] = lift_df['sum'] / (lift_df['count'] * overall_rate)
    lift_df = lift_df.sort_index(ascending=False)

    # Label only the bins that actually exist; qcut may have returned fewer
    # than n_bins, and ticks built from n_bins would point at missing data.
    n_groups = len(lift_df)

    plt.figure(figsize=(10, 6))
    plt.plot(lift_df['lift'].values, marker='o', linestyle='--')
    plt.xlabel('Decile')
    plt.ylabel('Lift')
    plt.title('Lift Chart: ratio of the actual number of defaults in the decile to the expected number of defaults')
    plt.xticks(range(n_groups), [f'Decile {i+1}' for i in range(n_groups)])
    plt.grid(True)
    plt.show()

# Assumption Checking: Multicollinearity
def plot_correlation_matrix(X_scaled, feature_names):
    """Show an annotated heatmap of pairwise feature correlations.

    Args:
        X_scaled: Feature matrix whose columns correspond to ``feature_names``.
        feature_names: Column labels to attach to ``X_scaled``.
    """
    features = pd.DataFrame(X_scaled, columns=feature_names)
    correlations = features.corr()

    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Matrix')
    plt.show()

# Sensitivity Analysis
def sensitivity_analysis(model, data, scaler, feature_names):
    """Plot predicted default probability while sweeping one feature at a time.

    For each name in ``feature_names`` a 100-row frame is built in which four
    fixed baseline columns are held at their means in ``data`` and the swept
    feature runs linearly from its minimum to its maximum. The frame is
    transformed with ``scaler`` and scored with ``model``.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        data: DataFrame supplying the feature ranges and baseline means.
        scaler: Fitted transformer applied before prediction.
        feature_names: Features to sweep.

    NOTE(review): the baseline columns are hard-coded (``person_age`` is
    deliberately left out), so they presumably must match what ``scaler`` and
    ``model`` were fitted on; confirm before re-enabling this analysis.
    """
    n_points = 100
    baseline_columns = (
        'person_emp_length',
        'loan_int_rate',
        'loan_percent_income',
        'cb_person_cred_hist_length',
    )

    for feature in feature_names:
        sweep = np.linspace(data[feature].min(), data[feature].max(), n_points)

        # Every row starts at the column means; only `feature` varies.
        baseline = {name: data[name].mean() for name in baseline_columns}
        grid = pd.DataFrame(baseline, index=range(n_points))
        grid[feature] = sweep

        default_proba = model.predict_proba(scaler.transform(grid))[:, 1]

        _, ax = plt.subplots(figsize=(7, 5))
        ax.plot(sweep, default_proba, label='Predicted default probability')
        ax.set_xlabel(feature)
        ax.set_ylabel('Predicted Default Probability')
        ax.set_title(f'Sensitivity Analysis: Effect of {feature} on Default Probability')
        ax.legend()
        ax.grid()
        plt.show()



def countplot(data):
    """Display a seaborn count plot for a single categorical variable.

    Args:
        data: Values passed straight through to ``sns.countplot``.
    """
    sns.countplot(data)
    # Render the figure.
    plt.show()
def plot_precision_recall_curve(y_test, y_pred_proba):
    """Plot the precision-recall curve together with its no-skill baseline.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted probabilities for the positive class.

    The dashed reference line sits at the share of positive samples, which is
    the precision of a classifier that ignores the features. A fixed 0.5 line
    is only correct for perfectly balanced classes.
    """
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

    # Assumes the positive class is labelled 1 (the default pos_label for
    # 0/1 or -1/1 labels) -- TODO confirm if other label encodings are used.
    prevalence = np.mean(np.asarray(y_test) == 1)

    plt.figure(figsize=(6, 6))
    plt.plot([0, 1], [prevalence, prevalence], 'k--',
             label=f'No skill (prevalence = {prevalence:.2f})')
    plt.plot(recall, precision, label='Model')
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.title('Precision-Recall Curve (PRC)')
    plt.legend()
    plt.show()

def calc_vif(X):
    """Compute, print and return the variance inflation factor per column.

    Args:
        X: DataFrame of explanatory variables.

    Returns:
        A DataFrame with a ``variables`` column (the column names of ``X``)
        and a ``VIF`` column (the corresponding inflation factors).
    """
    design = X.values
    factors = [variance_inflation_factor(design, position) for position in range(X.shape[1])]

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = factors

    print(vif)
    return vif


def OHE(X):
    """One-hot encode the categorical loan features and append the numeric ones.

    The four categorical columns are cast to strings and expanded into 0/1
    indicator columns (all levels kept, ``drop_first=False``). Every other
    column except ``loan_status`` is passed through unchanged.

    Args:
        X: DataFrame containing ``person_home_ownership``, ``loan_intent``,
            ``loan_grade`` and ``cb_person_default_on_file``, plus any
            numerical feature columns. It is not modified.

    Returns:
        A new DataFrame with the indicator columns (in the order listed
        above) followed by the numerical columns in their original order.

    Raises:
        KeyError: If one of the categorical columns is missing from ``X``.
    """
    categorical_columns = [
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'cb_person_default_on_file',
    ]
    excluded_columns = set(categorical_columns) | {'loan_status'}

    # Cast to str on the fly instead of writing back into X, so the caller's
    # frame (often a slice of a larger one) is never mutated.
    indicator_frames = [
        pd.get_dummies(X[column].astype(str), prefix=column, drop_first=False).astype(int)
        for column in categorical_columns
    ]

    # Get all the numerical variable names
    numerical_columns = [column for column in X.columns if column not in excluded_columns]

    # Concatenate the one-hot-encoding columns and numerical columns as the input data
    return pd.concat([*indicator_frames, X[numerical_columns]], axis=1)



def over_sampling(X_train, y_train):
    """Resample the training data with SMOTE.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A tuple ``(X_res, y_res)`` with the resampled features and labels,
        produced by ``SMOTE(random_state=33).fit_resample``.
    """
    sampler = SMOTE(random_state=33)
    resampled_features, resampled_labels = sampler.fit_resample(X_train, y_train)
    return resampled_features, resampled_labels


# Main execution: load the data, build features, fit the classifier, report
# metrics and diagnostic plots, then persist the fitted model.
# NOTE: the names below are deliberately module-level; evaluate_model's
# cross-validation step uses X_train / y_train from module scope, so moving
# this code into a function would hide them from it.
if __name__ == '__main__':
    data = load_data('credit_risk_dataset.csv')

    # Four numerical features followed by the four categorical ones that
    # OHE expands into indicator columns.
    feature_names = ['person_emp_length', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'person_home_ownership', 'loan_intent' ,'loan_grade', 'cb_person_default_on_file']

    # One-hot encode and standardise; the scaler is fitted on all rows here,
    # i.e. before the train/test split below.
    X_scaled, y, scaler = preprocess_data(data,  feature_names)

    # Optional multicollinearity check (disabled).
    #vif = calc_vif(data[feature_names]).sort_values(by = 'VIF', ascending = False)

    # 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Class balance of the training labels.
    print(y_train.value_counts())


    # Optional SMOTE oversampling of the training split (disabled).
    # X_train, y_train = over_sampling(X_train, y_train)

    # NOTE(review): y_res is not defined at this level -- the line above
    # rebinds y_train -- so print y_train.value_counts() if this is re-enabled.
    # print(y_res.value_counts())

    model = train_model(X_train, y_train)

    # Prints hold-out metrics and cross-validated AUC; returns the positive
    # class probabilities and the predicted labels for the test split.
    y_pred_proba, y_pred = evaluate_model(model, X_test, y_test)

    plot_precision_recall_curve(y_test, y_pred_proba)
    plot_roc_curve(y_test, y_pred_proba)
    plot_calibration_curve(y_test, y_pred_proba)
    plot_lift_chart(y_test, y_pred_proba)



    # Disabled. NOTE(review): feature_names above includes categorical
    # columns, while sensitivity_analysis sweeps each feature numerically and
    # transforms a frame of four hard-coded numerical columns, which does not
    # match the one-hot encoded input the scaler was fitted on here.
    #sensitivity_analysis(model, data, scaler, feature_names)

    # Save the model
    # NOTE(review): only the estimator is persisted; the fitted scaler is not
    # saved alongside it.
    joblib.dump(model, 'logistic_regression_model.pkl')
    print('Model saved as logistic_regression_model.pkl')
