In [None]:
# Attach the user's Google Drive to the Colab VM so files stored under
# MyDrive can be read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
%matplotlib inline
import pandas as pd
path= ('/content/drive/MyDrive/Colab Notebooks/GenAI/loan_data_nov2023.csv')
data = pd.read_csv(path)
data.head()

Unnamed: 0,default,amount,interest,grade,years,ownership,income,age
0,0,5000,10.65,B,10.0,RENT,24000.0,33
1,0,2400,10.99,C,25.0,RENT,12252.0,31
2,0,10000,13.49,C,13.0,RENT,49200.0,24
3,0,5000,10.99,A,3.0,RENT,36000.0,39
4,0,3000,10.99,E,9.0,RENT,48000.0,24


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Compare several classifiers for predicting loan default.
#
# Steps: split the data, build a shared preprocessing step, grid-search each
# candidate model with 5-fold cross-validation on the training split, then
# report held-out test metrics and pick the best model by cross-validated score.

# --- Splitting the dataset ---------------------------------------------------
# `default` is the target; every other column is a candidate feature.
X = data.drop('default', axis=1)
y = data['default']
# The target is imbalanced (defaults are the minority class), so stratify to
# keep the class ratio identical in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Data preprocessing ------------------------------------------------------
numerical_features = ['amount', 'interest', 'years', 'income', 'age']
categorical_features = ['grade', 'ownership']

# Standardise numeric columns (zero mean, unit variance).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# One-hot encode categoricals; categories unseen during fit are encoded as
# all-zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Columns not listed in either feature list are dropped (ColumnTransformer's
# default `remainder='drop'`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# --- Models and their hyperparameters ----------------------------------------
# Each entry maps a display name to (estimator, param_grid). Grid keys carry the
# `classifier__` prefix because the estimator is the 'classifier' pipeline step.
models = {
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {'classifier__max_depth': [None, 10, 20, 30]}),
    'RandomForest': (RandomForestClassifier(random_state=42), {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [None, 10, 20]}),
    'XGBoost': (XGBClassifier(random_state=42), {'classifier__n_estimators': [100, 200], 'classifier__learning_rate': [0.01, 0.1]}),
    'NaiveBayes': (GaussianNB(), {}),
    'SVM': (SVC(random_state=42), {'classifier__C': [1, 10], 'classifier__gamma': [0.001, 0.01]})
}

# Metric used to choose hyperparameters. Plain accuracy rewards a model that
# always predicts the majority class on an imbalanced target; balanced accuracy
# (mean of per-class recall) does not.
TUNING_SCORING = 'balanced_accuracy'

# --- Training, tuning, and evaluating models ---------------------------------
# results[name] -> {'accuracy', 'report', 'best_params', 'cv_score'}
results = {}
for name, (model, params) in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring=TUNING_SCORING, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the pipeline refit on the full training split with
    # the winning hyperparameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0.0 (without a warning) for a class the model
    # never predicts, instead of emitting UndefinedMetricWarning.
    report = classification_report(y_test, y_pred, zero_division=0)
    results[name] = {
        'accuracy': accuracy,
        'report': report,
        'best_params': grid_search.best_params_,
        'cv_score': grid_search.best_score_,
    }
    print(f"{name} - Accuracy: {accuracy}\n")
    print(f"Classification Report for {name}:\n{report}\n")
    print(f"Best Parameters for {name}:\n{grid_search.best_params_}\n")

# --- Selecting the best model ------------------------------------------------
# Rank by the cross-validated tuning score rather than test accuracy, so the
# test split is not used for model selection.
best_model_name = max(results, key=lambda model_name: results[model_name]['cv_score'])
print(
    f"Best performing model: {best_model_name} "
    f"(CV {TUNING_SCORING}: {results[best_model_name]['cv_score']:.4f}, "
    f"test accuracy: {results[best_model_name]['accuracy']:.4f})"
)

DecisionTree - Accuracy: 0.8754081457295068

Classification Report for DecisionTree:
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      5160
           1       0.16      0.02      0.04       659

    accuracy                           0.88      5819
   macro avg       0.53      0.50      0.49      5819
weighted avg       0.81      0.88      0.83      5819


Best Parameters for DecisionTree:
{'classifier__max_depth': 10}

RandomForest - Accuracy: 0.8865784499054821

Classification Report for RandomForest:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      5160
           1       0.00      0.00      0.00       659

    accuracy                           0.89      5819
   macro avg       0.44      0.50      0.47      5819
weighted avg       0.79      0.89      0.83      5819


Best Parameters for RandomForest:
{'classifier__max_depth': 10, 'classifier__n_estimators': 200}



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XGBoost - Accuracy: 0.8867503007389586

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      5160
           1       0.00      0.00      0.00       659

    accuracy                           0.89      5819
   macro avg       0.44      0.50      0.47      5819
weighted avg       0.79      0.89      0.83      5819


Best Parameters for XGBoost:
{'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}

NaiveBayes - Accuracy: 0.8626911840522427

Classification Report for NaiveBayes:
              precision    recall  f1-score   support

           0       0.89      0.96      0.93      5160
           1       0.22      0.08      0.12       659

    accuracy                           0.86      5819
   macro avg       0.55      0.52      0.52      5819
weighted avg       0.81      0.86      0.83      5819


Best Parameters for NaiveBayes:
{}

SVM - Accuracy: 0.8867503007389586

Classification Re

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
