In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
path= ('/content/drive/MyDrive/Colab Notebooks/GenAI/loan_data_nov2023.csv')
loan_data = pd.read_csv(path)
loan_data.head()

Unnamed: 0,default,amount,interest,grade,years,ownership,income,age
0,0,5000,10.65,B,10.0,RENT,24000.0,33
1,0,2400,10.99,C,25.0,RENT,12252.0,31
2,0,10000,13.49,C,13.0,RENT,49200.0,24
3,0,5000,10.99,A,3.0,RENT,36000.0,39
4,0,3000,10.99,E,9.0,RENT,48000.0,24


In [None]:
# Splitting the dataset
X = loan_data.drop('default', axis=1)
y = loan_data['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance
class_weights = y_train.value_counts(normalize=True).to_dict()

In [None]:
# Data Preprocessing steps
# Selecting categorical and numerical columns
numerical_features = ['amount', 'interest', 'years', 'income', 'age']

# Creating transformers for numerical and categorical data
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['grade', 'ownership']
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
#Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the model
#log_reg = LogisticRegression()
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, class_weight=class_weights))
])

param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']  # These solvers work well with l1 and l2 penalties
}

# Set up GridSearchCV
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and score
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'classifier__C': 0.001, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best Score: 0.837701103463145


In [None]:
#Training Model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Create the Logistic Regression model with the best parameters
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=0.001,penalty='l1',solver='liblinear', random_state=42, class_weight=class_weights))
])

# Step 2: Train the model
final_model.fit(X_train, y_train)

# Step 3: Evaluate the model
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, final_model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

# Print the performance metrics
print("Classification Report:\n", report)

#print(f'\nMetrics for {name}:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')

Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      5160
           1       0.00      0.00      0.00       659

    accuracy                           0.89      5819
   macro avg       0.44      0.50      0.47      5819
weighted avg       0.79      0.89      0.83      5819

Accuracy: 0.8868
Precision: 0.7863
Recall: 0.8868
F1-score: 0.8335


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Predicting Using the Model
new_data_dict = {
    'amount': [600000],
    'interest': [17.71],
    'grade': ['D'],
    'years': [1],
    'ownership': ['RENT'],
    'income': [6000],
    'age': [31]
}

# Convert the dictionary to a DataFrame
new_data = pd.DataFrame.from_dict(new_data_dict)

# Use your trained pipeline to make predictions for the new data instance
# The pipeline will automatically apply the necessary preprocessing steps
new_prediction = final_model.predict(new_data)

# Output the prediction
print("The predicted class for the new data is:", new_prediction[0])

# Interpret the prediction and print a message
if new_prediction[0] == 1:
    print("The Customer will Default.")
else:
    print("The Customer will not Default.")

The predicted class for the new data is: 0
The Customer will not Default.


In [None]:
#Saving the Model to pkl file
import pickle
import joblib

# Assuming 'final_model' is your trained model
filename = '/content/drive/MyDrive/Colab Notebooks/GenAI/final_logistic_regression_model.pkl'

# Save the model to a file
with open('model.pkl', 'wb') as file:
    pickle.dump(final_model, file)