In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [2]:
# Load the dataset that the rest of the notebook selects features and the target from.
data = pd.read_csv('Data_2.csv')

In [3]:
# Load the table of important variables; its 'Feature' column holds the column names.
imp_features = pd.read_csv('Imp_Variables.csv')

In [4]:
# Display the distinct feature names recorded in the 'Feature' column.
pd.unique(imp_features['Feature'])

array(['Loan_by_Assests', 'Loan_by_Income', 'Loan_by_Total_income',
       'Loan_Per_Year_by_income_annum', 'Loan_by_Assets_Total_income',
       'Loan_by_RealEstateAssets_BankAssets_Total_income',
       'Luxury_assets_by_Total_assets', 'Liquidity_Ratio',
       'Loan_by_RealEstateAssets_Total_income',
       'Loan_by_Income_perDependent', 'commercial_assets_value',
       'residential_assets_value',
       'Loan_by_TotalIncomeAsset_per_dependent', 'Real_Estate_Assests',
       'Loan_Per_Year_by_Income_perDependent', 'Total_Assets',
       'luxury_assets_value', 'Total_Income_Loan_Duration',
       'Income_perDependent', 'bank_asset_value', 'Loan_Per_Year',
       'loan_amount', 'income_annum'], dtype=object)

In [5]:
# Columns fed to the model: the 23 names displayed from imp_features['Feature']
# (same order), followed by one manually added column.
features = [
    'Loan_by_Assests',
    'Loan_by_Income',
    'Loan_by_Total_income',
    'Loan_Per_Year_by_income_annum',
    'Loan_by_Assets_Total_income',
    'Loan_by_RealEstateAssets_BankAssets_Total_income',
    'Luxury_assets_by_Total_assets',
    'Liquidity_Ratio',
    'Loan_by_RealEstateAssets_Total_income',
    'Loan_by_Income_perDependent',
    'commercial_assets_value',
    'residential_assets_value',
    'Loan_by_TotalIncomeAsset_per_dependent',
    'Real_Estate_Assests',
    'Loan_Per_Year_by_Income_perDependent',
    'Total_Assets',
    'luxury_assets_value',
    'Total_Income_Loan_Duration',
    'Income_perDependent',
    'bank_asset_value',
    'Loan_Per_Year',
    'loan_amount',
    'income_annum',
    'cibil_score',  # added on top of the imported feature list
]

In [6]:
# Tune, fit and evaluate a random-forest classifier for loan_status.
X = data[features]  # model inputs
y = data['loan_status']  # target variable

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidates, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# n_jobs=-1 spreads the 81 * 5 fits across all cores; the forest's fixed
# random_state means parallelism does not change which candidate wins.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# With the default refit=True the search has already refitted the winning
# configuration on all of X_train, so reuse it instead of training it again.
best_rf_classifier = grid_search.best_estimator_

# Per-fold accuracy of the winning candidate, read back from the search
# results rather than repeating the same 5-fold cross-validation.
cv_scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
    for fold in range(grid_search.n_splits_)
])

y_pred = best_rf_classifier.predict(X_test)

# Held-out metrics. precision/recall use scikit-learn's default binary setup
# (positive label 1) -- assumes loan_status is encoded as 0/1; TODO confirm.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", best_params)
print("Cross-Validation Mean Accuracy:", np.mean(cv_scores))
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Cross-Validation Mean Accuracy: 0.9967789165446559
Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [22]:
######### PER-SLICE EVALUATION: score the tuned model separately for each cibil_range value (0-4)

In [41]:
# Slice evaluation for cibil_range == 0, restricted to held-out rows.
# The model was fitted on X_train, so scoring it on every row of `data` would
# mostly measure memorised training rows. Keeping only rows whose index is in
# X_test gives an honest per-slice estimate.
is_held_out = data.index.isin(X_test.index)
data_temp = data[is_held_out & (data.cibil_range == 0)]
assert not data_temp.empty, "no held-out rows with cibil_range == 0"

X = data_temp[features]
y = data_temp['loan_status']

y_pred = best_rf_classifier.predict(X)

precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)


print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [42]:
# Slice evaluation for cibil_range == 1, restricted to held-out rows.
# The model was fitted on X_train, so scoring it on every row of `data` would
# mostly measure memorised training rows. Keeping only rows whose index is in
# X_test gives an honest per-slice estimate.
is_held_out = data.index.isin(X_test.index)
data_temp = data[is_held_out & (data.cibil_range == 1)]
assert not data_temp.empty, "no held-out rows with cibil_range == 1"

X = data_temp[features]
y = data_temp['loan_status']

y_pred = best_rf_classifier.predict(X)

precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)


print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [43]:
# Slice evaluation for cibil_range == 2, restricted to held-out rows.
# The model was fitted on X_train, so scoring it on every row of `data` would
# mostly measure memorised training rows. Keeping only rows whose index is in
# X_test gives an honest per-slice estimate.
is_held_out = data.index.isin(X_test.index)
data_temp = data[is_held_out & (data.cibil_range == 2)]
assert not data_temp.empty, "no held-out rows with cibil_range == 2"

X = data_temp[features]
y = data_temp['loan_status']

y_pred = best_rf_classifier.predict(X)

precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)


print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [44]:
# Slice evaluation for cibil_range == 3, restricted to held-out rows.
# The model was fitted on X_train, so scoring it on every row of `data` would
# mostly measure memorised training rows. Keeping only rows whose index is in
# X_test gives an honest per-slice estimate.
is_held_out = data.index.isin(X_test.index)
data_temp = data[is_held_out & (data.cibil_range == 3)]
assert not data_temp.empty, "no held-out rows with cibil_range == 3"

X = data_temp[features]
y = data_temp['loan_status']

y_pred = best_rf_classifier.predict(X)

precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)


print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [45]:
# Slice evaluation for cibil_range == 4, restricted to held-out rows.
# The model was fitted on X_train, so scoring it on every row of `data` would
# mostly measure memorised training rows. Keeping only rows whose index is in
# X_test gives an honest per-slice estimate.
is_held_out = data.index.isin(X_test.index)
data_temp = data[is_held_out & (data.cibil_range == 4)]
assert not data_temp.empty, "no held-out rows with cibil_range == 4"

X = data_temp[features]
y = data_temp['loan_status']

y_pred = best_rf_classifier.predict(X)

precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)


print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 1.0
Recall: 1.0
Accuracy: 1.0


In [51]:
# # Select a tree from the forest
# tree_index = 0  # Change this to select a different tree
# selected_tree = best_rf_classifier.estimators_[tree_index]

# # Visualize the selected tree
# plt.figure(figsize=(12, 8))
# tree.plot_tree(selected_tree,  filled=True)
# plt.show()

In [52]:
# import matplotlib.pyplot as plt
# from sklearn import tree


In [53]:
import pickle


In [54]:
# Destination for the serialized classifier (relative to the working directory).
model_filepath = 'best_model.pkl'

# Serialize the fitted model with pickle; the context manager closes the file
# even if dumping fails. NOTE: only unpickle this file from a trusted source.
with open(model_filepath, mode='wb') as model_file:
    pickle.dump(best_rf_classifier, model_file)
