In [2]:
import pandas as pd
import numpy as np
import plotly
import itertools
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, KFold
from sklearn.base import clone
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
from scipy.stats import norm, skew  
import plotly.express as px

In [4]:
# Load the loan dataset from "Loan.csv" (path is relative to the working directory).
df = pd.read_csv("Loan.csv")


In [5]:
# Preview the first two rows of the raw frame.
df.head(2)

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0


In [6]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical across re-runs.
train, test = train_test_split(df, random_state=42, test_size=0.3)
print(f"length of train : {len(train)}")
print(f"length of test : {len(test)}")



length of train : 14000
length of test : 6000


In [7]:
# Pie chart comparing the number of rows in each split.
split_sizes = {"train": len(train), "test": len(test)}
labels = list(split_sizes)
values = list(split_sizes.values())
fig = px.pie(names=labels, values=values, title="Train Vs Test Split")
fig.show()

In [8]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 17218 to 15795
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             14000 non-null  object 
 1   Age                         14000 non-null  int64  
 2   AnnualIncome                14000 non-null  int64  
 3   CreditScore                 14000 non-null  int64  
 4   EmploymentStatus            14000 non-null  object 
 5   EducationLevel              14000 non-null  object 
 6   Experience                  14000 non-null  int64  
 7   LoanAmount                  14000 non-null  int64  
 8   LoanDuration                14000 non-null  int64  
 9   MaritalStatus               14000 non-null  object 
 10  NumberOfDependents          14000 non-null  int64  
 11  HomeOwnershipStatus         14000 non-null  object 
 12  MonthlyDebtPayments         14000 non-null  int64  
 13  CreditCardUtilizationRate  

In [9]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 10650 to 9165
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             6000 non-null   object 
 1   Age                         6000 non-null   int64  
 2   AnnualIncome                6000 non-null   int64  
 3   CreditScore                 6000 non-null   int64  
 4   EmploymentStatus            6000 non-null   object 
 5   EducationLevel              6000 non-null   object 
 6   Experience                  6000 non-null   int64  
 7   LoanAmount                  6000 non-null   int64  
 8   LoanDuration                6000 non-null   int64  
 9   MaritalStatus               6000 non-null   object 
 10  NumberOfDependents          6000 non-null   int64  
 11  HomeOwnershipStatus         6000 non-null   object 
 12  MonthlyDebtPayments         6000 non-null   int64  
 13  CreditCardUtilizationRate   6




In [10]:
# Frequency of each EmploymentStatus category in the training split.
employee_status = train["EmploymentStatus"].value_counts()
labels = employee_status.index
values = employee_status.values
fig = px.bar(x=labels, y=values, title="EmploymentStatus distribution (train)")
# Bare arrays give plotly no column names, so the axes would otherwise be
# labelled "x" and "y"; name them explicitly so the figure stands alone.
fig.update_layout(xaxis_title="EmploymentStatus", yaxis_title="Count")
fig.show()

In [11]:
# Frequency of each EducationLevel category in the training split.
# NOTE: the variable name `employee_status` is reused from the previous cell
# and here holds EducationLevel counts; the name is kept so notebook state
# stays the same for any later cell.
employee_status = train["EducationLevel"].value_counts()
labels = employee_status.index
values = employee_status.values
fig = px.bar(x=labels, y=values, title="EducationLevel distribution (train)")
# Bare arrays give plotly no column names, so the axes would otherwise be
# labelled "x" and "y"; name them explicitly so the figure stands alone.
fig.update_layout(xaxis_title="EducationLevel", yaxis_title="Count")
fig.show()

In [12]:
# Frequency of each MaritalStatus category in the training split.
# NOTE: the variable name `employee_status` is reused from an earlier cell
# and here holds MaritalStatus counts; the name is kept so notebook state
# stays the same for any later cell.
employee_status = train["MaritalStatus"].value_counts()
labels = employee_status.index
values = employee_status.values
fig = px.bar(x=labels, y=values, title="MaritalStatus distribution (train)")
# Bare arrays give plotly no column names, so the axes would otherwise be
# labelled "x" and "y"; name them explicitly so the figure stands alone.
fig.update_layout(xaxis_title="MaritalStatus", yaxis_title="Count")
fig.show()

In [13]:
# Frequency of each HomeOwnershipStatus category in the training split.
# NOTE: the variable name `employee_status` is reused from an earlier cell
# and here holds HomeOwnershipStatus counts; the name is kept so notebook
# state stays the same for any later cell.
employee_status = train["HomeOwnershipStatus"].value_counts()
labels = employee_status.index
values = employee_status.values
fig = px.bar(x=labels, y=values, title="HomeOwnershipStatus distribution (train)")
# Bare arrays give plotly no column names, so the axes would otherwise be
# labelled "x" and "y"; name them explicitly so the figure stands alone.
fig.update_layout(xaxis_title="HomeOwnershipStatus", yaxis_title="Count")
fig.show()

In [14]:
# Frequency of each LoanPurpose category in the training split.
# NOTE: the variable name `employee_status` is reused from an earlier cell
# and here holds LoanPurpose counts; the name is kept so notebook state
# stays the same for any later cell.
employee_status = train["LoanPurpose"].value_counts()
labels = employee_status.index
values = employee_status.values
fig = px.bar(x=labels, y=values, title="LoanPurpose distribution (train)")
# Bare arrays give plotly no column names, so the axes would otherwise be
# labelled "x" and "y"; name them explicitly so the figure stands alone.
fig.update_layout(xaxis_title="LoanPurpose", yaxis_title="Count")
fig.show()

In [15]:
# Give both splits a fresh 0..n-1 index (the shuffled original index is discarded).
train, test = (split.reset_index(drop=True) for split in (train, test))

# Separate the RiskScore target from the remaining columns of each split.
train_target, test_target = train['RiskScore'], test['RiskScore']
train_data, test_data = (split.drop(columns=['RiskScore']) for split in (train, test))

In [16]:
# Stack the train rows on top of the test rows so both are encoded together.
combined = pd.concat([train_data, test_data])
le = LabelEncoder()
for cols in combined.columns:
    if combined[cols].dtype != 'object':
        continue  # only object-dtype columns are label-encoded
    # The single encoder is re-fitted for every column, so only the mapping
    # of the last encoded column remains available on `le` afterwards.
    combined[cols] = le.fit_transform(combined[cols])


In [17]:
# Undo the stacking: the first len(train_data) rows are the train part,
# everything after them is the test part.
train_data_transformed = combined.iloc[:len(train_data)]
test_data_transformed = combined.iloc[len(train_data):]

In [18]:
# Re-attach the RiskScore target as the last column of each encoded split.
train_transformed = pd.concat(
    objs=[train_data_transformed, train_target.reset_index(drop=True)],
    axis="columns",
)
test_transformed = pd.concat(
    objs=[test_data_transformed, test_target.reset_index(drop=True)],
    axis="columns",
)

In [19]:
# Remove the ApplicationDate column from both splits.
# Re-running this cell raises KeyError because the column is already gone.
train = train.drop("ApplicationDate", axis="columns")
test = test.drop("ApplicationDate", axis="columns")

In [20]:
# Name of the regression target, and every other column of `train` as features.
target_variable = "RiskScore"
features = train.columns.drop(labels=target_variable)

In [21]:
# Pairwise correlation matrix of the training columns, shown as a heatmap.
# `train` still contains string columns (EmploymentStatus, EducationLevel, ...).
# DataFrame.corr() silently skipped those on older pandas but raises on
# pandas >= 2.0, where numeric_only defaults to False. Selecting the
# numeric/bool columns explicitly behaves the same on every version.
corr = train.select_dtypes(include=["number", "bool"]).corr()
px.imshow(corr)






In [22]:
# Collect every column pair whose absolute correlation exceeds the threshold.
# Only the lower triangle (col < row) is scanned, so each pair appears once
# and the diagonal is skipped.
threshold = 0.9
column_names = corr.columns
high_correlated_pairs = [
    (column_names[row], column_names[col])
    for row in range(len(column_names))
    for col in range(row)
    if abs(corr.iloc[row, col]) > threshold
]

high_correlated_pairs

[('Experience', 'Age'),
 ('MonthlyIncome', 'AnnualIncome'),
 ('NetWorth', 'TotalAssets')]

In [23]:
# Add one product ("interaction") column to `train` per highly correlated pair.
for feature_1, feature_2 in high_correlated_pairs:
    interaction_column_name = f"{feature_1}_{feature_2}_interaction"
    train[interaction_column_name] = train[feature_1].mul(train[feature_2])

In [24]:
# Add one product ("interaction") column to `test` per highly correlated pair.
for feature_1, feature_2 in high_correlated_pairs:
    interaction_column_name = f"{feature_1}_{feature_2}_interaction"
    test[interaction_column_name] = test[feature_1].mul(test[feature_2])

In [25]:
# Show the first row of `train` to check the newly added interaction columns.
train.head(1)

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,...,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore,Experience_Age_interaction,MonthlyIncome_AnnualIncome_interaction,NetWorth_TotalAssets_interaction
0,37,31058,538,Employed,Associate,13,17093,48,Divorced,2,...,338858,0.243093,0.19694,517.363552,0.344786,0,57.0,481,80383280.0,118883246430


In [26]:
# Repeat the pair search with a looser threshold. This overwrites both
# `threshold` and `high_correlated_pairs` from the earlier 0.9 search.
threshold = 0.8
column_names = corr.columns
high_correlated_pairs = [
    (column_names[row], column_names[col])
    for row in range(len(column_names))
    for col in range(row)
    if abs(corr.iloc[row, col]) > threshold
]

high_correlated_pairs

[('Experience', 'Age'),
 ('MonthlyIncome', 'AnnualIncome'),
 ('NetWorth', 'TotalAssets'),
 ('InterestRate', 'BaseInterestRate')]

In [27]:
# Look for groups of three mutually correlated features: two correlated pairs
# that share one feature, and whose two non-shared features are also
# correlated above `threshold`.
triplets = set()  # a set, so the same triplet is never stored twice
for pair1, pair2 in itertools.combinations(high_correlated_pairs, 2):
    common_feature = set(pair1) & set(pair2)
    if not common_feature:
        continue  # the two pairs share nothing
    feature_3 = next(iter(common_feature))
    # Pick, from each pair, the member that is not the shared feature.
    feature_1 = pair1[1] if pair1[0] in common_feature else pair1[0]
    feature_2 = pair2[1] if pair2[0] in common_feature else pair2[0]
    if not abs(corr.loc[feature_1, feature_2]) > threshold:
        continue
    # Sorting gives every triplet one canonical ordering.
    triplet = tuple(sorted((feature_1, feature_2, feature_3)))
    triplets.add(triplet)

print(list(triplets))


[]


In [28]:
# Drop the six base columns from both splits, using one shared list so the
# two calls cannot drift apart.
redundant_columns = ['Experience', 'Age', 'MonthlyIncome', 'AnnualIncome', 'NetWorth', 'TotalAssets']
train = train.drop(columns=redundant_columns)
test = test.drop(columns=redundant_columns)

In [29]:
# Feature matrix (every column except RiskScore) and target vector for training.
y = train["RiskScore"]
X = train.drop(columns="RiskScore")

In [30]:
# The whole training split is used for fitting; these are aliases, not copies.
X_train, y_train = X, y

In [31]:
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
import joblib

In [32]:
# Names of the object-dtype columns of the training features.
categorical_columns = list(X_train.select_dtypes(include="object").columns)

In [33]:
# Display the list of categorical column names.
categorical_columns

['EmploymentStatus',
 'EducationLevel',
 'MaritalStatus',
 'HomeOwnershipStatus',
 'LoanPurpose']

In [34]:
# Feature matrix (every column except RiskScore) and target vector for evaluation.
y_test = test["RiskScore"]
X_test = test.drop(columns="RiskScore")


In [47]:
# Show the first row of the test feature matrix.
X_test.head(1)

Unnamed: 0,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,CreditCardUtilizationRate,...,UtilityBillsPaymentHistory,JobTenure,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,Experience_Age_interaction,MonthlyIncome_AnnualIncome_interaction,NetWorth_TotalAssets_interaction
0,604,Employed,High School,28479,84,Single,0,Mortgage,88,0.280949,...,0.809983,7,0.251479,0.222397,671.369324,0.090436,1,592,846064900.0,74595752


In [35]:
# CatBoost regressor with fixed hyper-parameters; verbose=0 silences the
# per-iteration training log.
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    verbose=0,
)

# The string-valued columns are passed as cat_features, so they go to
# CatBoost as they are.
cat_model.fit(
    X_train,
    y_train,
    cat_features=categorical_columns,
)




<catboost.core.CatBoostRegressor at 0x19238d69ad0>

In [36]:
# Persist the fitted model to 'catboost_model.pkl' in the working directory.
joblib.dump(cat_model, 'catboost_model.pkl')

['catboost_model.pkl']

In [38]:

# Reload the model written by the previous cell and score the held-out features.
loaded_model = joblib.load(filename='catboost_model.pkl')
predictions = loaded_model.predict(X_test)

In [40]:
# Mean squared error of the reloaded model's predictions on the test split.
cat_mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(cat_mse)

0.07588318364051724


In [43]:

# Object-dtype columns of the current training frame.
categorical_columns = list(train.select_dtypes(include="object").columns)

# Distinct values per categorical column. These are read from the full `df`,
# not only from the training split.
unique_values = {}
for col in categorical_columns:
    unique_values[col] = df[col].unique()

for col, values in unique_values.items():
    print(f"Unique values in '{col}': {values}")



Unique values in 'EmploymentStatus': ['Employed' 'Self-Employed' 'Unemployed']
Unique values in 'EducationLevel': ['Master' 'Associate' 'Bachelor' 'High School' 'Doctorate']
Unique values in 'MaritalStatus': ['Married' 'Single' 'Divorced' 'Widowed']
Unique values in 'HomeOwnershipStatus': ['Own' 'Mortgage' 'Rent' 'Other']
Unique values in 'LoanPurpose': ['Home' 'Debt Consolidation' 'Education' 'Other' 'Auto']
