In [117]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [118]:
df = pd.read_csv('Finance_and_Banking.csv')
df.head()

Unnamed: 0,Customer_ID,Age,Income,Credit_Score,Loan_Amount,Default
0,CUST00001,56.0,135186.0,321.0,6433.0,0.0
1,,69.0,64674.0,773.0,,0.0
2,,46.0,65854.0,,40864.0,0.0
3,CUST00004,32.0,76271.0,,19710.0,0.0
4,CUST00005,,103688.0,,,0.0


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Customer_ID   900 non-null    object 
 1   Age           900 non-null    float64
 2   Income        900 non-null    float64
 3   Credit_Score  900 non-null    float64
 4   Loan_Amount   900 non-null    float64
 5   Default       900 non-null    float64
dtypes: float64(5), object(1)
memory usage: 47.0+ KB


In [120]:
df.drop('Customer_ID', axis=1, inplace=True)

In [121]:
df.isnull().sum()

Unnamed: 0,0
Age,100
Income,100
Credit_Score,100
Loan_Amount,100
Default,100


In [122]:
mean = df['Age'].mean()
median = df['Age'].median()

print(mean)
print(median)

df['Age'].fillna(mean, inplace=True)

43.785555555555554
44.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean, inplace=True)


In [123]:
mean = df['Income'].mean()
median = df['Income'].median()

print(mean)
print(median)

df['Income'].fillna(mean, inplace=True)

88853.26444444444
88331.5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Income'].fillna(mean, inplace=True)


In [124]:
mean = df['Credit_Score'].mean()
median = df['Credit_Score'].median()

print(mean)
print(median)

df['Credit_Score'].fillna(mean, inplace=True)

577.1977777777778
578.5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_Score'].fillna(mean, inplace=True)


In [125]:
mean = df['Loan_Amount'].mean()
median = df['Loan_Amount'].median()

print(mean)
print(median)

df['Loan_Amount'].fillna(mean, inplace=True)

24936.64888888889
24022.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Amount'].fillna(mean, inplace=True)


In [126]:
df['Default'].value_counts()

Unnamed: 0_level_0,count
Default,Unnamed: 1_level_1
0.0,778
1.0,122


In [127]:
# # filling missing values
# # Default (0.0)

# df['Default'].fillna(0, inplace=True)

In [128]:
# # filling missing values
# # Default (1)

df['Default'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Default'].fillna(1, inplace=True)


In [129]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [130]:
df['Income'] = ss.fit_transform(df[['Income']])
df['Loan_Amount'] = ss.fit_transform(df[['Loan_Amount']])
df

Unnamed: 0,Age,Income,Credit_Score,Loan_Amount,Default
0,56.000000,1.403084,321.000000,-1.386561,0.0
1,69.000000,-0.732215,773.000000,0.000000,0.0
2,46.000000,-0.696482,577.197778,1.193508,0.0
3,32.000000,-0.381026,577.197778,-0.391656,0.0
4,43.785556,0.449237,577.197778,0.000000,0.0
...,...,...,...,...,...
995,60.000000,-1.555392,716.000000,-1.182590,0.0
996,43.785556,1.634596,577.197778,-1.525190,0.0
997,62.000000,-0.262287,340.000000,-0.631072,0.0
998,35.000000,0.652344,606.000000,0.962185,0.0


In [131]:
corr = df.corrwith(df['Default'])
corr

Unnamed: 0,0
Age,0.000391
Income,-0.008836
Credit_Score,0.015789
Loan_Amount,-0.083949
Default,1.0


In [132]:
df['Default'] = df['Default'].astype(int)

In [133]:
df['Default'].value_counts()

Unnamed: 0_level_0,count
Default,Unnamed: 1_level_1
0,778
1,222


In [134]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)

In [135]:
X = df.drop('Default', axis=1)
y = df['Default']

X_resampled, y_resampled = smote.fit_resample(X, y)

In [136]:
print(X_resampled.shape)
print(y_resampled.shape)

(1556, 4)
(1556,)


In [137]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [138]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [139]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,

In [140]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5352564102564102
[[87 71]
 [74 80]]
              precision    recall  f1-score   support

           0       0.54      0.55      0.55       158
           1       0.53      0.52      0.52       154

    accuracy                           0.54       312
   macro avg       0.54      0.54      0.54       312
weighted avg       0.54      0.54      0.54       312



In [141]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

In [142]:
y_tpred = tree_model.predict(X_test)
y_tpred

array([0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,

In [143]:
print(accuracy_score(y_test, y_tpred))
print(confusion_matrix(y_test, y_tpred))
print(classification_report(y_test, y_tpred))

0.657051282051282
[[110  48]
 [ 59  95]]
              precision    recall  f1-score   support

           0       0.65      0.70      0.67       158
           1       0.66      0.62      0.64       154

    accuracy                           0.66       312
   macro avg       0.66      0.66      0.66       312
weighted avg       0.66      0.66      0.66       312

