# **Credit Scoring Model**

In [31]:
import pandas as pd

df = pd.read_csv('bank.csv')
print(df.head())

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


## Data Preparation

### Preprocessing (One-hot encoding)

In [32]:
dummy_df = pd.get_dummies(df, columns=['job', 'marital', 'education', 'contact', 'month', 'poutcome'])


In [33]:
dummy_df['default'] = dummy_df['default'].map({'no':0,'yes':1})
dummy_df['housing'] = dummy_df['housing'].map({'no':0,'yes':1})
dummy_df['loan'] = dummy_df['loan'].map({'no':0,'yes':1})
dummy_df['y'] = dummy_df['y'].map({'no':0,'yes':1})

### Data Separation into x and y

In [34]:
y_temp = df['y']
y = dummy_df['y']
y

0       0
1       0
2       0
3       0
4       0
       ..
4516    0
4517    0
4518    0
4519    0
4520    0
Name: y, Length: 4521, dtype: int64

In [35]:
x_temp = df.drop('y',axis=1)
x = dummy_df.drop('y',axis=1)
x

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,1787,0,0,19,79,1,-1,0,...,False,False,False,False,True,False,False,False,False,True
1,33,0,4789,1,1,11,220,1,339,4,...,False,False,True,False,False,False,True,False,False,False
2,35,0,1350,1,0,16,185,1,330,1,...,False,False,False,False,False,False,True,False,False,False
3,30,0,1476,1,1,3,199,4,-1,0,...,True,False,False,False,False,False,False,False,False,True
4,59,0,0,1,0,5,226,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,0,-333,1,0,30,329,5,-1,0,...,False,False,False,False,False,False,False,False,False,True
4517,57,1,-3313,1,1,9,153,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
4518,57,0,295,0,0,19,151,11,-1,0,...,False,False,False,False,False,False,False,False,False,True
4519,28,0,1137,0,0,6,129,4,211,3,...,False,False,False,False,False,False,False,True,False,False


### Split into 80% train and 20% test data

In [36]:
from sklearn.model_selection import train_test_split
x_train_temp, x_test_temp, y_train_temp, y_test_temp = train_test_split(x_temp,y_temp,test_size=0.2,random_state=100)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=100)

"""     random_state : int, RandomState instance or None, default=None
        Controls the shuffling applied to the data before applying the split.
        Pass an int for reproducible output across multiple function calls.
        
        Use a new random number generator seeded by the given integer. 
        Using an int will produce the same results across different calls. 
        However, it may be worthwhile checking that your results are stable 
        across a number of different distinct random seeds. Popular integer 
        random seeds are 0 and 42. Integer values must be in the range [0, 2**32 - 1].
        The seed is a starting point for a sequence of pseudorandom numbers. 
        If you start from the same seed, you get the very same sequence.
"""

'     random_state : int, RandomState instance or None, default=None\n        Controls the shuffling applied to the data before applying the split.\n        Pass an int for reproducible output across multiple function calls.\n        \n        Use a new random number generator seeded by the given integer. \n        Using an int will produce the same results across different calls. \n        However, it may be worthwhile checking that your results are stable \n        across a number of different distinct random seeds. Popular integer \n        random seeds are 0 and 42. Integer values must be in the range [0, 2**32 - 1].\n        The seed is a starting point for a sequence of pseudorandom numbers. \n        If you start from the same seed, you get the very same sequence.\n'

## Building the Model

### Training the Model

In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train,y_train)

In [38]:
len(x_test)

905

In [39]:
x_test.reset_index(drop=True, inplace=True)
x_test_temp.reset_index(drop=True, inplace=True)

In [40]:
y_rf_pred = rf.predict(x_test)

In [41]:
y_rf_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [42]:
prediction = pd.DataFrame(y_rf_pred)
prediction

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0
...,...
900,0
901,0
902,0
903,0


In [43]:
import pandas as pd

# Create a sample DataFrame
prediction = pd.DataFrame({'column_name': [1, 2, 3, 4, 5]})

# Now you can assign a column name
prediction.columns = ['Loan Approved']

print(prediction)

   Loan Approved
0              1
1              2
2              3
3              4
4              5


In [44]:
prediction['Loan Approved'] = prediction['Loan Approved'].map({0:'no',1:'yes'})
prediction

Unnamed: 0,Loan Approved
0,yes
1,
2,
3,
4,


In [45]:
test_table_prediction = pd.concat([x_test_temp, prediction],axis=1).reset_index(drop=True)
test_table_prediction

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Loan Approved
0,52,retired,married,secondary,no,424,no,no,cellular,19,nov,143,1,154,3,failure,yes
1,40,management,single,unknown,no,838,yes,no,unknown,12,may,619,3,-1,0,unknown,
2,32,technician,single,tertiary,no,5514,no,no,cellular,22,apr,319,1,182,1,success,
3,46,admin.,married,secondary,no,556,yes,yes,unknown,15,may,646,3,-1,0,unknown,
4,42,entrepreneur,married,tertiary,no,0,no,no,cellular,9,jul,236,1,-1,0,unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,50,blue-collar,married,primary,no,8139,yes,no,cellular,18,aug,256,6,-1,0,unknown,
901,38,self-employed,single,secondary,no,1146,yes,no,unknown,21,may,91,2,-1,0,unknown,
902,50,housemaid,married,primary,no,395,yes,no,unknown,15,may,419,2,-1,0,unknown,
903,42,housemaid,married,primary,no,83,no,yes,cellular,7,aug,184,1,-1,0,unknown,


## Checking accuracy and precision

In [46]:
from sklearn.metrics import accuracy_score,precision_score

In [47]:
accuracy = accuracy_score(y_test, y_rf_pred)
precision = precision_score(y_test, y_rf_pred)
print("Accuracy:", accuracy)
print("Precision:",precision)

Accuracy: 0.9082872928176795
Precision: 0.7142857142857143
