In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_excel("bank.xlsx")

In [4]:
df["default"] = np.where(df["default"] == "yes", 1, 0)
df["housing"] = np.where(df["housing"] == "yes", 1,0)
df["loan"] = np.where(df["loan"] == "yes", 1,0)
df["deposit"] = np.where(df["deposit"] == "yes", 1,0)

In [5]:
df["job"].value_counts()

job
management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: count, dtype: int64

In [6]:
df["job"] = df["job"].replace(["management","admin.","entrepreneur"],"white-collar")
df["job"] = df["job"].replace(["technician","services"],"blue-collar")
df["job"] = df["job"].replace(["retired","self-employed"],"self-depend")
df["job"] = df["job"].replace(["student"],"unemployed")
df["job"].value_counts()

job
blue-collar     4690
white-collar    4228
self-depend     1183
unemployed       717
housemaid        274
unknown           70
Name: count, dtype: int64

In [7]:
df["poutcome"].value_counts()

poutcome
unknown    8326
failure    1228
success    1071
other       537
Name: count, dtype: int64

In [8]:
df["poutcome"] = df["poutcome"].replace(["unknown"],"other")
df["poutcome"].value_counts()

poutcome
other      8863
failure    1228
success    1071
Name: count, dtype: int64

In [9]:
df.drop(columns = "month",inplace = True)
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,deposit
0,59,white-collar,married,secondary,0,2343,1,0,unknown,5,1042,1,-1,0,other,1
1,56,white-collar,married,secondary,0,45,0,0,unknown,5,1467,1,-1,0,other,1
2,41,blue-collar,married,secondary,0,1270,1,0,unknown,5,1389,1,-1,0,other,1
3,55,blue-collar,married,secondary,0,2476,1,0,unknown,5,579,1,-1,0,other,1
4,54,white-collar,married,tertiary,0,184,0,0,unknown,5,673,2,-1,0,other,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,0,1,1,0,cellular,20,257,1,-1,0,other,0
11158,39,blue-collar,married,secondary,0,733,0,0,unknown,16,83,4,-1,0,other,0
11159,32,blue-collar,single,secondary,0,29,0,0,cellular,19,156,2,-1,0,other,0
11160,43,blue-collar,married,secondary,0,0,0,1,cellular,8,9,2,172,5,failure,0


In [10]:
def IQR(x):
  q1 = x.quantile(0.25)
  q3 = x.quantile(0.75)
  iqr = q3 - q1
  lf = q1 - (1.5*iqr)
  uf = q3 + (1.5*iqr)
  print("LF",lf)
  print("UF",uf)

In [11]:
IQR(df["age"])

LF 6.5
UF 74.5


In [12]:
IQR(df["pdays"])

LF -33.625
UF 53.375


In [13]:
IQR(df["previous"])

LF -1.5
UF 2.5


In [14]:
IQR(df["campaign"])

LF -2.0
UF 6.0


In [15]:
IQR(df["duration"])

LF -399.0
UF 1033.0


In [16]:
IQR(df["balance"])

LF -2257.0
UF 4087.0


In [17]:
df["balance"] = np.where(df["balance"]>4087.0,4087.0,df["balance"])
df["balance"] = np.where(df["balance"]<-2257.0,-2257.0,df["balance"])
df["age"] = np.where(df["age"]>74,74,df["age"])
df["duration"] = np.where(df["duration"]>1033,1033,df["duration"])
df["campaign"] = np.where(df["campaign"]>6,6,df["campaign"])
df["pdays"] = np.where(df["pdays"]>53,53,df["pdays"])
df["previous"] = np.where(df["previous"]>2,2,df["previous"])

In [18]:
df.select_dtypes(object).columns

Index(['job', 'marital', 'education', 'contact', 'poutcome'], dtype='object')

In [19]:
bank = pd.get_dummies(df,columns = ['job', 'marital', 'education', 'contact', 'poutcome'], drop_first=True)

In [20]:
X = bank.drop(columns = ["deposit"])
y = bank["deposit"]

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
print("train_accu: ",dt.score(X_train,y_train))
print("test_accu: ",dt.score(X_test,y_test))

train_accu:  1.0
test_accu:  0.7465293327362292


In [22]:
from sklearn.ensemble import RandomForestClassifier    
rm = RandomForestClassifier(random_state = 77)
rm.fit(X_train,y_train)
print("train" , rm.score(X_train,y_train))
print("test" , rm.score(X_test,y_test))

train 1.0
test 0.8047469771607703


In [23]:
rm = RandomForestClassifier(n_estimators = 2000, random_state = 77)
rm.fit(X_train,y_train)
print("train" , rm.score(X_train,y_train))
print("test" , rm.score(X_test,y_test))

train 1.0
test 0.80653828929691


In [29]:
ne = [150,200,250,300,350]
cri = ["gini","entropy"]
md = [10,20,30,40,50]
ms = [40,50,60,70,80,90,100]
for n in ne:
    for c in cri:
        for d in md:
            for s in ms:
                rf = RandomForestClassifier(n_estimators = n,
                                            criterion = c,
                                            max_depth = d,
                                            min_samples_split = s,
                                            random_state = 55)
                rf.fit(X_train,y_train)
                print("estimator -->  {},  cri --> {}, max_depth --> {}, min_sample --> {}" .format(n,c,d,s))
                print("Train" , rf.score(X_train,y_train))
                print("Test" , rf.score(X_test,y_test))
                print("**********************************************************************************************")

estimator -->  150,  cri --> gini, max_depth --> 10, min_sample --> 40
Train 0.8491432411244261
Test 0.8078817733990148
**********************************************************************************************
estimator -->  150,  cri --> gini, max_depth --> 10, min_sample --> 50
Train 0.8472393325120394
Test 0.8114643976712942
**********************************************************************************************
estimator -->  150,  cri --> gini, max_depth --> 10, min_sample --> 60
Train 0.8476873110090716
Test 0.8096730855351545
**********************************************************************************************
estimator -->  150,  cri --> gini, max_depth --> 10, min_sample --> 70
Train 0.8477993056333296
Test 0.8110165696372593
**********************************************************************************************
estimator -->  150,  cri --> gini, max_depth --> 10, min_sample --> 80
Train 0.8448874454026207
Test 0.8101209135691895
********************

In [30]:
from sklearn.model_selection import GridSearchCV

In [32]:
para = {"n_estimators": [100,200,250,300,350,400,450,500],
        "criterion" : ["gini","entropy"],
        "max_depth": [10,20,30,40,50,60],
        "min_samples_split": [40,50,60,70,80,90,100,110,120]}
rft = RandomForestClassifier(random_state = 77)jninjnjjn
grid  = GridSearchCV(rft,param_grid = para, cv = 5, verbose = 1, n_jobs = -1 )
grid.fit(X_train,y_train)

SyntaxError: invalid syntax (1303891931.py, line 5)

In [42]:
rfc = RandomForestClassifier(criterion = "entropy",
                            max_depth = 20,
                            min_samples_split = 50,
                            min_samples_leaf = 1,
                            random_state = 77)

In [43]:
rfc.fit(X_train,y_train)
print("Train_accu ", dt.score(X_train,y_train))
print("Test_accu ", dt.score(X_test,y_test))

Train_accu  1.0
Test_accu  0.7366771159874608


In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_train, dt.predict(X_train)))