In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
#Different models initialised
log_clf_1 = LogisticRegression(random_state=0)
log_clf_2 = LogisticRegression(random_state=42)
decision_clf1 = DecisionTreeClassifier(criterion = 'entropy',random_state=0)
decision_clf2 = DecisionTreeClassifier(criterion = 'entropy', random_state=42)

In [4]:
#Creation of list of models
Model_List=[('Logistic Regression 1', log_clf_1),
            ('Logistic Regression 2', log_clf_2),
            ('Decision Tree 1', decision_clf1),
            ('Decision Tree 2', decision_clf2)]

In [5]:
path = r"Data\bank_data.csv"

In [6]:
data = pd.read_csv(path)
data.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1
5,42,4,2,2,0,0,1,1,2,5,8,562,2,-1,0,3,1
6,56,4,1,2,0,830,1,1,2,6,8,1201,1,-1,0,3,1
7,60,5,0,1,0,545,1,0,2,6,8,1030,1,-1,0,3,1
8,37,9,1,1,0,1,1,0,2,6,8,608,1,-1,0,3,1
9,28,7,2,1,0,5090,1,0,2,6,8,1297,3,-1,0,3,1


In [7]:
X = data.drop(columns="deposit")
y = data['deposit']

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=0, test_size=0.3)

In [9]:
voting_clf_hard = VotingClassifier(Model_List, voting='hard')
voting_clf_hard.fit(X_train, y_train)
hard_voting_score = voting_clf_hard.score(X_test, y_test)

In [10]:
voting_clf_soft = VotingClassifier(Model_List, voting='soft')
voting_clf_soft.fit(X_train, y_train)
soft_voting_score = voting_clf_soft.score(X_test, y_test)

In [11]:
print("Hard-voting score : ", hard_voting_score)
print("Soft-voting score : ", soft_voting_score)

Hard-voting score :  0.7709764108689161
Soft-voting score :  0.787996416840848


In [12]:
from sklearn.ensemble import BaggingClassifier

In [16]:
bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=0, max_samples=100)
bagging_clf.fit(X_train, y_train)
score_bagging = bagging_clf.score(X_test, y_test)
print("Bagging score : ", score_bagging)

Bagging score :  0.8139743206927441


In [20]:
pasting_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=0, max_samples=100, bootstrap=False)
pasting_clf.fit(X_train, y_train)
score_pasting = pasting_clf.score(X_test, y_test)
print("Pasting score : ", score_pasting)

Pasting score :  0.8112869513287548


In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rf_clf = RandomForestClassifier(n_estimators=100,n_jobs=100,random_state=0, min_samples_leaf=100)
rf_clf.fit(X_train, y_train)
score_rf = rf_clf.score(X_test, y_test)
print("Random Forest score : ", score_rf)

Random Forest score :  0.8220364287847118


In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
parameter_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [32]:
clf = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(clf, parameter_grid)
grid_search.fit(X_train, y_train)
score_gs = grid_search.score(X_test, y_test)
print(score_gs)

0.8369662585846521


In [33]:
from sklearn.model_selection import RandomizedSearchCV

In [34]:
clf = RandomForestClassifier(random_state=0)
random_search = RandomizedSearchCV(clf, parameter_grid, n_iter=20, random_state=0)
random_search.fit(X_train, y_train)
score_rs = random_search.score(X_test, y_test)
print(score_rs)

0.8384592415646461


In [36]:
from mlxtend.classifier import StackingClassifier

In [37]:
classifier1 = DecisionTreeClassifier(random_state=0)
classifier2= DecisionTreeClassifier(random_state=1)
classifier3 = DecisionTreeClassifier(random_state=2)
classifier4= DecisionTreeClassifier(random_state=3)
classifier_list=[classifier1,classifier2,classifier3,classifier4]

m_classifier=LogisticRegression(random_state=0)

In [42]:
sclf = StackingClassifier(classifier_list, m_classifier)
sclf.fit(X_train, y_train)
s_score = sclf.score(X_test, y_test)
print(s_score)

0.7751567632128994


# Assignment

In [44]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [43]:
path = r"Data\Mars Carter.csv"

In [45]:
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,attr0,attr1,attr2,attr3,attr4,attr5,attr6,attr7,attr8,...,attr1080,attr1081,attr1082,attr1083,attr1084,attr1085,attr1086,attr1087,attr1088,attr1089
0,0,-4.049514,-5.055907,4.813832,10.975995,10.599993,8.103339,7.260105,3.984216,-3.352366,...,39.055706,39.480231,48.177327,49.460693,50.797614,50.680413,44.778675,36.101397,44.447948,0
1,1,3.514292,4.721218,-2.536391,-8.388817,-10.787064,-9.024258,-5.473323,7.646837,15.297336,...,34.934308,34.22427,42.633077,46.226847,49.730228,49.624121,45.398516,39.585452,45.971939,0
2,2,-11.31818,9.405884,29.141795,21.277405,-5.122294,-21.736671,-12.850864,3.13446,7.207608,...,86.905997,78.811334,84.165826,86.976997,116.149402,107.729029,96.534329,80.428859,114.810516,0
3,3,-7.143218,-9.869755,-7.905797,4.271652,22.89072,28.454973,12.167586,-2.024773,-2.093635,...,65.272842,63.522759,67.886176,67.388943,93.066755,91.394297,70.704254,63.252282,82.057148,0
4,4,-5.027305,5.600857,10.312207,3.855865,-1.988057,4.749132,9.700589,1.991069,-2.491197,...,80.31346,77.703464,93.575195,104.748562,129.462818,124.996294,118.110321,108.709732,139.685624,0


In [50]:
X = df.drop(columns='attr1089')
y = df.attr1089
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [51]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [53]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
roc_score = roc_auc_score(y_test,y_pred)
print(roc_score)

0.834042610697639


In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
dt = DecisionTreeClassifier(random_state=4)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
roc_score = roc_auc_score(y_test, y_pred)
print(roc_score)

0.8732834218291986


In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
rfc = RandomForestClassifier(random_state=4)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
roc_score = roc_auc_score(y_test, y_pred)
print(roc_score)

0.9072251284029547


In [58]:
from sklearn.ensemble import BaggingClassifier

In [59]:
bagging_clf = BaggingClassifier(base_estimator= DecisionTreeClassifier(), n_estimators=100 , max_samples=100, random_state=0)
bagging_clf.fit(X_train, y_train)
score_bagging = bagging_clf.score(X_test, y_test)
print(score_bagging)

0.832579185520362


In [60]:
from sklearn.ensemble import VotingClassifier

In [61]:
clf_1 = LogisticRegression()
clf_2 = DecisionTreeClassifier(random_state=4)
clf_3 = RandomForestClassifier(random_state=4)

model_list = [('lr',clf_1),('DT',clf_2),('RF',clf_3)]

In [63]:
voting_clf_hard = VotingClassifier(model_list, voting='hard')
voting_clf_hard.fit(X_train, y_train)
hard_voting_score = voting_clf_hard.score(X_test, y_test)
print(hard_voting_score)

0.9085972850678733


In [64]:
voting_clf_soft = VotingClassifier(model_list, voting='soft')
voting_clf_soft.fit(X_train, y_train)
soft_voting_score = voting_clf_soft.score(X_test, y_test)
print(soft_voting_score)

0.9063348416289593
