In [1]:
# Import the standard modules to be used in this lab
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


In [2]:
# Read heart.csv into a DataFrame and preview its first five rows.
data_pd = pd.read_csv(filepath_or_buffer='heart.csv')
data_pd.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column except the last one (`target`) to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima leak into the training features -- consider
# fitting on the training split only.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data_pd.iloc[:, :-1])
normData = scaler.transform(data_pd.iloc[:, :-1])

In [4]:
# Wrap the scaled array in a DataFrame that reuses the original feature names,
# then append the unscaled `target` column as the last column.
normData_pd = pd.DataFrame(normData, columns=data_pd.columns[:-1]).assign(
    target=data_pd['target']
)
normData_pd.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1.0,1.0,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,0.0,0.333333,1
1,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,0.0,0.666667,1
2,0.25,0.0,0.333333,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,1.0,0.0,0.666667,1
3,0.5625,1.0,0.333333,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,1.0,0.0,0.666667,1
4,0.583333,0.0,0.0,0.245283,0.520548,0.0,0.5,0.70229,1.0,0.096774,1.0,0.0,0.666667,1


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    normData_pd.drop(columns='target'),
    normData_pd['target'],
    test_size=0.3,
    random_state=80,
)

In [6]:
# Size of the held-out test set as (rows, feature columns).
X_test.shape


(91, 13)

# Voting

Let’s build a voting classifier using Decision Tree, Support Vector Machine and Logistic Regression
classifiers. The voting scheme is set to ‘soft’. That means every individual classifier provides the
probability that an example belongs to each target class, and the ensemble predicts the class with
the highest average probability.

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Base learners of the soft-voting ensemble.
dtc = DecisionTreeClassifier(max_depth=5, random_state=0)
# probability=True makes SVC expose predict_proba, which soft voting needs.
# Those probabilities come from an internally shuffled cross-validation, so the
# seed is fixed to keep the ensemble's predictions repeatable between runs.
svc = SVC(gamma='auto', probability=True, random_state=0)
lgc = LogisticRegression()
classifiers = [('c1', dtc), ('c2', svc), ('c3', lgc)]

# voting='soft' averages the predicted class probabilities of the base learners;
# voting='hard' would take a majority vote over the predicted labels instead.
ensemble = VotingClassifier(classifiers, voting='soft')
ensemble.fit(X_train, y_train)
y_pred_vot = ensemble.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the voting ensemble on the held-out set: confusion matrix first
# (rows = true class, columns = predicted class), then the per-class report.
print(confusion_matrix(y_test, y_pred_vot),
      classification_report(y_test, y_pred_vot),
      sep='\n')

[[28  6]
 [ 6 51]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        34
           1       0.89      0.89      0.89        57

    accuracy                           0.87        91
   macro avg       0.86      0.86      0.86        91
weighted avg       0.87      0.87      0.87        91



# Bagging
Now, let’s build a bagging classifier using Decision Trees as the base model. The number of classifiers is set to 500.


In [9]:
from sklearn.ensemble import BaggingClassifier

num_model = 500
# Shallow decision tree used as the base learner of the bag.
model = DecisionTreeClassifier(max_depth=4, random_state=0)
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
bag_clf = BaggingClassifier(model,
                            n_estimators=num_model,
                            bootstrap=True,  # draw training rows with replacement
                            random_state=1)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

In [10]:
# Score the bagging classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_bag),
      classification_report(y_test, y_pred_bag),
      sep='\n')

[[29  5]
 [ 6 51]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84        34
           1       0.91      0.89      0.90        57

    accuracy                           0.88        91
   macro avg       0.87      0.87      0.87        91
weighted avg       0.88      0.88      0.88        91



Here we build a Random Forest classifier in which each decision tree has a maximum depth of 5. The
number of decision trees is 500.

In [11]:
from sklearn.ensemble import RandomForestClassifier

num_model = 500
# A forest draws random bootstrap samples and random feature subsets per split;
# the seed is fixed so that re-running the notebook reproduces the same metrics.
rf_clf = RandomForestClassifier(n_estimators=num_model,
                                max_depth=5,
                                criterion='entropy',
                                random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

In [12]:
# Score the random forest on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_rf),
      classification_report(y_test, y_pred_rf),
      sep='\n')

[[30  4]
 [ 7 50]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.85        34
           1       0.93      0.88      0.90        57

    accuracy                           0.88        91
   macro avg       0.87      0.88      0.87        91
weighted avg       0.88      0.88      0.88        91



# Stacking

Here we build a stacking classifier whose base models are Decision Tree, naive Bayes and Logistic
Regression classifiers. The final model (meta-learner) is a Support Vector Machine classifier.

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

# Level-0 learners whose predictions become the inputs of the final model.
dtc = DecisionTreeClassifier(max_depth=5, random_state=0)
nbc = GaussianNB()
lgc = LogisticRegression()
# Final model (meta-learner) that combines the level-0 predictions.
svc = SVC(gamma='auto', probability=True)

classifiers = [('c1', dtc), ('c2', nbc), ('c3', lgc)]
stk_clf = StackingClassifier(estimators=classifiers, final_estimator=svc)
stk_clf.fit(X_train, y_train)
y_pred_stk = stk_clf.predict(X_test)


In [14]:
# Score the stacking classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_stk),
      classification_report(y_test, y_pred_stk),
      sep='\n')


[[26  8]
 [11 46]]
              precision    recall  f1-score   support

           0       0.70      0.76      0.73        34
           1       0.85      0.81      0.83        57

    accuracy                           0.79        91
   macro avg       0.78      0.79      0.78        91
weighted avg       0.80      0.79      0.79        91



# Boosting
Let’s build an AdaBoost classifier. The base model is a decision tree with a maximum depth of
1 (a single split, i.e. a decision stump). The number of base models is 20.

In [15]:
from sklearn.ensemble import AdaBoostClassifier

num_model = 20
# Decision stump (a tree with a single split) used as the weak learner.
model = DecisionTreeClassifier(max_depth=1, random_state=0)
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
# NOTE(review): recent scikit-learn releases also deprecate the `algorithm`
# argument -- confirm against the installed version before dropping it.
ada_clf = AdaBoostClassifier(model,
                             n_estimators=num_model,
                             algorithm='SAMME')
ada_clf.fit(X_train, y_train)
y_pred_ada = ada_clf.predict(X_test)

In [16]:
# Score the AdaBoost classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_ada),
      classification_report(y_test, y_pred_ada),
      sep='\n')

[[30  4]
 [ 8 49]]
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        34
           1       0.92      0.86      0.89        57

    accuracy                           0.87        91
   macro avg       0.86      0.87      0.86        91
weighted avg       0.87      0.87      0.87        91



In [17]:
from sklearn.ensemble import GradientBoostingClassifier

num_model = 60
# Gradient boosting over 60 decision stumps (max_depth=1).
# The loss is left at its default: the explicit name 'deviance' was renamed to
# 'log_loss' in scikit-learn 1.1 and removed in 1.3, while the default selects
# that same loss on every version.
gd_model = GradientBoostingClassifier(n_estimators=num_model, max_depth=1)
gd_model.fit(X_train, y_train)
y_pred_gd = gd_model.predict(X_test)

In [18]:
# Score the gradient-boosting model on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_gd),
      classification_report(y_test, y_pred_gd),
      sep='\n')


[[30  4]
 [ 5 52]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87        34
           1       0.93      0.91      0.92        57

    accuracy                           0.90        91
   macro avg       0.89      0.90      0.89        91
weighted avg       0.90      0.90      0.90        91



In [19]:
# Load auto_mpg.csv; fields in the file are separated by runs of whitespace.
auto_pd = pd.read_csv('auto_mpg.csv', sep=r"\s+")

In [20]:
# Preview the first five rows of the table.
auto_pd.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [21]:
# Count the missing values in each column (the result shows none in this dataset).
auto_pd.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column except the last one (`car_name`) to the [0, 1] range.
# This also rescales `origin`, whose raw values are restored in the next step.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima leak into the training features -- consider
# fitting on the training split only.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(auto_pd.iloc[:, :-1])
normData = scaler.transform(auto_pd.iloc[:, :-1])

In [23]:
# Wrap the scaled array in a DataFrame that reuses the original column names,
# then overwrite the scaled `origin` column with its raw labels so it can be
# used as the class target.
normData_pd = pd.DataFrame(normData, columns=auto_pd.columns[:-1]).assign(
    origin=auto_pd['origin']
)
normData_pd.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,0.239362,1.0,0.617571,0.456522,0.53615,0.238095,0.0,1
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,1
2,0.239362,1.0,0.645995,0.565217,0.51687,0.178571,0.0,1
3,0.18617,1.0,0.609819,0.565217,0.516019,0.238095,0.0,1
4,0.212766,1.0,0.604651,0.51087,0.520556,0.14881,0.0,1


In [24]:
from sklearn.model_selection import train_test_split

# Predict `origin` from the remaining columns; hold out 30% of the rows for
# testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    normData_pd.drop(columns='origin'),
    normData_pd['origin'],
    test_size=0.3,
    random_state=5,
)

In [25]:
# Size of the held-out test set as (rows, feature columns).
X_test.shape


(120, 7)

# Voting

Let’s build a voting classifier using Decision Tree, Support Vector Machine and Logistic Regression
classifiers. The voting scheme is set to ‘soft’. That means every individual classifier provides the
probability that an example belongs to each target class, and the ensemble predicts the class with
the highest average probability.

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Base learners of the soft-voting ensemble.
dtc = DecisionTreeClassifier(max_depth=5, random_state=0)
# probability=True makes SVC expose predict_proba, which soft voting needs.
# Those probabilities come from an internally shuffled cross-validation, so the
# seed is fixed to keep the ensemble's predictions repeatable between runs.
svc = SVC(gamma='auto', probability=True, random_state=0)
lgc = LogisticRegression()
classifiers = [('c1', dtc), ('c2', svc), ('c3', lgc)]

# voting='soft' averages the predicted class probabilities of the base learners;
# voting='hard' would take a majority vote over the predicted labels instead.
ensemble = VotingClassifier(classifiers, voting='soft')
ensemble.fit(X_train, y_train)
y_pred_vot = ensemble.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the voting ensemble on the held-out set: confusion matrix first
# (rows = true class, columns = predicted class), then the per-class report.
print(confusion_matrix(y_test, y_pred_vot),
      classification_report(y_test, y_pred_vot),
      sep='\n')

[[65  2  6]
 [ 5 10 11]
 [ 3  2 16]]
              precision    recall  f1-score   support

           1       0.89      0.89      0.89        73
           2       0.71      0.38      0.50        26
           3       0.48      0.76      0.59        21

    accuracy                           0.76       120
   macro avg       0.70      0.68      0.66       120
weighted avg       0.78      0.76      0.75       120



# Bagging
Now, let’s build a bagging classifier using Decision Trees as the base model. The number of classifiers is set to 500.


In [28]:
from sklearn.ensemble import BaggingClassifier

num_model = 500
# Shallow decision tree used as the base learner of the bag.
model = DecisionTreeClassifier(max_depth=4, random_state=0)
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
bag_clf = BaggingClassifier(model,
                            n_estimators=num_model,
                            bootstrap=True,  # draw training rows with replacement
                            random_state=1)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

In [29]:
# Score the bagging classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_bag),
      classification_report(y_test, y_pred_bag),
      sep='\n')

[[70  1  2]
 [ 5  4 17]
 [ 3  1 17]]
              precision    recall  f1-score   support

           1       0.90      0.96      0.93        73
           2       0.67      0.15      0.25        26
           3       0.47      0.81      0.60        21

    accuracy                           0.76       120
   macro avg       0.68      0.64      0.59       120
weighted avg       0.77      0.76      0.72       120



Here we build a Random Forest classifier in which each decision tree has a maximum depth of 5. The
number of decision trees is 500.

In [31]:
from sklearn.ensemble import RandomForestClassifier

num_model = 500
# A forest draws random bootstrap samples and random feature subsets per split;
# the seed is fixed so that re-running the notebook reproduces the same metrics.
rf_clf = RandomForestClassifier(n_estimators=num_model,
                                max_depth=5,
                                criterion='entropy',
                                random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

In [32]:
# Score the random forest on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_rf),
      classification_report(y_test, y_pred_rf),
      sep='\n')

[[67  1  5]
 [ 7  6 13]
 [ 4  1 16]]
              precision    recall  f1-score   support

           1       0.86      0.92      0.89        73
           2       0.75      0.23      0.35        26
           3       0.47      0.76      0.58        21

    accuracy                           0.74       120
   macro avg       0.69      0.64      0.61       120
weighted avg       0.77      0.74      0.72       120



# Stacking

Here we build a stacking classifier whose base models are Decision Tree, naive Bayes and Logistic
Regression classifiers. The final model (meta-learner) is a Support Vector Machine classifier.

In [33]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

# Level-0 learners whose predictions become the inputs of the final model.
dtc = DecisionTreeClassifier(max_depth=5, random_state=0)
nbc = GaussianNB()
lgc = LogisticRegression()
# Final model (meta-learner) that combines the level-0 predictions.
svc = SVC(gamma='auto', probability=True)

classifiers = [('c1', dtc), ('c2', nbc), ('c3', lgc)]
stk_clf = StackingClassifier(estimators=classifiers, final_estimator=svc)
stk_clf.fit(X_train, y_train)
y_pred_stk = stk_clf.predict(X_test)

In [34]:
# Score the stacking classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_stk),
      classification_report(y_test, y_pred_stk),
      sep='\n')

[[68  1  4]
 [ 3 11 12]
 [ 2  1 18]]
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        73
           2       0.85      0.42      0.56        26
           3       0.53      0.86      0.65        21

    accuracy                           0.81       120
   macro avg       0.77      0.74      0.72       120
weighted avg       0.84      0.81      0.80       120



# Boosting
Let’s build an AdaBoost classifier. The base model is a decision tree with a maximum depth of
1 (a single split, i.e. a decision stump). The number of base models is 20.

In [35]:
from sklearn.ensemble import AdaBoostClassifier

num_model = 20
# Decision stump (a tree with a single split) used as the weak learner.
model = DecisionTreeClassifier(max_depth=1, random_state=0)
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
# NOTE(review): recent scikit-learn releases also deprecate the `algorithm`
# argument -- confirm against the installed version before dropping it.
ada_clf = AdaBoostClassifier(model,
                             n_estimators=num_model,
                             algorithm='SAMME')
ada_clf.fit(X_train, y_train)
y_pred_ada = ada_clf.predict(X_test)

In [36]:
# Score the AdaBoost classifier on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_ada),
      classification_report(y_test, y_pred_ada),
      sep='\n')

[[66  5  2]
 [ 5  6 15]
 [ 3  3 15]]
              precision    recall  f1-score   support

           1       0.89      0.90      0.90        73
           2       0.43      0.23      0.30        26
           3       0.47      0.71      0.57        21

    accuracy                           0.73       120
   macro avg       0.60      0.62      0.59       120
weighted avg       0.72      0.72      0.71       120



In [37]:
from sklearn.ensemble import GradientBoostingClassifier

num_model = 60
# Gradient boosting over 60 decision stumps (max_depth=1).
# The loss is left at its default: the explicit name 'deviance' was renamed to
# 'log_loss' in scikit-learn 1.1 and removed in 1.3, while the default selects
# that same loss on every version.
gd_model = GradientBoostingClassifier(n_estimators=num_model, max_depth=1)
gd_model.fit(X_train, y_train)
y_pred_gd = gd_model.predict(X_test)

In [38]:
# Score the gradient-boosting model on the held-out set: confusion matrix, then report.
print(confusion_matrix(y_test, y_pred_gd),
      classification_report(y_test, y_pred_gd),
      sep='\n')

[[66  3  4]
 [ 7  6 13]
 [ 3  0 18]]
              precision    recall  f1-score   support

           1       0.87      0.90      0.89        73
           2       0.67      0.23      0.34        26
           3       0.51      0.86      0.64        21

    accuracy                           0.75       120
   macro avg       0.68      0.66      0.62       120
weighted avg       0.76      0.75      0.73       120

