# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read heart.csv from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='heart.csv')
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Splitting data

In [3]:
from sklearn.model_selection import train_test_split
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)
print(len(X_train), len(X_test))

212 91


In [4]:
# Display the (rows, columns) dimensions of the training feature matrix.
X_train.shape

(212, 13)

# One-hot encoding

In [5]:
# Features that will be one-hot encoded.
cat_columns = [
    'cp',
    'exang',
    'slope',
    'thal',
]
# Every other feature column, kept in its original order and passed through as-is.
num_columns = [name for name in X_train.columns if name not in cat_columns]

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# One-hot encode the categorical features of the training set.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all-zero columns at transform time instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')

# Learn the category levels from the training data only.
encoder.fit(X_train[cat_columns])

# Encode the training data. The result is sparse, so densify it once and
# reuse the dense array for both the shape printout and the DataFrame.
X_train_cat_encoded = encoder.transform(X_train[cat_columns])
X_train_cat_dense = X_train_cat_encoded.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    column_names = encoder.get_feature_names_out(cat_columns)
else:
    column_names = encoder.get_feature_names(input_features=cat_columns)

print(X_train_cat_dense.shape)
print(column_names)

# Reuse X_train's index so the encoded frame lines up row-for-row with the
# remaining columns when the two are concatenated.
X_train_encoded_df = pd.DataFrame(X_train_cat_dense,
                                  columns=column_names,
                                  index=X_train.index)

(212, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


In [8]:
# Put the pass-through columns and the one-hot columns side by side.
# Both frames carry X_train's index, so the rows align.
X_train_encoded = pd.concat(
    objs=[X_train[num_columns], X_train_encoded_df],
    axis='columns',
)

In [9]:
# Encode the test set with the encoder that was fitted on the TRAINING data.
# The encoder is deliberately not re-fitted here: fitting on the test set
# leaks test information into preprocessing and can produce a different set
# or order of one-hot columns than the model was trained on.
X_test_cat_encoded = encoder.transform(X_test[cat_columns])
X_test_cat_dense = X_test_cat_encoded.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    column_names = encoder.get_feature_names_out(cat_columns)
else:
    column_names = encoder.get_feature_names(input_features=cat_columns)

print(X_test_cat_dense.shape)
print(column_names)

# Reuse X_test's index so the encoded frame lines up row-for-row with the
# remaining columns in the concat below.
X_test_encoded_df = pd.DataFrame(X_test_cat_dense,
                                 columns=column_names,
                                 index=X_test.index)

# Final test matrix: pass-through columns followed by the one-hot columns.
X_test_encoded = pd.concat([X_test[num_columns], X_test_encoded_df], axis=1)

(91, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


# AdaBoost classifier

In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Weak learner 'dt': a shallow classification tree.
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=2,
                            random_state=1)
# AdaBoost ensemble 'bc' built from up to 300 boosted copies of 'dt'.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works on both old and new releases.
# random_state is set on the ensemble as well: AdaBoost reseeds each copy of the
# base estimator from its own random_state, so seeding only 'dt' does not make
# the results reproducible between runs.
bc = AdaBoostClassifier(dt,
                        n_estimators=300,
                        random_state=1)
# Fit 'bc' to the training set
bc.fit(X_train_encoded, y_train)
# Predict the test set labels
y_pred = bc.predict(X_test_encoded)

NameError: name 'X_train_encoded' is not defined

In [12]:
# Score the predictions against the true labels of the held-out set.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Report the score rounded to three decimal places.
print('Test set accuracy: {:.3f}'.format(test_accuracy))

Test set accuracy: 0.780


In [13]:
from sklearn.metrics import classification_report
# Build the text report for the held-out predictions, then print it.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.78      0.76        41
           1       0.81      0.78      0.80        50

    accuracy                           0.78        91
   macro avg       0.78      0.78      0.78        91
weighted avg       0.78      0.78      0.78        91

