In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

https://www.kaggle.com/ronitf/heart-disease-uci?select=heart.csv

# Heart Dataset

In [51]:
heart_df = pd.read_csv("../data/heart.csv")
heart_target = 'target'
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Cleaning the Data

Not much clearning needed to be done. The dataset seems to be well cleaned.

In [52]:
clean_df = heart_df.copy()

In [53]:
# No missing values

clean_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [54]:
clean_df[heart_target].value_counts()

1    165
0    138
Name: target, dtype: int64

## Preprocessing

### Feature Selection

In [55]:
# Continuous features
heart_continuous_features = ['testbps','chol','thalach','oldpeak']

# Categorical features (not including response)
heart_categorical_features = ['sex', 'cp','fbs','restecg','exang','slope','ca','thal']

# Discrete features
heart_discrete_features = ['age']

heart_target = 'target'

print(len(heart_continuous_features)+len(heart_categorical_features)+len(heart_discrete_features)+1)

14


In [68]:
featureselection_df = clean_df.copy()
ordEnc = OrdinalEncoder()

featureselection_df[heart_categorical_features] = ordEnc.fit(featureselection_df[heart_categorical_features]).transform(featureselection_df[heart_categorical_features])

In [69]:
cor = featureselection_df.corr()

#Correlation with output variable
cor_target = abs(cor[heart_target])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.1]
relevant_features

age         0.225439
sex         0.280937
cp          0.433798
trestbps    0.144931
restecg     0.137230
thalach     0.421741
exang       0.436757
oldpeak     0.430696
slope       0.345877
ca          0.391724
thal        0.344029
target      1.000000
Name: target, dtype: float64

In [70]:
X = featureselection_df.drop(columns=[heart_target])
y = featureselection_df[heart_target].values

for i in range (14):
    print("_______________________________________________________________________________________Best "+str(i)+":")
    # Create and fit selector
    selector = SelectKBest(chi2, k=i)
    selector.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = selector.get_support(indices=True)
    features_df_new = X.iloc[:,cols]
    print(features_df_new.columns)

_______________________________________________________________________________________Best 0:
Index([], dtype='object')
_______________________________________________________________________________________Best 1:
Index(['thalach'], dtype='object')
_______________________________________________________________________________________Best 2:
Index(['thalach', 'oldpeak'], dtype='object')
_______________________________________________________________________________________Best 3:
Index(['thalach', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 4:
Index(['cp', 'thalach', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 5:
Index(['cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 6:
Index(['cp', 'chol', 'thalach', 'exang', 'oldpeak',

In [71]:
best_features = ['thalach','oldpeak','ca','cp','exang','chol','age','trestbps','slope','sex','thal','restecg','fbs']

best_categorical_features = np.intersect1d(best_features, heart_categorical_features)

In [72]:
featureselection_df[best_categorical_features] = ordEnc.fit(featureselection_df[best_categorical_features]).transform(featureselection_df[best_categorical_features])
featureselection_df[heart_categorical_features] = featureselection_df[heart_categorical_features].astype('category') 

X = featureselection_df[best_features]
y = featureselection_df[heart_target].values

temp = []

for feature in best_features:
    temp.append(feature)
    print("________________________________________________")
    print(temp)
    X_train, X_test, y_train, y_test = train_test_split(X[temp], y, test_size=0.3, stratify=y)
    model = GradientBoostingClassifier(learning_rate=1, max_depth=9, n_estimators=500)
    model.fit(X_train, y_train)
    y_preds_rf = model.predict(X_test)
    ac = accuracy_score(y_test, y_preds_rf)
    print('Overall Accuracy: %.9f'%ac)

________________________________________________
['thalach']
Overall Accuracy: 0.659340659
________________________________________________
['thalach', 'oldpeak']
Overall Accuracy: 0.648351648
________________________________________________
['thalach', 'oldpeak', 'ca']
Overall Accuracy: 0.670329670
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp']
Overall Accuracy: 0.747252747
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang']
Overall Accuracy: 0.681318681
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol']
Overall Accuracy: 0.659340659
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age']
Overall Accuracy: 0.725274725
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age', 'trestbps']
Overall Accuracy: 0.670329670
___________________________________________

In [73]:
final_best_features = ['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age', 'trestbps', 'slope', 'sex', 'thal']

featureselection_df = clean_df.copy()
featureselection_df = featureselection_df[final_best_features]
featureselection_df[heart_target] = clean_df[heart_target]

featureselection_df

Unnamed: 0,thalach,oldpeak,ca,cp,exang,chol,age,trestbps,slope,sex,thal,target
0,150,2.3,0,3,0,233,63,145,0,1,1,1
1,187,3.5,0,2,0,250,37,130,0,1,2,1
2,172,1.4,0,1,0,204,41,130,2,0,2,1
3,178,0.8,0,1,0,236,56,120,2,1,2,1
4,163,0.6,0,0,1,354,57,120,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
298,123,0.2,0,0,1,241,57,140,1,0,3,0
299,132,1.2,0,3,0,264,45,110,1,1,3,0
300,141,3.4,2,0,0,193,68,144,1,1,3,0
301,115,1.2,1,0,1,131,57,130,1,1,3,0


### Feature Transformation

In [74]:
featureEngineering_df = featureselection_df.copy()

final_best_features = ['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age', 'trestbps', 'slope', 'sex', 'thal']

# Continuous features
continuous_features = ['chol','thalach','oldpeak']

# Categorical features (not including response)
categorical_features = ['sex', 'cp','exang','slope','ca','thal']

# Discrete features
discrete_features = ['age']

heart_target = 'target'
print(len(continuous_features)+len(categorical_features)+len(discrete_features)+1)

11


In [75]:
# 1-of-C Encoding for Categorical and Discrete Variables

def encodingHelper(list, item):
    newList = []
    for i in list:
        if str(i)==str(item):
            newList.append(1)
        else:
            newList.append(0)
    return newList

# The categorical variables are mapped to 1-of-C coding
new_categorical_features = []

for cat in categorical_features:
    items = sorted(featureEngineering_df[cat].unique())
    for item in items:
        featureEngineering_df[cat+"_"+str(item).replace(" ", "_")] = encodingHelper(featureEngineering_df[cat], item)
        new_categorical_features.append(cat+"_"+str(item).replace(" ", "_"))
    featureEngineering_df = featureEngineering_df.drop(columns=cat)
    

# All Discrete features should be classified as categorical
featureEngineering_df[new_categorical_features] = featureEngineering_df[new_categorical_features].astype('category') 
featureEngineering_df[heart_target] = featureEngineering_df[heart_target].astype('category')  

featureEngineering_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   thalach   303 non-null    int64   
 1   oldpeak   303 non-null    float64 
 2   chol      303 non-null    int64   
 3   age       303 non-null    int64   
 4   trestbps  303 non-null    int64   
 5   target    303 non-null    category
 6   sex_0     303 non-null    category
 7   sex_1     303 non-null    category
 8   cp_0      303 non-null    category
 9   cp_1      303 non-null    category
 10  cp_2      303 non-null    category
 11  cp_3      303 non-null    category
 12  exang_0   303 non-null    category
 13  exang_1   303 non-null    category
 14  slope_0   303 non-null    category
 15  slope_1   303 non-null    category
 16  slope_2   303 non-null    category
 17  ca_0      303 non-null    category
 18  ca_1      303 non-null    category
 19  ca_2      303 non-null    category
 20  ca_3      

In [76]:
# The continuous and discrete features are standardized to zero mean and unit variance

numerical_features = continuous_features + discrete_features

scaler = preprocessing.StandardScaler().fit(featureEngineering_df[numerical_features])
featureEngineering_df[numerical_features] = scaler.transform(featureEngineering_df[numerical_features])

print("__________________________________________Mean:")
print(featureEngineering_df[numerical_features].mean(axis=0))
print("___________________________________________STD:")
print(featureEngineering_df[numerical_features].std(axis=0))

# Note that the scaler cannot make the mean EXACTLY zero due to number representation limitations. 
# In practice, this is close enough.

__________________________________________Mean:
chol      -9.828955e-17
thalach   -5.203025e-16
oldpeak   -3.140136e-16
age        5.825923e-17
dtype: float64
___________________________________________STD:
chol       1.001654
thalach    1.001654
oldpeak    1.001654
age        1.001654
dtype: float64


## Oversampling to Balance the Dataset

In [79]:
oversampling_df = featureEngineering_df.copy()

# Balancing (oversampling)
count_class_0, count_class_1 = oversampling_df[heart_target].value_counts()
df_class_0 = oversampling_df[oversampling_df[heart_target] == 0]
df_class_1 = oversampling_df[oversampling_df[heart_target] == 1]
# Oversample
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
final_df = pd.concat([df_class_0, df_class_1_over], axis=0)
# Split back into X_train and y_train    
print('Random over-sampling:\n'+ str(final_df[marketing_target].value_counts()))

NameError: name 'marketing_target' is not defined

In [66]:
final_df.to_csv("../data/preprocessedMarketingCampaign.csv")