In [88]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('cleveland.csv', header=None)
data.head()

data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol',
              'fbs', 'restecg', 'thalach', 'exang', 
              'oldpeak', 'slope', 'ca', 'thal', 'target']

data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [5]:
data.shape

(303, 14)

In [3]:
#checking null val
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

 - ca nd thal are the only who got null values

In [4]:
# handling missing values

data = data.fillna(data.thal.median())
data = data.fillna(data.ca.median())

data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [13]:
data.target.value_counts()

0    164
1     55
2     36
3     35
4     13
Name: target, dtype: int64

# Data preprocessing

#### Features extraction

In [31]:
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [33]:
xfeatures = data[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
               'exang', 'oldpeak', 'slope', 'ca', 'thal']]
ylabel = data['target']

In [34]:
#select KBest
skb = SelectKBest(score_func=chi2, k=7)
best_features_fit = skb.fit(xfeatures, ylabel)

In [35]:
best_features_trans = best_features_fit.transform(xfeatures)
feature_scores = pd.DataFrame(best_features_fit.scores_,columns=['Feature_Scores'])

In [36]:
print(feature_scores)

Unnamed: 0,Feature_Scores
0,27.922884
1,7.499223
2,16.881183
3,18.870256
4,37.700089
5,6.658223
6,14.05755
7,215.713336
8,41.534482
9,101.997523


In [38]:
feature_column_names = pd.DataFrame(xfeatures.columns,columns=['Feature_name'])
best_feat_df = pd.concat([feature_scores,feature_column_names],axis=1)
best_feat_df

Unnamed: 0,Feature_Scores,Feature_name
0,27.922884,age
1,7.499223,sex
2,16.881183,cp
3,18.870256,trestbps
4,37.700089,chol
5,6.658223,fbs
6,14.05755,restecg
7,215.713336,thalach
8,41.534482,exang
9,101.997523,oldpeak


- from the table above the highest score is the best feature 

In [39]:
# Get the best 7 for our prediction
# The higher the number the more important the feature
best_feat_df.nlargest(7,'Feature_Scores')

Unnamed: 0,Feature_Scores,Feature_name
7,215.713336,thalach
9,101.997523,oldpeak
11,94.365851,ca
12,70.294698,thal
8,41.534482,exang
4,37.700089,chol
0,27.922884,age


In [40]:
best_feat_df.nlargest(7,'Feature_Scores')['Feature_name'].unique()

array(['thalach', 'oldpeak', 'ca', 'thal', 'exang', 'chol', 'age'],
      dtype=object)

In [43]:
# restore best feature to new variable
data2 = data[['age', 'sex', 'chol', 'thalach','exang', 'oldpeak', 'ca', 'thal', 'target']]

data2

Unnamed: 0,age,sex,chol,thalach,exang,oldpeak,ca,thal,target
0,63,1,233,150,0,2.3,0.0,6.0,0
1,67,1,286,108,1,1.5,3.0,3.0,2
2,67,1,229,129,1,2.6,2.0,7.0,1
3,37,1,250,187,0,3.5,0.0,3.0,0
4,41,0,204,172,0,1.4,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...
298,45,1,264,132,0,1.2,0.0,7.0,1
299,68,1,193,141,0,3.4,2.0,7.0,2
300,57,1,131,115,1,1.2,1.0,7.0,3
301,57,0,236,174,0,0.0,1.0,3.0,1


In [81]:
def preprocessing(df):
    df = df.copy()
    
    # replace 2,3,4 to 1 in target column
    df['target'] = df['target'].replace({2:1,3:1,4:1})
    
    #split data
    X = df.drop(['target'], axis=1)
    y = df['target']
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1, random_state=1)
    
    #scale data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    
  
    
    return X_train,X_test,y_train,y_test, X, y

In [105]:
y_test.shape

(31,)

In [82]:
X_train,X_test,y_train,y_test, X, y = preprocessing(data2)

In [107]:
X_train

Unnamed: 0,age,sex,chol,thalach,exang,oldpeak,ca,thal
12,0.183984,0.703211,0.160879,-0.366536,1.422049,-0.389705,0.326984,0.631181
29,-1.579672,0.703211,-1.541291,-1.601731,1.422049,0.849022,-0.719364,1.146741
59,-0.367158,0.703211,-0.661518,-1.116476,1.422049,0.318139,0.326984,-0.915497
257,2.388555,-1.422049,-0.967526,-1.513503,-0.703211,0.052698,-0.719364,-0.915497
73,1.176041,0.703211,0.007875,0.339289,-0.703211,-0.389705,1.373331,0.631181
...,...,...,...,...,...,...,...,...
203,1.065813,-1.422049,1.251033,-0.763563,-0.703211,-0.743627,-0.719364,1.146741
255,-1.359215,-1.422049,-0.738020,1.001001,-0.703211,-0.920588,-0.719364,-0.915497
72,0.845356,0.703211,0.371260,-2.263442,1.422049,0.672061,1.373331,1.146741
235,-0.036473,0.703211,0.734645,-1.513503,1.422049,1.910788,1.373331,-0.915497


# Model Training

In [103]:
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

# Model Evaluation

In [109]:
# accuracy
y_pred = model.predict(X_test)
acc = accuracy_score(y_pred, y_test)
print('Accuracy : ', acc*100)

# cros validation
cross = cross_val_score(model, X, y, cv=5)
print('Accuracy based on Cross Validation : ', cross.mean())

#confussion matrix
matrix = confusion_matrix(y_pred,y_test)
print(matrix)
print('Accuracy based on confussion matrix: ',  (matrix[0][0] + matrix[1][1])/len(y_test))

Accuracy :  87.09677419354838
Accuracy based on Cross Validation :  0.8250819672131149
[[16  2]
 [ 2 11]]
Accuracy based on confussion matrix:  0.8709677419354839
