## Step 1 
First, we import everything needed for the assignment, including the model classes I will be using.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

## Step 2 
Next, I load the heart train/test dataset (`heart`) and the heart validation dataset, which is named `heart_new` and will be used for making the final predictions at the end.

In [26]:
# Labelled data: used below for the train/test split and for model selection.
heart = pd.read_csv("heart_train_test.csv")
# Separate file that is only passed to the fitted grid search for the final predictions.
heart_new = pd.read_csv("heart_validation_nt.csv")

## Step 3
Data analysis: here I examine the features in the dataset. I also look at the target variable, whose value counts show how many people had heart disease and how many didn't.

In [3]:
# Column names, non-null counts and dtypes: confirms there are no missing values to impute.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       243 non-null    int64  
 1   sex       243 non-null    int64  
 2   cp        243 non-null    int64  
 3   trestbps  243 non-null    int64  
 4   chol      243 non-null    int64  
 5   fbs       243 non-null    int64  
 6   restecg   243 non-null    int64  
 7   thalach   243 non-null    int64  
 8   exang     243 non-null    int64  
 9   oldpeak   243 non-null    float64
 10  slope     243 non-null    int64  
 11  ca        243 non-null    int64  
 12  thal      243 non-null    int64  
 13  target    243 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 26.7 KB


In [4]:
# Summary statistics per column; the features are on very different scales.
heart.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0
mean,54.065844,0.683128,0.950617,131.769547,245.353909,0.152263,0.563786,150.26749,0.325103,1.019342,1.407407,0.736626,2.300412,0.563786
std,9.006184,0.466218,1.023298,17.731596,52.535921,0.360018,0.529155,23.165022,0.46938,1.177122,0.618805,1.022766,0.606678,0.496938
min,29.0,0.0,0.0,94.0,131.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,0.0,0.0,120.0,207.5,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,239.0,0.0,1.0,155.0,0.0,0.6,1.0,0.0,2.0,1.0
75%,60.0,1.0,2.0,140.0,274.5,0.0,1.0,168.5,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
# Class balance of the label column.
heart['target'].value_counts()

1    137
0    106
Name: target, dtype: int64

## Step 4 
Here we do a train/test split and examine the shapes of the resulting sets. We also check the column dtypes to see whether any feature needs encoding.

In [6]:

# Predictors are every column except the label.
features = heart.drop(columns='target')

# Hold out 20% of the rows; stratifying on the label keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    heart['target'],
    test_size=0.20,
    stratify=heart['target'],
    random_state=7,
)

In [7]:
# (rows, columns) of the training features.
X_train.shape

(194, 13)

In [8]:
# (rows, columns) of the held-out test features.
X_test.shape

(49, 13)

In [9]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(194,)

In [10]:
# Number of test labels; should match the row count of X_test.
y_test.shape

(49,)

In [11]:
# Columns stored as strings/objects; these would be the ones needing one-hot encoding.
[name for name, dtype in heart.dtypes.items() if dtype == 'object']

[]

In [12]:
# Columns stored as 64-bit floats or integers (this list includes the label column).
[name for name, dtype in heart.dtypes.items() if dtype in ('float64', 'int64')]

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

## Step 5 
Here is the column transformer. There is no OneHotEncoder because, as we can see above, no column has the `object` dtype, so we use only the StandardScaler.
Next is the pipeline, which chains the scaling step and the classifier so that the data is transformed before it reaches the model. The parameter grid is then used for hyperparameter tuning, to determine the optimal values for each candidate model. After this step we have the best model found by the grid search, together with its cross-validation and test-set scores.

In [27]:
# All thirteen predictors go through the same StandardScaler step.
scaled_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope',
    'ca', 'thal',
]
col_trans_std = make_column_transformer((StandardScaler(), scaled_columns))




In [14]:
# Two-step pipeline: scale the features, then classify.
# Logistic regression is the default classifier; the parameter grid can replace
# the 'classifier' step with other estimators during the search.
pipe = Pipeline(steps=[
    ('preprocessing', col_trans_std),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [17]:
# One sub-grid per model family; GridSearchCV searches each dict independently.

# Logistic regression (the pipeline's default classifier): regularisation
# strength C over ten powers of ten from 1e-5 to 1e4.
logreg_grid = {
    'preprocessing': [col_trans_std],
    'classifier__C': np.logspace(-5, 4, 10),
}

# Gradient boosting replaces the 'classifier' step: 8 leaf sizes x 2 ensemble
# sizes x 5 learning rates.
gboost_grid = {
    'classifier': [GradientBoostingClassifier(random_state=0)],
    'preprocessing': [col_trans_std],
    'classifier__min_samples_leaf': [60, 50, 40, 30, 20, 10, 5, 2],
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': np.linspace(0.01, 0.8, 5),
}

# k-nearest neighbours with k = 1, 4, ..., 22. No 'preprocessing' entry here,
# so the preprocessing step already set on the pipeline is used.
knn_grid = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': np.arange(1, 23, 3),
}

param_grid = [logreg_grid, gboost_grid, knn_grid]

In [19]:
# Exhaustive search over param_grid, scored by ROC AUC with 10-fold cross-validation.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)

In [20]:
# Run the grid search on the training split only; the test split stays untouched
# until the final score below.
gs.fit(X_train, y_train)

Fitting 10 folds for each of 98 candidates, totalling 980 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('standardscaler',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'sex',
                                                                          'cp',
                                                                          'trestbps',
                                                                          'chol',
                                                                          'fbs',
                                                                          'restecg',
                                                                          'thalach',
                                                                          'exang

In [21]:
# Full pipeline (preprocessing + classifier) of the best-scoring candidate.
print(f"Best estimator:\n{gs.best_estimator_}")

Best estimator:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'sex', 'cp',
                                                   'trestbps', 'chol', 'fbs',
                                                   'restecg', 'thalach',
                                                   'exang', 'oldpeak', 'slope',
                                                   'ca', 'thal'])])),
                ('classifier',
                 GradientBoostingClassifier(learning_rate=0.6025,
                                            min_samples_leaf=30,
                                            random_state=0))])


In [22]:
# gs was built with scoring='roc_auc', so this is the ROC AUC on the held-out test split.
print(f"Test set score: {gs.score(X_test, y_test):.3f}")

Test set score: 0.844


In [233]:
# Parameter combination of the best-scoring candidate.
# NOTE(review): re-run this cell after gs.fit. The saved output lists n_neighbors=28,
# which is not a value in param_grid, so it appears to come from an earlier run.
print(f"Best parameters: {gs.best_params_}")

Best parameters: {'classifier': KNeighborsClassifier(n_neighbors=28), 'classifier__n_neighbors': 28}


In [234]:
# Mean cross-validated ROC AUC of the best candidate.
print(f"Best cross-validation score: {gs.best_score_:.3f}")

Best cross-validation score: 0.921


## Step 6 
Here we first look at the full cross-validation results, then take the fitted grid search and apply it to the heart validation dataset. After that, we create a new CSV file with our predictions.

In [67]:
# One row per candidate: timings, parameters, per-fold scores, mean/std score and rank.
results = pd.DataFrame(gs.cv_results_)

In [68]:
# Display the cross-validation results table (pandas truncates the middle rows and columns).
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_preprocessing,param_classifier,param_classifier__learning_rate,param_classifier__min_samples_leaf,param_classifier__n_estimators,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008285,0.002812,0.003992,0.001781,0.00001,ColumnTransformer(transformers=[('standardscal...,,,,,...,0.939394,1.000000,0.977273,0.886364,0.875000,0.681818,0.911111,0.898813,0.086477,55
1,0.005856,0.000438,0.002663,0.000318,0.0001,ColumnTransformer(transformers=[('standardscal...,,,,,...,0.939394,1.000000,0.977273,0.886364,0.875000,0.681818,0.911111,0.899823,0.086650,53
2,0.005689,0.000754,0.002947,0.000610,0.001,ColumnTransformer(transformers=[('standardscal...,,,,,...,0.939394,1.000000,0.977273,0.886364,0.875000,0.681818,0.911111,0.899823,0.086650,53
3,0.005185,0.000205,0.002413,0.000145,0.01,ColumnTransformer(transformers=[('standardscal...,,,,,...,0.939394,1.000000,0.988636,0.886364,0.886364,0.715909,0.900000,0.908434,0.076839,23
4,0.005441,0.000677,0.002480,0.000154,0.1,ColumnTransformer(transformers=[('standardscal...,,,,,...,0.949495,1.000000,1.000000,0.875000,0.886364,0.750000,0.855556,0.902348,0.070284,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0.003525,0.000061,0.002964,0.000086,,,KNeighborsClassifier(),,,,...,0.898990,1.000000,1.000000,0.857955,0.914773,0.732955,0.905556,0.906275,0.075224,27
67,0.003551,0.000059,0.002968,0.000072,,,KNeighborsClassifier(),,,,...,0.909091,1.000000,1.000000,0.875000,0.909091,0.738636,0.905556,0.910505,0.074644,18
68,0.003585,0.000110,0.002969,0.000086,,,KNeighborsClassifier(),,,,...,0.919192,1.000000,1.000000,0.880682,0.920455,0.744318,0.916667,0.914394,0.074318,12
69,0.003546,0.000063,0.003018,0.000065,,,KNeighborsClassifier(),,,,...,0.898990,0.994318,1.000000,0.897727,0.909091,0.727273,0.900000,0.910013,0.075551,19


In [23]:
# Predicted class labels for the validation file, using the best estimator found by the search.
gs.predict(heart_new)

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0])

In [25]:
# Wrap the predicted labels in a Series and write them out without the row index.
predictions = pd.Series(gs.predict(heart_new))
predictions.to_csv('Nolan_assign_9.csv', index=False)