# Problem definition

Data vocabulary:
1. #3 (age): age in years
2. #4 (sex): sex (1 = male; 0 = female)
3. #9 (cp): cp: chest pain type | Value 0: typical angina | Value 1: atypical angina | Value 2: non-anginal pain | Value 3: asymptomatic
4. #10 (trestbps): resting blood pressure (in mm Hg on admission to the hospital)
5. #12 (chol): serum cholesterol in mg/dl
6. #16 (fbs): (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. #19 (restecg): resting electrocardiographic results | Value 0: normal | Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) | Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. #32 (thalach): maximum heart rate achieved
9. #38 (exang): exercise induced angina (1 = yes; 0 = no)
10. #40 (oldpeak): ST depression induced by exercise relative to rest
11. #41 (slope): the slope of the peak exercise ST segment | Value 1: upsloping | Value 2: flat | Value 3: downsloping
12. #44 (ca): number of major vessels (0-3) colored by fluoroscopy
13. #51 (thal): 3 = normal; 6 = fixed defect; 7 = reversible defect
14. #58 (num) (the predicted attribute): Value 0: < 50% diameter narrowing | Value 1: > 50% diameter narrowing

# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder,StandardScaler,PowerTransformer, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, cross_val_predict, train_test_split,GridSearchCV,cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectKBest,SelectPercentile,f_classif,f_regression,mutual_info_regression
# from xgboost import XGBRegressor
# from xgboost import plot_importance


#importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import warnings
warnings.filterwarnings('ignore')

plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('max_colwidth',200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.7f' % x)

# Data Gathering

In [2]:
df0 = pd.read_csv("heart.csv")
df = df0.copy()

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

# Data Preparation

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.nunique()

In [None]:
# Names of every numeric column except the label column.
feature_frame = df.drop(columns=['target'])
numerical = feature_frame.select_dtypes('number').columns

print(f'Numerical Columns:  {numerical}')

**Check for Duplicated Values**

In [3]:
df.duplicated().value_counts()

NameError: name 'df' is not defined

In [None]:
# Show the rows flagged as repeats of an earlier row.
df.loc[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

**Check for Missing Values**

In [None]:
# Columns of the working frame that contain at least one missing value.
# Built directly as a list (no side-effect comprehension) and checked with the
# vectorised Series.any() on the same frame whose columns are being iterated.
miss_val = [col for col in df.columns if df[col].isnull().any()]
miss_val

In [None]:

df.isnull().sum()

In [None]:
sns.heatmap(df.corr(), annot=True);

## The Examination of Target Variable

In [None]:
df['target'].describe()

In [None]:
print( f"Skewness: {df['target'].skew()}")

In [None]:
df['target'].iplot(kind='hist')

## Numerical Features

In [None]:
df[numerical].describe()

In [None]:
#sns.pairplot(data=df0, hue="target", height=4, aspect =1)

In [None]:
df[numerical].iplot(kind='hist');

In [None]:
df[numerical].iplot(kind='histogram',subplots=True,bins=50)

In [None]:
# One box plot per numerical feature, split by target class, laid out on a 4x4 grid.
plt.figure(figsize=(20,20))
features_to_plot = [col for col in numerical if col != "target"]
for index, feature in enumerate(features_to_plot, start=1):
    plt.subplot(4, 4, index)
    sns.boxplot(x='target', y=feature, data=df)

In [None]:
# Threshold used to flag a feature as skewed; |skew| below 1 is generally
# considered acceptable for linear models.
skew_limit = 0.75
skew_vals = df[numerical].skew()
is_skewed = skew_vals.abs() > skew_limit
# Keep only the flagged features, most positively skewed first.
skew_cols = skew_vals.loc[is_skewed].sort_values(ascending=False)
skew_cols

In [None]:
df[skew_cols.index].iplot(kind='hist');

In [None]:
df[skew_cols.index].iplot(kind='histogram',subplots=True,bins=50)

# Model Selection

### Data Preprocessing

In [None]:
X = df.drop("target", axis=1)
y = df["target"]

**Train & Split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

**Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()
scaler

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Implement Logistic Regression and Evaluate

**With Default Parameters**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model = LogisticRegression()

In [None]:
log_model.fit(X_train_scaled, y_train)

In [None]:
y_test_pred=log_model.predict(X_test_scaled)

In [None]:
y_test_pred_proba = log_model.predict_proba(X_test_scaled)

In [None]:
# Test features and true labels side by side, plus the default model's
# predicted class and its predicted probability for class 1.
test_data = pd.concat([X_test, y_test], axis=1).assign(
    pred=y_test_pred,
    pred_proba=y_test_pred_proba[:, 1],
)
test_data.sample(10)

## Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix

In [None]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test, normalize='true');

In [None]:
from yellowbrick.classifier import ClassificationReport, ConfusionMatrix

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(log_model)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train_scaled, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and builds the matrix; without score() + show() the visualizer never renders anything.
cm.score(X_test_scaled, y_test)
cm.show()

# Per-class precision / recall / F1 heatmap (with support counts) for the same model.
visualizer = ClassificationReport(log_model, support=True)
visualizer.fit(X_train_scaled, y_train)        # Fit the visualizer and the model
visualizer.score(X_test_scaled, y_test)        # Evaluate the model on the test data
visualizer.show()

In [None]:
# print(confusion_matrix(y_test, y_test_pred)) (0.10)
# print(classification_report(y_test, y_test_pred))

In [None]:
# print(confusion_matrix(y_test, y_test_pred)) (0.15)
# print(classification_report(y_test, y_test_pred))

## Cross Validate

**NOTE: It can be implemented by either default parameter or GridSearch**

**a) Implementation of Cross-Validation With Default Parameter**

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
model = LogisticRegression()
model

In [None]:
# 10-fold cross-validation of the default model on the training split;
# one row per fold, then the per-metric mean (the first two columns are timings).
cv_metrics = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X_train_scaled, y_train, scoring=cv_metrics, cv=10)
df_scores = pd.DataFrame(scores, index=range(1, 11))
#df_scores
df_scores.mean().iloc[2:]

**b) Implementation of Cross-Validation With Best Parameters (GridSearchCV)**

**LogisticRegression applies regularization automatically, and the default penalty is L2 (Ridge). We can change this parameter to suit our problem and/or dataset.**

Special Note:

Algorithm to use in the optimization problem. Default is ‘lbfgs’. To choose a solver, you might want to consider the following aspects:

For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones;

For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss;

‘liblinear’ is limited to one-versus-rest schemes.

Warning:

The choice of the algorithm depends on the penalty chosen: Supported penalties by solver: ‘newton-cg’ - [‘l2’, ‘none’]

‘lbfgs’ - [‘l2’, ‘none’]

‘liblinear’ - [‘l1’, ‘l2’]

‘sag’ - [‘l2’, ‘none’]

‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, ‘none’]

**We prefer "saga" since we selected "l1", "l2" and "elasticnet" as penalty.**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


In [None]:
log_model = LogisticRegression(solver='saga', max_iter=5000)

In [None]:
penalty = ["l1", "l2", "elasticnet"]
l1_ratio = np.linspace(0, 1, 20)
C = np.logspace(0, 10, 20)

# l1_ratio is only used by the elasticnet penalty. Crossing it with "l1" and "l2"
# would refit identical models 20 times each, so the grid is split into two
# sub-grids (GridSearchCV accepts a list of dicts and searches each one).
param_grid = [
    {"penalty": ["l1", "l2"], "C": C},
    {"penalty": ["elasticnet"], "l1_ratio": l1_ratio, "C": C},
]

In [None]:
grid_model = GridSearchCV(log_model, param_grid = param_grid)

In [None]:
grid_model.fit(X_train_scaled,y_train)

In [None]:
grid_model.best_params_

In [None]:
y_test_pred = grid_model.predict(X_test_scaled)
y_test_pred

In [None]:
plot_confusion_matrix(grid_model, X_test_scaled, y_test)

In [None]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

## ROC (Receiver Operating Characteristic) Curve and AUC (Area Under the Curve)

In [None]:
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, roc_auc_score, auc, roc_curve, average_precision_score, precision_recall_curve

In [None]:
plot_roc_curve(grid_model, X_test_scaled, y_test);

In [None]:
plot_precision_recall_curve(grid_model, X_test_scaled, y_test);

## Finding Best Threshold

In [None]:
plot_roc_curve(grid_model, X_train_scaled, y_train);

In [None]:
y_pred_proba = grid_model.predict_proba(X_train_scaled)
roc_auc_score(y_train, y_pred_proba[:,1])

In [None]:
fp_rate, tp_rate, thresholds = roc_curve(y_train, y_pred_proba[:,1])

In [None]:
# Pick the ROC point with the largest gap between true- and false-positive rate
# and use its threshold as the decision cut-off.
tpr_minus_fpr = tp_rate - fp_rate
optimal_idx = tpr_minus_fpr.argmax()
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

In [None]:
test_data.head()

In [None]:
# optimal_threshold was derived from grid_model's probabilities on the training
# split, so it has to be applied to grid_model's probabilities on the test split.
# test_data["pred_proba"] holds the default-parameter model's probabilities instead.
grid_test_proba = grid_model.predict_proba(X_test_scaled)[:, 1]
test_data["pred2"] = (grid_test_proba >= optimal_threshold).astype(int)

In [None]:
test_data.head()

In [None]:
y_test_pred2 = test_data["pred2"]

In [None]:
print(confusion_matrix(y_test,y_test_pred2))
print(classification_report(y_test,y_test_pred2))

In [None]:
print(confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))

In [None]:
plot_precision_recall_curve(grid_model, X_train_scaled, y_train);

In [None]:
y_pred_proba = grid_model.predict_proba(X_train_scaled)
average_precision_score(y_train, y_pred_proba[:,1])

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_train, y_pred_proba[:,1])

In [None]:
# precision_recall_curve returns one more precision/recall pair than thresholds
# (the final point has no threshold), so drop it to keep the index valid for
# `thresholds`. The harmonic mean is 0/0 where precision and recall are both zero;
# treat F1 as 0 there so a NaN cannot win the argmax.
pr_sum = precisions[:-1] + recalls[:-1]
f1_scores = np.divide(
    2 * precisions[:-1] * recalls[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
optimal_idx = np.argmax(f1_scores)  # threshold with the best F1 (harmonic mean)
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

In [None]:
test_data.head()

In [None]:
# The precision/recall-based optimal_threshold was derived from grid_model's
# training probabilities, so apply it to grid_model's test probabilities rather
# than to test_data["pred_proba"], which comes from the default-parameter model.
grid_test_proba = grid_model.predict_proba(X_test_scaled)[:, 1]
test_data["pred3"] = (grid_test_proba >= optimal_threshold).astype(int)

In [None]:
test_data.head()

In [None]:
y_test_pred3 = test_data["pred3"]

In [None]:
# Both reports must describe the same predictions: the ones made with the
# precision/recall-based threshold (pred3), not the ROC-based ones (pred2).
print(confusion_matrix(y_test, y_test_pred3))
print(classification_report(y_test, y_test_pred3))

# Implement KNN and Evaluate

In [None]:
X = df.drop("target", axis=1)
y = df["target"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = MinMaxScaler()

In [None]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

### Modelling

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5) 

In [None]:
knn_model.fit(scaled_X_train,y_train)

In [None]:
y_test_pred = knn_model.predict(scaled_X_test)
y_test_pred

In [None]:
y_test_pred_proba = knn_model.predict_proba(scaled_X_test)
y_test_pred_proba

In [None]:
pd.DataFrame(y_test_pred_proba)

In [None]:
my_dict = {"Actual": y_test, "Pred":y_test_pred, "Proba_1":y_test_pred_proba[:,1], "Proba_0":y_test_pred_proba[:,0]}

In [None]:
pd.DataFrame.from_dict(my_dict).sample(10)

### Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
plot_confusion_matrix(knn_model, scaled_X_test, y_test);

In [None]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

### Elbow Method for Choosing Reasonable K Values

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
test_error_rates = []

# Fit one KNN classifier for every k in 1..29 and record its misclassification
# rate (1 - accuracy) on the test split.
for k in range(1, 30):
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(scaled_X_train, y_train)
    y_test_pred = knn_model.predict(scaled_X_test)

    test_error = 1 - accuracy_score(y_test, y_test_pred)
    test_error_rates.append(test_error)
    print("k=", k, "için error_rate:", test_error)

In [None]:
# Plot the test error rate recorded for each k (1..29) to look for an "elbow".
plt.figure(figsize=(15,8))
plt.plot(range(1,30), test_error_rates, color='blue', linestyle='--', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K_values')
plt.ylabel('Error Rate')
# Horizontal reference lines at fixed error levels.
# NOTE(review): 0.1311 and 0.1475 are hard-coded; presumably error rates observed
# in an earlier run of the loop above — confirm they still match after re-running.
plt.hlines(y=0.1311, xmin = 0, xmax = 30, colors= 'r', linestyles="--")
plt.hlines(y=0.1475, xmin = 0, xmax = 30, colors= 'r', linestyles="--")


### Scores by Value of K = 9

In [None]:
knn = KNeighborsClassifier(n_neighbors=9)

knn.fit(scaled_X_train,y_train)
pred = knn.predict(scaled_X_test)

print('WITH K=9')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

### Gridsearch Method for Choosing Reasonable K Values

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
knn_grid = KNeighborsClassifier()

In [None]:
k_values= range(1,30)

In [None]:
param_grid = {"n_neighbors":k_values, "p": [1,2], "weights": ['uniform', "distance"]}

In [None]:
knn_grid_model = GridSearchCV(knn_grid, param_grid, cv=10, scoring= 'accuracy')

In [None]:
knn_grid_model.fit(scaled_X_train, y_train)

In [None]:
# The target is a class label and KNN is distance based, so tune and evaluate a
# *classifier* on the min-max scaled features. The previous version fitted a
# KNeighborsRegressor on the unscaled data and reported RMSE, which is not a
# meaningful score for a classification target.
k_search = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": np.arange(1, 30, 1)}, cv=10)
k_best = k_search.fit(scaled_X_train, y_train).best_params_["n_neighbors"]

# Refit a single classifier with the selected k and score it on the held-out split.
knn_tuned = KNeighborsClassifier(n_neighbors=k_best).fit(scaled_X_train, y_train)
print(k_best)
print(knn_tuned.metric)
y_test_pred = knn_tuned.predict(scaled_X_test)
accuracy_score(y_test, y_test_pred)


In [None]:
knn_grid_model.best_params_

In [None]:
knn = KNeighborsClassifier(n_neighbors = 9, p = 1)

knn.fit(scaled_X_train,y_train)
pred = knn.predict(scaled_X_test)

print('WITH K=9')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

In [None]:
# knn = KNeighborsClassifier(n_neighbors = 4, p = 1)

# knn.fit(scaled_X_train,y_train)
# pred = knn.predict(scaled_X_test)

# print('WITH K=8')
# print('\n')
# print(confusion_matrix(y_test, pred))
# print('\n')
# print(classification_report(y_test, pred))

In [None]:
# Use the scaled test matrix produced in this KNN section (scaled_X_test) rather
# than X_test_scaled, which is a leftover from the logistic-regression section.
plot_confusion_matrix(knn, scaled_X_test, y_test);

# Implement SVM and Evaluate

In [None]:
X = df.drop("target", axis=1)
y = df["target"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Modelling and Model Performance

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
svm_model_scaled = SVC()
svm_model_scaled.fit(X_train_scaled, y_train)
y_test_pred = svm_model_scaled.predict(X_test_scaled)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

**SVC with Cross Validation**

In [None]:
model_SVC = SVC()

scores = cross_validate(model_SVC, X_train_scaled, y_train, scoring = ['accuracy', 'precision','recall','f1'], cv = 10)
df_scores = pd.DataFrame(scores, index = range(1, 11))
df_scores.mean()[2:]

### With Best Parameters (GridsearchCV)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# gamma only affects the rbf kernel; crossing it with the linear kernel would
# refit the same linear model once per gamma value. Split the search into one
# sub-grid per kernel (GridSearchCV accepts a list of dicts).
param_grid = [
    {'kernel': ['rbf'],
     'C': [0.1,1, 10, 100, 1000, 5000],
     'gamma': ["scale", "auto", 1,0.1,0.01,0.001,0.0001]},
    {'kernel': ['linear'],
     'C': [0.1,1, 10, 100, 1000, 5000]},
]

In [None]:
model_SVC = SVC()
svm_model_grid = GridSearchCV(model_SVC, param_grid, verbose=3, refit=True)

In [None]:
svm_model_grid.fit(X_train_scaled, y_train)

In [None]:
svm_model_grid.best_params_

In [None]:
svm_model_grid.best_estimator_

In [None]:
y_test_pred = svm_model_grid.predict(X_test_scaled)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

### Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
# Use the scaled test matrix produced in this SVM section (X_test_scaled) rather
# than scaled_X_test, which is a leftover from the KNN section.
plot_confusion_matrix(svm_model_grid, X_test_scaled, y_test);

# Visually compare models based on your chosen metric

In [None]:
# log_model was re-bound to a fresh saga estimator for the grid search, and
# GridSearchCV only fits clones of it, so log_model itself is unfitted here.
# Refit a default-parameter baseline explicitly before plotting.
baseline_log_model = LogisticRegression().fit(X_train_scaled, y_train)
plot_confusion_matrix(baseline_log_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(grid_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(knn_grid_model, X_test_scaled, y_test);

In [None]:
# Same scaled test matrix as the other models in this comparison (X_test_scaled),
# not the scaled_X_test variable left over from the KNN section.
plot_confusion_matrix(svm_model_grid, X_test_scaled, y_test);

In [None]:
y_test_pred = svm_model_grid.predict(X_test_scaled)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

# Choose the best model and make a random prediction

### Final Model

In [None]:
scaler = MinMaxScaler().fit(X)

In [None]:
X_scaled = scaler.transform(X)
X_scaled

In [None]:
# Re-runs the logistic-regression grid search on the full scaled dataset.
# fit() returns the GridSearchCV object itself, so final_model and grid_model
# refer to the same (now re-fitted) object.
final_model = grid_model.fit(X_scaled, y)
final_model

In [None]:
X.describe().T

### Model Deployment

In [None]:
import pickle

# Persist the fitted scaler. The context manager guarantees the file is flushed
# and closed before it is read back later in the notebook.
with open("scaler_target", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Persist the fitted model; the context manager closes the file handle that the
# bare open(...) call would otherwise leave open.
with open("final_model_target", 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [None]:
# Five hand-written example patients, one tuple per patient, in the same
# column order as the training features.
sample_columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                  "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
sample_rows = [
    (18, 0, 0,  90, 202, 1, 0, 154, 0, 0.90, 1, 1, 0),
    (43, 0, 1, 110, 244, 1, 1, 167, 1, 1.40, 0, 0, 3),
    (67, 1, 1, 130, 269, 0, 1, 142, 0, 2.20, 0, 3, 1),
    (24, 1, 3,  80, 212, 1, 0, 179, 1, 1.70, 2, 4, 2),
    (79, 0, 2, 120, 224, 0, 0, 191, 1, 1.10, 1, 2, 1),
]

# Transpose the rows into the column-name -> list-of-values mapping.
my_dict = {column: list(values) for column, values in zip(sample_columns, zip(*sample_rows))}

In [None]:
sample = pd.DataFrame(my_dict)
sample

In [None]:
# Load the scaler written earlier in this notebook; the context manager closes
# the file handle. NOTE: only unpickle files you created yourself — pickle.load
# can execute arbitrary code from an untrusted file.
with open("scaler_target", "rb") as scaler_file:
    scaler_target = pickle.load(scaler_file)

In [None]:
sample_scaled = scaler_target.transform(sample)
sample_scaled

In [None]:
# Load the model written earlier in this notebook; the context manager closes
# the file handle. NOTE: only unpickle files you created yourself — pickle.load
# can execute arbitrary code from an untrusted file.
with open("final_model_target", "rb") as model_file:
    final_model = pickle.load(model_file)

In [None]:
# Class predictions at the model's default cut-off, the class probabilities,
# and a second set of labels obtained by thresholding P(class 1) at optimal_threshold.
predictions_proba = final_model.predict_proba(sample_scaled)
predictions = final_model.predict(sample_scaled)
positive_proba = predictions_proba[:, 1]
predictions2 = (positive_proba >= optimal_threshold).astype(int).tolist()

In [None]:
sample["pred_proba"] = predictions_proba[:,1]
sample["pred"] = predictions
sample["pred2"] = predictions2
sample