In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import numpy as np
from scipy import stats as st
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression 

: 

Importing Dataset :

In [None]:
# Load the raw dataset from a CSV file resolved relative to the working directory.
Data = pd.read_csv("health care diabetes.csv")

: 

Data Exploration :

In [None]:
# Preview the first ten rows (positional slice, same rows as head(10)).
Data.iloc[:10]

: 

In [None]:
# Data.shape is a (rows, columns) tuple.
print('No. of rows and columns : ', Data.shape)

: 

In [None]:
# Banner followed by the column index of the dataset.
print('Columns names :', '===============', sep='\n')
column_index = Data.columns
print(column_index)

: 

In [None]:
# Banner, then pandas' own structural summary (info() prints and returns None).
print('Dataset structure information :', '===============================', sep='\n')
Data.info()

: 

In [None]:
# Per-column count of NaN/None entries (zeros are not counted here).
print('Count of missing values : ', '==========================', sep='\n')
null_counts = Data.isna().sum()
print(null_counts)

: 

In [None]:
# Banner, then the summary-statistics table as the cell's displayed value.
print('Dataset data description :', '==========================', sep='\n')
Data.describe()

: 

Checking for duplicate data

In [None]:
# Boolean flag per row: True when the row repeats an earlier row exactly.
dup = Data.duplicated()
print('Is there any duplicate rows ?', '=============================', sep='\n')
duplicate_flag_counts = dup.value_counts()
print(duplicate_flag_counts)

: 

No duplicate rows found in the dataset.


Exploring the distribution of data of columns 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' using histogram and density plot.

In [None]:
# One row per feature: histogram/density on the left, Q-Q plot on the right.
f, axes = plt.subplots(5, 2, figsize=(20, 25))
feature_colours = [
    ("Glucose", "skyblue"),
    ("BloodPressure", "olive"),
    ("SkinThickness", "Orange"),
    ("Insulin", "teal"),
    ("BMI", "Pink"),
]
for row_number, (feature, colour) in enumerate(feature_colours):
    sns.distplot(Data[feature], color=colour, ax=axes[row_number, 0])
    qqplot(Data[feature], line="45", fit=True, ax=axes[row_number, 1])
plt.show()

: 

The plots above show that, apart from Insulin, all the other features follow an almost normal distribution. Insulin has a lot of zero values, so once those are filled it may also follow a normal distribution.

A zero in the columns 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' indicates a missing value. So we count the zeros in these columns to find all the missing values.

In [None]:
# Count the zero entries (treated as missing) in each clinical measurement.
print('Number of missing values in :', '=============================', sep='\n')
zero_count_labels = [
    ('1. Glucose        = ', 'Glucose'),
    ('2. Blood Pressure = ', 'BloodPressure'),
    ('3. Skin Thickness = ', 'SkinThickness'),
    ('4. Insilin        = ', 'Insulin'),
    ('5. BMI            = ', 'BMI'),
]
for label, column in zero_count_labels:
    print(label, (Data[column] == 0).sum())

: 

From the counts above we can see that Insulin and SkinThickness have the most missing data.

In [None]:
# Share of rows whose Insulin / SkinThickness reading is zero, as a percentage.
Total_Records = len(Data)
Insuline_Missing_Count = Data['Insulin'].eq(0).sum()
SkinThickness_Missing_Count = Data['SkinThickness'].eq(0).sum()
Insulin_Missing_Pct = 100 * (Insuline_Missing_Count / Total_Records)
SkinThickness_Missing_Pct = 100 * (SkinThickness_Missing_Count / Total_Records)
print('% of missing Insulin Data       = ', Insulin_Missing_Pct.round(2))
print('% of missing SkinThickness Data = ', SkinThickness_Missing_Pct.round(2))


: 

Filling missing values.

I have noticed that, other than Insulin, all the features have almost the same mean and median. So I am using the median to fill those features, because the median is robust to outliers; the small difference between their mean and median may be due to those outliers. The difference between the mean and the median of Insulin is very high compared to the others. So, for this feature, I am using the trimmed mean, because I want to concentrate on the data in the denser region.

In [None]:
# Trimmed mean of the Insulin column with proportiontocut=0.20, truncated to an
# integer by astype(int).
# NOTE(review): the zeros that this notebook treats as missing are still in the
# column at this point, so they take part in the trimmed mean — confirm intended.
Insulin_Trim_Mean = st.trim_mean(Data.Insulin, .20, axis=0).astype(int)
print('Trim-Mean of Insulin : ', Insulin_Trim_Mean)

: 

In [None]:
# Replace the zero placeholders (treated as missing) with a representative value:
# the column median for most features, the trimmed mean for Insulin.
# Each fill value is computed from the column as it stands before any replacement.
# NOTE(review): the medians are taken over the column including its zeros.
fill_values = {
    'Glucose': Data['Glucose'].median(),
    'BloodPressure': Data['BloodPressure'].median(),
    'SkinThickness': Data['SkinThickness'].median(),
    'Insulin': Insulin_Trim_Mean,
    'BMI': Data['BMI'].median(),
}
for column, fill_value in fill_values.items():
    # Assign the result back instead of `Data.col.replace(..., inplace=True)`:
    # in-place methods on a column accessed through the frame are chained
    # assignment, which warns in pandas 2.x and does not modify `Data` at all
    # under copy-on-write.
    Data[column] = Data[column].replace(0, fill_value)

: 

In [None]:
# Write the current state of Data to disk; no index argument is passed, so
# pandas' default applies and the row index is written as the first column.
Data.to_csv('Healthcare.csv')

: 

In [None]:
# Frequency of each distinct Insulin value, most frequent first (value_counts default order).
Data.Insulin.value_counts()

: 

In [None]:
# Re-count zeros in the same five measurements to check the current state of Data.
print('Number of missing values in :', '=============================', sep='\n')
measured_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
remaining_zeros = Data[measured_columns].eq(0).sum()
print('1. Glucose        = ', remaining_zeros['Glucose'])
print('2. Blood Pressure = ', remaining_zeros['BloodPressure'])
print('3. Skin Thickness = ', remaining_zeros['SkinThickness'])
print('4. Insilin        = ', remaining_zeros['Insulin'])
print('5. BMI            = ', remaining_zeros['BMI'])

: 

Finding count of data types in the dataset.

In [None]:
# One-column frame ('Type') holding each column's dtype, with the two numeric
# dtypes relabelled to friendlier names.
D = Data.dtypes.to_frame(name='Type')
D = D.replace(['int64', 'float64'], ['Integer', 'Float'])
print("Types of data type :", '====================', sep='\n')
D

: 

In [None]:
# How many columns fall under each data-type label.
print('Count of data types :', '=====================', sep='\n')
type_counts = D['Type'].value_counts()
print(type_counts)

: 

Plotting the data type count using bar graph

In [None]:
# Bar chart with one bar per distinct value of D['Type'].
sns.countplot(x = 'Type', data = D)

: 

Univariate analysis

In [None]:
# Frequency table of Pregnancies ordered by value, then the same counts as bars.
print('Count of Values in Pregnancies :', '================================', sep='\n')
P = Data['Pregnancies'].value_counts().sort_index()
print(P)
sns.countplot(data=Data, x='Pregnancies')

: 

In [None]:
# Frequency table of the distinct Glucose values, ordered by value.
print('Count of Values in Glucose :', '============================', sep='\n')
P = Data['Glucose'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(90, 60))
sns.countplot(data=Data, x='Glucose')
plt.xticks(rotation='vertical', fontsize=30)
plt.xlabel('Glucose', fontsize=70)
plt.yticks(fontsize=50)
plt.ylabel('Counts', fontsize=70)
plt.show()

: 

In [None]:
# Frequency table of the distinct BloodPressure values, ordered by value.
print('Count of Values in Blood Presure :', '==================================', sep='\n')
P = Data['BloodPressure'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(50, 20))
sns.countplot(data=Data, x='BloodPressure')
plt.xticks(rotation='vertical', fontsize=30)
plt.xlabel('Blood Pressure', fontsize=40)
plt.yticks(fontsize=30)
plt.ylabel('Counts', fontsize=40)
plt.show()

: 

In [None]:
# Frequency table of the distinct SkinThickness values, ordered by value.
print('Count of Values in Skin Thickness :', '===================================', sep='\n')
P = Data['SkinThickness'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(90, 40))
sns.countplot(data=Data, x='SkinThickness')
plt.xticks(rotation='vertical', fontsize=50)
plt.xlabel('Skin Thickness', fontsize=70)
plt.yticks(fontsize=50)
plt.ylabel('Counts', fontsize=70)
plt.show()

: 

In [None]:
# Frequency table of the distinct Insulin values, ordered by value.
print('Count of Values in Insulin :', '============================', sep='\n')
P = Data['Insulin'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(150, 40))
sns.countplot(x='Insulin', data=Data)
plt.xticks(rotation='vertical', fontsize=25)
plt.xlabel('Insulin', fontsize=100)
plt.yticks(fontsize=50)
plt.ylabel('Counts', fontsize=100)
plt.show()

: 

In [None]:
# Frequency table of the distinct BMI values, ordered by value.
print('Count of Values in BMI :', '========================', sep='\n')
P = Data['BMI'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(100, 40))
sns.countplot(x='BMI', data=Data)
plt.xticks(rotation='vertical', fontsize=10)
plt.xlabel('BMI', fontsize=70)
plt.yticks(fontsize=50)
plt.ylabel("Counts", fontsize=70)
plt.show()


: 

In [None]:
# Frequency table of the distinct DiabetesPedigreeFunction values, ordered by value.
print('Count of Values in Diabetes Pedigree Function :', '===============================================', sep='\n')
P = Data['DiabetesPedigreeFunction'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(150, 40))
sns.countplot(x='DiabetesPedigreeFunction', data=Data)
plt.xticks(rotation='vertical', fontsize=7)
plt.xlabel('Diabetes Pedigree Function', fontsize=100)
plt.yticks(fontsize=50)
plt.ylabel('Counts', fontsize=100)
plt.show()

: 

In [None]:
# Frequency table of the distinct Age values, ordered by value.
print('Count of Values in Age :', '========================', sep='\n')
P = Data['Age'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(90, 40))
sns.countplot(x='Age', data=Data)
plt.xticks(rotation='vertical', fontsize=50)
plt.xlabel('Age', fontsize=60)
plt.yticks(fontsize=50)
plt.ylabel('Counts', fontsize=60)
plt.show()

: 

In [None]:
# Class balance of the target: count of rows per Outcome value.
print('Count of Values in Outcome :', '============================', sep='\n')
P = Data['Outcome'].value_counts().sort_index()
print(P)

# Same counts as a bar chart; the pyplot calls below act on this new figure.
plt.figure(figsize=(5, 5))
sns.countplot(x='Outcome', data=Data)
plt.xticks(fontsize=10)
plt.xlabel('Outcome', fontsize=15)
plt.yticks(fontsize=10)
plt.ylabel('Counts', fontsize=15)
plt.show()

: 

Analyzing the distribution of data after filling all missing values.

In [None]:
# Same 5x2 grid as before: density plot (left) and Q-Q plot (right) per feature,
# drawn from the current contents of Data.
f, axes = plt.subplots(5, 2, figsize=(20, 25))
feature_colours = {
    "Glucose": "skyblue",
    "BloodPressure": "olive",
    "SkinThickness": "Orange",
    "Insulin": "teal",
    "BMI": "Pink",
}
for row_number, feature in enumerate(feature_colours):
    sns.distplot(Data[feature], color=feature_colours[feature], ax=axes[row_number, 0])
    qqplot(Data[feature], line="45", fit=True, ax=axes[row_number, 1])
plt.show()

: 

Bivariate analysis

Scatter chart to understand the relation between two variables.

In [None]:
# Scatter-plot matrix of every column pair, points coloured by Outcome.
sns.pairplot(data=Data, hue='Outcome')
# NOTE(review): plt.xticks acts on the current axes only, so this presumably
# restyles a single panel of the grid rather than all of them — confirm intent.
plt.xticks(fontsize = 50)

: 

Analyzing correlation among different variables.

In [None]:
# Pairwise correlation matrix of all columns (DataFrame.corr defaults),
# shown as the cell's displayed value.
Cor = Data.corr()
print('Correlation Chart for the dataset :', '===================================', sep='\n')
Cor

: 

Visualizing the correlation using a heatmap

In [None]:

# Heatmap of the correlation matrix; the colour scale is clamped to [-0.2, 1.0].
sns.heatmap(data=Cor, vmin=-0.2, vmax=1.0, cmap='Purples')

: 

Data Modeling

In [None]:
# Work on a copy so modelling steps do not touch the exploration frame.
Model_Data = Data.copy()

: 

In [None]:
# Empty results table; one row per classifier is added further down the notebook.
report_columns = [
    'Classifier',
    'Accuracy',
    'Precision',
    'Specificity',
    'Recall/Sensitivity',
    'F1',
    'ROC-AUC-Score',
    'PR-AUC-Score',
]
Model_Comparision_Report = pd.DataFrame(columns=report_columns)

: 

Data preprocessing
Selecting independent and target variables

In [None]:
# Features are the first eight columns, the target is the last column.
# Both are read from Model_Data (the original took the target from Data),
# so the modelling inputs come from a single frame.
x = Model_Data.iloc[:, :8].copy()
y = Model_Data.iloc[:, -1]

: 

Splitting dataset into train and test set in the ratio of 80:20

In [None]:
# 80:20 hold-out split with a fixed seed; the cell displays the four shapes.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=60)
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

: 

Scaling data

In [None]:
# Standardize the features using statistics learned from the training split only.
scaler = StandardScaler()
scaled_xtrain = scaler.fit_transform(xtrain)
# Reuse the training mean/std on the test split. Refitting on xtest (as the
# original fit_transform call did) puts the two splits on different scales and
# lets test-set statistics influence the evaluation.
scaled_xtest = scaler.transform(xtest)

: 

Creating ROC curve for tpr=fpr or no-skill classifier

In [None]:
# A constant score for every test row gives the reference (no-skill) ROC line.
random_probs = [0] * len(ytest)
fpr_ns, tpr_ns, thd = roc_curve(ytest, random_probs, pos_label=1)

: 

No skill model, stratified random class predictions for Precision_Recall curve

In [None]:
# Precision of a no-skill classifier equals the positive-class prevalence of the
# set being evaluated. The curves are computed on ytest, so the baseline is taken
# from ytest (the original used the full y, which mixes in the training rows).
no_skill = len(ytest[ytest == 1]) / len(ytest)

# Stratified dummy model: predicts classes at random with the training-set class
# frequencies; its scores give reference ROC-AUC / PR-AUC values.
model = DummyClassifier(strategy='stratified', random_state=0)
model.fit(xtrain, ytrain)
yhat = model.predict_proba(xtest)
dummy_probs = yhat[:, 1]
precision, recall, _ = precision_recall_curve(ytest, dummy_probs)
ns_roc_score = metrics.roc_auc_score(ytest, dummy_probs)
ns_pr_score = metrics.auc(recall, precision)
print('ROC-AUC Score for no skill classifier = ', ns_roc_score)
print('Precision_Recal Score for no skill classifier = ', ns_pr_score)

: 

KNN Modeling

Elbow method to find optimum value of K.

In [None]:
# Misclassification rate on the test split for K = 1..29.
# NOTE(review): K is being chosen on the same split later used for evaluation.
error_rate = []
for i in range(1, 30):
    model = KNeighborsClassifier(n_neighbors=i).fit(scaled_xtrain, ytrain)
    pred_i = model.predict(scaled_xtest)
    misclassified = pred_i != ytest
    error_rate.append(np.mean(misclassified))

: 

In [None]:
# Error rate against K; the x values mirror the range(1, 30) used to build error_rate.
plt.figure(figsize=(20,5))
plt.plot(range(1,30), error_rate,color='blue')
plt.xlabel('Values of K', fontsize=15)
plt.ylabel('Error Rate', fontsize=15)

: 

Elbow method is showing K=15 gives the lowest error rate.

In [None]:
# Final KNN with K fixed at 15, fitted on the standardized training features.
knn_classifier = KNeighborsClassifier(n_neighbors = 15)
# fit() returns the estimator itself, so knn_model and knn_classifier are the same object.
knn_model = knn_classifier.fit(scaled_xtrain, ytrain)
knn_model

: 

In [None]:
# Test-set class predictions and class-probability estimates for KNN.
# ypredict / prob_predict are reassigned by each later model section, so the
# metric cells that follow must run before the next model's prediction cell.
ypredict = knn_classifier.predict(scaled_xtest)
prob_predict = knn_classifier.predict_proba(scaled_xtest)

: 

In [None]:
# Mean accuracy (estimator.score) on each split for KNN.
knn_train_score = knn_model.score(scaled_xtrain, ytrain)
print('Training model score for KNN : ', knn_train_score)
knn_test_score = knn_model.score(scaled_xtest, ytest)
print('Test model score for KNN     : ', knn_test_score)

: 

In [None]:
# Confusion matrices (rows: true class, columns: predicted class) for both splits.
knn_train_predictions = knn_classifier.predict(scaled_xtrain)
train_matrix = confusion_matrix(ytrain, knn_train_predictions)
test_matrix = confusion_matrix(ytest, ypredict)
print('Confusion matrix for train data for KNN :', '=========================================', sep='\n')
print(train_matrix, '\n')
print('Confusion matrix for test data for KNN :', '========================================', sep='\n')
print(test_matrix)

: 

In [None]:
# Per-class precision / recall / F1 for KNN on the training split...
print('Classification report for train data for KNN :')
print('==============================================')
print(classification_report(ytrain, knn_classifier.predict(scaled_xtrain)))
# ...and on the test split. The header previously said "train" although the
# report below is computed from ytest / ypredict.
print('Classification report for test data for KNN :')
print('=============================================')
print(classification_report(ytest, ypredict))

: 

In [None]:
# Specificity (%) from the first row of the test confusion matrix:
# [0, 0] = true negatives, [0, 1] = false positives.
true_negatives = test_matrix[0, 0]
false_positives = test_matrix[0, 1]
spec = round(true_negatives / (true_negatives + false_positives) * 100, 1)
spec

: 

In [None]:
# ROC and precision-recall curves for KNN from the positive-class probabilities,
# plus both areas under the curve as percentages rounded to one decimal.
knn_positive_scores = prob_predict[:, 1]
fpr_knn, tpr_knn, thresh = roc_curve(ytest, knn_positive_scores, pos_label=1)
precision_knn, recall_knn, _ = precision_recall_curve(ytest, knn_positive_scores, pos_label=1)
auc_score = round(roc_auc_score(ytest, knn_positive_scores) * 100, 1)
pr_auc_score = round(auc(recall_knn, precision_knn) * 100, 1)
print('ROC_AUC_Score for KNN : ', auc_score)
print('Precision-Recall Score for KNN : ', pr_auc_score)

: 

In [None]:
# Headline test-set metrics for KNN, as percentages rounded to one decimal.
ac = round(metrics.accuracy_score(ytest, ypredict) * 100, 1)
f1 = round(metrics.f1_score(ytest, ypredict) * 100, 1)
re = round(metrics.recall_score(ytest, ypredict) * 100, 1)
pr = round(metrics.precision_score(ytest, ypredict) * 100, 1)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is
# the supported equivalent. Re-running this cell adds a duplicate row.
knn_row = pd.DataFrame([{'Classifier': 'KNN', 'Accuracy': ac, 'Recall/Sensitivity': re,
                         'Specificity': spec, 'Precision': pr, 'F1': f1,
                         'ROC-AUC-Score': auc_score, 'PR-AUC-Score': pr_auc_score}])
Model_Comparision_Report = pd.concat([Model_Comparision_Report, knn_row], ignore_index=True)

: 

SVM Modeling

Using Standardized data as it is distance based algorithm.

Finding best parameter

In [None]:
# Hyper-parameter grid for the SVM search: 3 x 3 x 2 combinations.
param_grid = dict(
    C=[10, 100, 1000],
    kernel=['rbf', 'linear', 'poly'],
    gamma=['auto', 'scale'],
)

: 

In [None]:
# Exhaustive search over param_grid with GridSearchCV's default cross-validation
# and scoring, fitted on the standardized training split.
cls_svm = svm.SVC()
grid_search = GridSearchCV(estimator=cls_svm, param_grid=param_grid)
grid_search_model = grid_search.fit(scaled_xtrain, ytrain)

: 

In [None]:
# Winning parameter combination and the estimator refitted with it.
best_parameters = grid_search_model.best_params_
best_estimator = grid_search_model.best_estimator_
print('Best Parameters : ', best_parameters, '\n')
print('Best Estimator  : ', best_estimator)

: 

In [None]:
# Final SVM with hand-written hyper-parameters, fitted on the standardized training split.
# NOTE(review): the values are typed in rather than read from
# grid_search_model.best_params_ — presumably copied from the search output; confirm.
svm_classifier = svm.SVC(kernel = 'linear', gamma='auto', C=1000)
# fit() returns the estimator itself, so svm_model and svm_classifier are the same object.
svm_model = svm_classifier.fit(scaled_xtrain, ytrain)

: 

In [None]:
# Hard class predictions for the standardized test split.
ypredict = svm_model.predict(scaled_xtest)
# decision_function returns signed margin scores, not probabilities; despite the
# variable name they are used later only as ranking scores for the ROC / PR curves.
prob_predict = svm_classifier.decision_function(scaled_xtest)

: 

In [None]:
# Mean accuracy (estimator.score) on each split for the SVM.
svm_train_score = svm_model.score(scaled_xtrain, ytrain)
print('Training model score for SVM : ', svm_train_score)
svm_test_score = svm_model.score(scaled_xtest, ytest)
print('Test model score for SVM     : ', svm_test_score)

: 

In [None]:
# Confusion matrices (rows: true class, columns: predicted class) for both splits.
svm_train_predictions = svm_classifier.predict(scaled_xtrain)
train_matrix = confusion_matrix(ytrain, svm_train_predictions)
test_matrix = confusion_matrix(ytest, ypredict)
print('Confusion matrix for train data for SVM :', '=========================================', sep='\n')
print(train_matrix, '\n')
print('Confusion matrix for test data for SVM :', '========================================', sep='\n')
print(test_matrix)

: 

In [None]:
# Per-class precision / recall / F1 for the SVM, training split then test split.
svm_train_predictions = svm_classifier.predict(scaled_xtrain)
print('Classification report for train data for SVM :', '==============================================', sep='\n')
print(classification_report(ytrain, svm_train_predictions))
print('Classification report for test data for SVM :', '=============================================', sep='\n')
print(classification_report(ytest, ypredict))

: 

In [None]:
# Specificity (%) = TN / (TN + FP), read from the first row of the test confusion matrix.
actual_negative_row = test_matrix[0]
spec = round(actual_negative_row[0] / (actual_negative_row[0] + actual_negative_row[1]) * 100, 1)
spec

: 

In [None]:
# ROC and precision-recall curves for the SVM from its decision scores,
# plus both areas under the curve as percentages rounded to one decimal.
fpr_svm, tpr_svm, thresh = roc_curve(ytest, prob_predict, pos_label=1)
precision_svm, recall_svm, _ = precision_recall_curve(ytest, prob_predict, pos_label=1)
auc_score = round(metrics.roc_auc_score(ytest, prob_predict) * 100, 1)
pr_auc_score = round(metrics.auc(recall_svm, precision_svm) * 100, 1)
print('ROC_AUC_Score for SVM : ', auc_score)
print('Precision-Recall Score for SVM : ', pr_auc_score)

: 

In [None]:
# Headline test-set metrics for the SVM, as percentages rounded to one decimal.
ac = round(metrics.accuracy_score(ytest, ypredict) * 100, 1)
f1 = round(metrics.f1_score(ytest, ypredict) * 100, 1)
re = round(metrics.recall_score(ytest, ypredict) * 100, 1)
pr = round(metrics.precision_score(ytest, ypredict) * 100, 1)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is
# the supported equivalent. Re-running this cell adds a duplicate row.
svm_row = pd.DataFrame([{'Classifier': 'SVM', 'Accuracy': ac, 'Recall/Sensitivity': re,
                         'Specificity': spec, 'Precision': pr, 'F1': f1,
                         'ROC-AUC-Score': auc_score, 'PR-AUC-Score': pr_auc_score}])
Model_Comparision_Report = pd.concat([Model_Comparision_Report, svm_row], ignore_index=True)

: 

Naive Bayes Modeling

Using original data as the algorithm is not distance based

In [None]:

# Gaussian Naive Bayes fitted on the unscaled training features.
nb_classifier = GaussianNB()
# fit() returns the estimator itself, so nb_model and nb_classifier are the same object.
nb_model = nb_classifier.fit(xtrain, ytrain)

: 

In [None]:
# Test-set predictions for Naive Bayes. The model was fitted on the unscaled
# xtrain, so both calls must receive the unscaled xtest; the original passed
# scaled_xtest to predict_proba, which made the ROC / PR scores inconsistent
# with the class predictions.
ypredict = nb_classifier.predict(xtest)
prob_predict = nb_classifier.predict_proba(xtest)

: 

In [None]:
# Mean accuracy (estimator.score) on each unscaled split for Naive Bayes.
nb_train_score = nb_model.score(xtrain, ytrain)
print('Training model score for Naive_Bayes : ', nb_train_score)
nb_test_score = nb_model.score(xtest, ytest)
print('Test model score for Naive_Bayes     : ', nb_test_score)

: 

In [None]:

# Confusion matrices (rows: true class, columns: predicted class) for both splits.
nb_train_predictions = nb_classifier.predict(xtrain)
train_matrix = confusion_matrix(ytrain, nb_train_predictions)
test_matrix = confusion_matrix(ytest, ypredict)
print('Confusion matrix for train data for naive bayes :', '=================================================', sep='\n')
print(train_matrix, '\n')
print('Confusion matrix for test data for naive bayes :', '================================================', sep='\n')
print(test_matrix)

: 

In [None]:
# Per-class precision / recall / F1 for Naive Bayes, training split then test split.
nb_train_predictions = nb_classifier.predict(xtrain)
print('Classification report for train data for naive bayes :', '======================================================', sep='\n')
print(classification_report(ytrain, nb_train_predictions))
print('Classification report for test data for naive bayes :', '=====================================================', sep='\n')
print(classification_report(ytest, ypredict))

: 

In [None]:
# Specificity (%) = TN / (TN + FP), read from the first row of the test confusion matrix.
true_negatives, false_positives = test_matrix[0, 0], test_matrix[0, 1]
spec = round(true_negatives / (true_negatives + false_positives) * 100, 1)
spec

: 

In [None]:
# ROC and precision-recall curves for Naive Bayes from the positive-class
# probabilities, plus both areas under the curve as rounded percentages.
nb_positive_scores = prob_predict[:, 1]
fpr_nb, tpr_nb, thresh = roc_curve(ytest, nb_positive_scores, pos_label=1)
precision_nb, recall_nb, _ = precision_recall_curve(ytest, nb_positive_scores, pos_label=1)
auc_score = round(metrics.roc_auc_score(ytest, nb_positive_scores) * 100, 1)
pr_auc_score = round(metrics.auc(recall_nb, precision_nb) * 100, 1)
print('ROC_AUC_Score for Naive Bayes : ', auc_score)
print('Precision-Recall Score for Naive Bayes : ', pr_auc_score)

: 

In [None]:
# Headline test-set metrics for Naive Bayes, as percentages rounded to one decimal.
ac = round(metrics.accuracy_score(ytest, ypredict) * 100, 1)
f1 = round(metrics.f1_score(ytest, ypredict) * 100, 1)
re = round(metrics.recall_score(ytest, ypredict) * 100, 1)
pr = round(metrics.precision_score(ytest, ypredict) * 100, 1)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is
# the supported equivalent. Re-running this cell adds a duplicate row.
nb_row = pd.DataFrame([{'Classifier': 'Naive Bayes', 'Accuracy': ac,
                        'Recall/Sensitivity': re, 'Specificity': spec,
                        'Precision': pr, 'F1': f1, 'ROC-AUC-Score': auc_score,
                        'PR-AUC-Score': pr_auc_score}])
Model_Comparision_Report = pd.concat([Model_Comparision_Report, nb_row], ignore_index=True)

: 

Random Forest Modeling

In [None]:
# Random forest with fixed hyper-parameters and seed, fitted on the standardized
# training features (the same inputs used for KNN / SVM in this notebook).
rf_classifier = RandomForestClassifier(n_estimators=150, max_depth=15, max_features='log2', random_state=95)
# fit() returns the estimator itself, so rf_model and rf_classifier are the same object.
rf_model = rf_classifier.fit(scaled_xtrain, ytrain)

: 

In [None]:
# Test-set class predictions and class-probability estimates for the random forest;
# these overwrite the ypredict / prob_predict left by the previous model.
ypredict = rf_classifier.predict(scaled_xtest)
prob_predict = rf_classifier.predict_proba(scaled_xtest)

: 

In [None]:
# Mean accuracy (estimator.score) on each split for the random forest.
rf_train_score = rf_model.score(scaled_xtrain, ytrain)
print('Training model score for Random_Forest : ', rf_train_score)
rf_test_score = rf_model.score(scaled_xtest, ytest)
print('Test model score for Random_Forest     : ', rf_test_score)

: 

In [None]:
# Confusion matrices (rows: true class, columns: predicted class) for both splits.
rf_train_predictions = rf_classifier.predict(scaled_xtrain)
train_matrix = confusion_matrix(ytrain, rf_train_predictions)
test_matrix = confusion_matrix(ytest, ypredict)
print('Confusion matrix for train data for random forest :', '===================================================', sep='\n')
print(train_matrix, '\n')
print('Confusion matrix for test data for random forest :', '==================================================', sep='\n')
print(test_matrix)

: 

In [None]:
# Per-class precision / recall / F1 for the random forest, training split then test split.
rf_train_predictions = rf_classifier.predict(scaled_xtrain)
print('Classification report for train data for random forest :', '========================================================', sep='\n')
print(classification_report(ytrain, rf_train_predictions))
print('Classification report for test data for random forest :', '=======================================================', sep='\n')
print(classification_report(ytest, ypredict))

: 

In [None]:
# Specificity (%) = TN / (TN + FP), read from the first row of the test confusion matrix.
true_negatives = test_matrix[0, 0]
actual_negatives = test_matrix[0, 0] + test_matrix[0, 1]
spec = round(true_negatives / actual_negatives * 100, 1)
spec

: 

In [None]:
# ROC and precision-recall curves for the random forest from the positive-class
# probabilities, plus both areas under the curve as rounded percentages.
rf_positive_scores = prob_predict[:, 1]
fpr_rf, tpr_rf, thresh = roc_curve(ytest, rf_positive_scores, pos_label=1)
precision_rf, recall_rf, _ = precision_recall_curve(ytest, rf_positive_scores, pos_label=1)
auc_score = round(metrics.roc_auc_score(ytest, rf_positive_scores) * 100, 1)
pr_auc_score = round(metrics.auc(recall_rf, precision_rf) * 100, 1)
print('ROC_AUC_Score for Random Forest : ', auc_score)
print('Precision-Recall Score for Random Forest : ', pr_auc_score)

: 

In [None]:
# Headline test-set metrics for the random forest, as percentages rounded to one decimal.
ac = round(metrics.accuracy_score(ytest, ypredict) * 100, 1)
f1 = round(metrics.f1_score(ytest, ypredict) * 100, 1)
re = round(metrics.recall_score(ytest, ypredict) * 100, 1)
pr = round(metrics.precision_score(ytest, ypredict) * 100, 1)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is
# the supported equivalent. Re-running this cell adds a duplicate row.
rf_row = pd.DataFrame([{'Classifier': 'Random Forest', 'Accuracy': ac,
                        'Recall/Sensitivity': re, 'Specificity': spec,
                        'Precision': pr, 'F1': f1, 'ROC-AUC-Score': auc_score,
                        'PR-AUC-Score': pr_auc_score}])
Model_Comparision_Report = pd.concat([Model_Comparision_Report, rf_row], ignore_index=True)

: 

Logistic Regression

In [None]:
# Logistic regression with scikit-learn defaults, fitted on the standardized training features.
lr_classifier = LogisticRegression()
# fit() returns the estimator itself, so lr_model and lr_classifier are the same object.
lr_model = lr_classifier.fit(scaled_xtrain, ytrain)

: 

In [None]:
# Test-set class predictions and class-probability estimates for logistic regression;
# these overwrite the ypredict / prob_predict left by the previous model.
ypredict = lr_classifier.predict(scaled_xtest)
prob_predict = lr_classifier.predict_proba(scaled_xtest)

: 

In [None]:
# Mean accuracy (estimator.score) on each split for logistic regression.
lr_train_score = lr_model.score(scaled_xtrain, ytrain)
print('Training model score for Logistic_Regression : ', lr_train_score)
lr_test_score = lr_model.score(scaled_xtest, ytest)
print('Test model score for Logistic_Regression     : ', lr_test_score)

: 

In [None]:
# Confusion matrices (rows: true class, columns: predicted class) for both splits.
lr_train_predictions = lr_classifier.predict(scaled_xtrain)
train_matrix = confusion_matrix(ytrain, lr_train_predictions)
test_matrix = confusion_matrix(ytest, ypredict)
print('Confusion matrix for train data for logistic regression :', '=========================================================', sep='\n')
print(train_matrix, '\n')
print('Confusion matrix for test data for logistic regression :', '========================================================', sep='\n')
print(test_matrix)

: 

In [None]:
# Per-class precision / recall / F1 for logistic regression, training split then test split.
lr_train_predictions = lr_classifier.predict(scaled_xtrain)
print('Classification report for train data for logistic regression :', '==============================================================', sep='\n')
print(classification_report(ytrain, lr_train_predictions))
print('Classification report for test data for logistic regression :', '=============================================================', sep='\n')
print(classification_report(ytest, ypredict))

: 

In [None]:

# Specificity (%) = TN / (TN + FP), read from the first row of the test confusion matrix.
true_negatives, false_positives = test_matrix[0, 0], test_matrix[0, 1]
spec = round(true_negatives / (true_negatives + false_positives) * 100, 1)
spec

: 

In [None]:
# ROC and precision-recall curves for logistic regression from the positive-class
# probabilities, plus both areas under the curve as rounded percentages.
lr_positive_scores = prob_predict[:, 1]
fpr_lr, tpr_lr, thresh = roc_curve(ytest, lr_positive_scores, pos_label=1)
precision_lr, recall_lr, _ = precision_recall_curve(ytest, lr_positive_scores, pos_label=1)
auc_score = round(metrics.roc_auc_score(ytest, lr_positive_scores) * 100, 1)
pr_auc_score = round(metrics.auc(recall_lr, precision_lr) * 100, 1)
print('ROC_AUC_Score for Logistic Regression : ', auc_score)
print('Precision-Recall Score for Logistic Regression : ', pr_auc_score)

: 

In [None]:
# Headline test-set metrics for logistic regression, as percentages rounded to one decimal.
ac = round(metrics.accuracy_score(ytest, ypredict) * 100, 1)
f1 = round(metrics.f1_score(ytest, ypredict) * 100, 1)
re = round(metrics.recall_score(ytest, ypredict) * 100, 1)
pr = round(metrics.precision_score(ytest, ypredict) * 100, 1)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is
# the supported equivalent. Re-running this cell adds a duplicate row.
lr_row = pd.DataFrame([{'Classifier': 'Logistic Regression', 'Accuracy': ac,
                        'Recall/Sensitivity': re, 'Specificity': spec,
                        'Precision': pr, 'F1': f1, 'ROC-AUC-Score': auc_score,
                        'PR-AUC-Score': pr_auc_score}])
Model_Comparision_Report = pd.concat([Model_Comparision_Report, lr_row], ignore_index=True)

: 

ROC - AUC Plot

In [None]:
# Overlay the ROC curve of every classifier, then the no-skill reference line.
plt.figure(figsize=(10, 5))
roc_curves = [
    (fpr_knn, tpr_knn, 'orange', 'KNN'),
    (fpr_svm, tpr_svm, 'green', 'SVM'),
    (fpr_nb, tpr_nb, 'crimson', 'NB'),
    (fpr_rf, tpr_rf, 'blue', 'RF'),
    (fpr_lr, tpr_lr, 'darkviolet', 'LR'),
]
for false_positive_rate, true_positive_rate, colour, classifier in roc_curves:
    plt.plot(false_positive_rate, true_positive_rate, color=colour, label=classifier, marker='.')
plt.plot(fpr_ns, tpr_ns, linestyle='--', color='dimgrey', label='No skill', marker='.')

plt.title('ROC-AUC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')

plt.show()

: 

Precision - Recall Curve

For an imbalanced data set, the Precision-Recall curve is also used to check the trade-off between Precision and Recall.

In [None]:
# Precision-recall curve of every classifier: recall on the x axis, precision
# on the y axis. The original passed (precision, recall), which contradicted
# the axis labels and the horizontal no-skill precision line drawn below.
plt.figure(figsize=(15, 10))
plt.plot(recall_knn, precision_knn, color='orange', label='KNN', marker='.')
plt.plot(recall_svm, precision_svm, color='green', label='SVM', marker='.')
plt.plot(recall_nb, precision_nb, color='crimson', label='NB', marker='.')
plt.plot(recall_rf, precision_rf, color='blue', label='RF', marker='.')
plt.plot(recall_lr, precision_lr, color='darkviolet', label='LR', marker='.')
# No-skill reference: constant precision equal to no_skill across all recall values.
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill', marker='.')

plt.title('Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')

plt.show()

: 

In [None]:
# Grouped bar chart: one group per metric, one bar per classifier.
plt.figure(figsize=(25, 10))
barWidth = 0.17

# (bar colour, legend label) for report rows 0..4, in that order.
classifier_styles = [
    ('lightcoral', 'KNN'),
    ('cornflowerblue', 'SVM'),
    ('bisque', 'NB'),
    ('mediumaquamarine', 'RF'),
    ('plum', 'LR'),
]
metric_labels = ['Accuracy', 'Precision', 'Specificity',
                 'Recall/Sensitivity', 'F1', 'ROC-AUC-Score', 'PR-AUC-Score']

# Start at integer positions and shift right by one bar width per classifier.
bar_positions = np.arange(len(metric_labels))
for report_row, (colour, classifier) in enumerate(classifier_styles):
    # Columns 1..7 of the report are the seven metric scores for this row.
    scores = Model_Comparision_Report.iloc[report_row, 1:8]
    plt.bar(bar_positions, scores, color=colour, width=barWidth, edgecolor='black', label=classifier)
    bar_positions = bar_positions + barWidth

plt.title('Metric comparision of different Classifiers', fontsize=17)
plt.xlabel('Metrics', fontsize=18, fontweight='bold')
plt.ylabel('Scores', fontsize=18, fontweight='bold')
# Centre each tick under the middle (third) bar of its group.
plt.xticks([tick + 2*barWidth for tick in range(len(metric_labels))], metric_labels, fontsize=15)

plt.yticks(fontsize=15)
plt.legend(title='Classifier', fontsize=10, title_fontsize=12, loc='best')
plt.show()

: 

Classification report

In [None]:
# Display the accumulated comparison table (one row per classifier added above).
Model_Comparision_Report

: 

From the above table it is clear that KNN is the best classification algorithm for the given dataset. It performs best on all the metrics, and its accuracy is the highest.

: 

: 

: 

: 

: 