In [None]:
import pandas as pd

import seaborn as sns
from matplotlib.pyplot import figure
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

### Reading Data

In [None]:
# Relative path of the raw CSV file that the notebook reads.
path_to_file = 'diabetes_data_upload.csv'
# Load the whole file into the DataFrame that every later cell works on.
df = pd.read_csv(filepath_or_buffer=path_to_file)

###### Checking number of Null values

In [None]:
# Per-column count of missing entries (isnull is the pandas alias of isna).
df.isnull().sum()

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

###### Renaming output column from "class" to "diabetic"

In [None]:
# Give the outcome column a descriptive name; every other column keeps its label.
df = df.rename(columns={'class': 'diabetic'})

###### Columns other than "Age" are categorical variables

In [None]:
# Every column label except 'Age', in original column order, as a plain list.
categorical_variables = df.columns[df.columns != 'Age'].tolist()

###### Creating dummy encoding for all the categorical variables

In [None]:
# One-hot encode every categorical column and append the resulting indicator
# columns (labelled "<column>_<value>") to the right of the existing frame.
# dtype=int pins the indicators to integer 0/1: without it the dtype follows the
# installed pandas version (uint8 in older releases, bool from 2.0 onwards), so
# the 0/1 encoding this notebook describes would not be guaranteed.
df = pd.concat([df, pd.get_dummies(df[categorical_variables], dtype=int)], axis=1)

###### Dropping original categorical columns

In [None]:
# Remove the original categorical columns, leaving only their indicator columns.
# errors='ignore' skips labels that are no longer present instead of raising.
df = df.drop(columns=categorical_variables, errors='ignore')

###### Removing extra columns as follows:
1. Male Gender column (Gender 1 if female, 0 if male)
2. Columns with "no" in their name
3. Diabetic column having "negative" in name

In [None]:
# Indicator suffixes to keep; matching is case-insensitive.
kept_suffixes = ['yes', 'female', 'positive']

# Collect every column whose text after its first underscore is not a kept
# suffix. Columns without an underscore are never selected for removal.
redundant_columns = []
for name in df.columns:
    if '_' not in name:
        continue
    suffix = name.split('_', 1)[1].lower()
    if suffix not in kept_suffixes:
        redundant_columns.append(name)

df = df.drop(columns=redundant_columns)

###### Renaming new column names to original column names

In [None]:
# Keep only the text before the first underscore of each label; labels without
# an underscore are returned unchanged by partition().
df.columns = [name.partition('_')[0] for name in df.columns]

In [None]:
# Preview the first five rows after encoding and renaming.
df.head(n=5)

###### Normalizing age

In [None]:
# Scale Age down by a fixed factor of 100.
# NOTE(review): re-running this cell divides again; run it once per kernel session.
df['Age'] = df['Age'].div(100)

In [None]:
# Create the figure and draw the heatmap in the SAME cell. With the inline
# backend, a figure opened in one cell is rendered and closed when that cell
# finishes, so a figure created in an earlier cell would be shown empty and the
# heatmap would fall back to a new default-sized figure.
fig = figure(num=None, figsize=(15, 12), dpi=80, facecolor='w', edgecolor='k')
# Pairwise correlations between the feature columns only (outcome excluded),
# drawn explicitly onto the axes of the figure created above.
sns.heatmap(df[[col for col in df.columns if col != 'diabetic']].corr(), ax=fig.gca())

###### Checking for correlations greater than 0.7

In [None]:
# Compute the correlation matrix once and flatten it into a Series indexed by
# (row label, column label) pairs.
corr_pairs = df.corr().stack()
# Identify each column's correlation with itself by comparing the two index
# levels. Filtering on `value != 1` instead would rely on exact float equality
# and would also hide perfectly correlated pairs of *different* columns.
is_self_pair = corr_pairs.index.get_level_values(0) == corr_pairs.index.get_level_values(1)
corr_pairs[(corr_pairs > 0.7) & ~is_self_pair]

###### Checking for correlation less than -0.5

In [None]:
# Flatten the correlation matrix to (row, column) pairs and keep only the pairs
# whose correlation is below -0.5.
df.corr().stack().loc[lambda pair_corr: pair_corr < -0.5]

###### Checking number of instances of each outcome

In [None]:
# Number of rows for each value of the outcome column.
df.loc[:, 'diabetic'].value_counts()

###### Creating a function to print Confusion Matrix, Precision, Recall and F1-score

In [None]:
def print_results(method, true_value, predicted_value):
    """Print an evaluation report for one classifier.

    The report contains, in order: a header naming the algorithm, the
    confusion matrix, then precision, recall and F1-score, each rounded to
    four decimal places.

    Parameters
    ----------
    method : label shown in the report header.
    true_value : ground-truth labels, passed unchanged to the metric functions.
    predicted_value : predicted labels, passed unchanged to the metric functions.

    Returns
    -------
    None. The report is written to standard output.
    """
    print(f"Classification Algorithm: {method}", end = "\n\n")

    print('Confusion Matrix')
    print(confusion_matrix(true_value, predicted_value), end = "\n\n")

    # Heading / metric-function pairs, in the order they are reported.
    scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    final_position = len(scorers) - 1
    for position, (heading, scorer) in enumerate(scorers):
        print(heading)
        # A blank line separates the sections; the last value ends with a single newline.
        terminator = "\n" if position == final_position else "\n\n"
        print(round(scorer(true_value, predicted_value), 4), end = terminator)

###### Dividing data into train and test using stratified sampling

In [None]:
# Features: every column except the outcome. Target: the outcome column.
X = df.loc[:, df.columns != 'diabetic']
y = df['diabetic']

# Hold out 30% of the rows for testing. stratify=y splits on the outcome labels
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

### Logistic Regression

In [None]:
# Fit logistic regression on the training split (fit() returns the estimator),
# predict the held-out rows and print the evaluation report.
lrc = LogisticRegression(random_state=0).fit(X_train, y_train)
lr_predictions = lrc.predict(X_test)
print_results(
    method="Logistic Regression",
    true_value=y_test,
    predicted_value=lr_predictions,
)

### Support Vector Machine Classification

In [None]:
# Fit a support vector classifier with library-default settings on the training
# split, predict the held-out rows and print the evaluation report.
svmc = svm.SVC().fit(X_train, y_train)
svm_predictions = svmc.predict(X_test)
print_results(
    method="Support Vector Machine",
    true_value=y_test,
    predicted_value=svm_predictions,
)

### K - Nearest Neighbors

In [None]:
# Fit a 5-nearest-neighbours classifier on the training split, predict the
# held-out rows and print the evaluation report.
knnc = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_predictions = knnc.predict(X_test)
print_results(
    method="KNN",
    true_value=y_test,
    predicted_value=knn_predictions,
)

### Decision Tree Classification

In [None]:
# Fit a decision tree (seeded for repeatability) on the training split, predict
# the held-out rows and print the evaluation report.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dtc_predictions = dtc.predict(X_test)
print_results(
    method="Decision Tree",
    true_value=y_test,
    predicted_value=dtc_predictions,
)