In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
sns.set_style("whitegrid")

In [4]:
df = pd.read_csv('diabetes.csv')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# to manage the number of places after decimal
pd.set_option('float_format', '{:.2f}'.format)
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.85,120.89,69.11,20.54,79.8,31.99,0.47,33.24,0.35
std,3.37,31.97,19.36,15.95,115.24,7.88,0.33,11.76,0.48
min,0.0,0.0,0.0,0.0,0.0,0.0,0.08,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.37,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.63,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# changing the outcome column category to boolean values
diabetes_map = {False:0, True:1}
df["Outcome"] = df.Outcome.map(diabetes_map)


In [8]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.63,50,1
1,1,85,66,29,0,26.6,0.35,31,0
2,8,183,64,0,0,23.3,0.67,32,1
3,1,89,66,23,94,28.1,0.17,21,0
4,0,137,40,35,168,43.1,2.29,33,1


## Data Visualization

In [None]:
# Visualizing the distribution for every feature
sns.set(font_scale=1.5)
df.hist(edgecolor='black', figsize=(20,20), linewidth=1.2)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f2b9ed0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1fa15e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1fa53650>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1fa84e50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1fac5690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1faf9e90>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1fb39250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1fb6bed0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1fb76ad0>]],
      dtype=object)

In [None]:
plt.figure(figsize=(30,30))
sns.pairplot(df, hue='Outcome', height=3, diag_kind='hist')

In [None]:
# Lets check the count of the Diabetic 
sns.catplot('Outcome', data=df, kind='count')

In [None]:
plt.figure(figsize=(12, 10))
df.Age.hist(bins=80)
plt.xlabel('Age')
plt.ylabel('Number')

In [None]:
print('Average age of patients: {:.2f}'.format(df.Age.mean()))

## Relation b/w features as Confusion Matrix

In [None]:
plt.figure(figsize=(20, 20))
sns.heatmap(df.corr(), annot=True, cmap="RdYlGn", annot_kws={"size":15})

## 'Data Wrangling' Time 

In [None]:
# All the features
df.columns

In [None]:
len(df['Pregnancies'][df['Pregnancies'] == 0])

In [None]:
# Finding missing values as zeros in each feature
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 
                'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_cols:
    print("*"*40)
    print("{}'s missing zeros:----> {} \n".format(column, len(df[column][df[column] == 0])))

In [None]:
# The missing values as zeros can't be ignored and 
# even can't be removed from the features as we'll miss the important data.

# Lets try to replace the missing zeros with some normalized or balanced value such as mean

# Help: https://datascience.stackexchange.com/questions/51890/how-to-use-simpleimputer-class-to-replace-missing-values-with-mean-values-using

from sklearn.impute import SimpleImputer

impt = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df[feature_cols] = impt.fit_transform(df[feature_cols])

In [None]:
# Checking the missing values as zeros again

for column in feature_cols:
    print("*"*40)
    print("{}'s missing zeros:----> {} \n".format(column, len(df[column][df[column] == 0])))

In [None]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split

X = df[feature_cols].values
y = df.Outcome.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



## Now applying different ML algorithms 

In [None]:
# Importing the required libraries

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

---------------------------------------------------------

Some definations for references and understandings:--

### Confusion matrix:
In the field of machine learning and specifically the problem of statistical classification, 
a confusion matrix, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm, typically a supervised learning one (in unsupervised learning it is usually called a matching matrix). Each row of the matrix represents the instances in a predicted class while each column represents the instances in an actual class (or vice versa).
It is a special kind of contingency table, with two dimensions ("actual" and "predicted"), and identical sets of "classes" in both dimensions (each combination of dimension and class is a variable in the contingency table).


### Accuracy score:
The accuracy_score function computes the accuracy, either the fraction (default) or the count (normalize=False) of correct predictions.
In multilabel classification, the function returns the subset accuracy. If the entire set of predicted labels for a sample strictly match with the true set of labels, then the subset accuracy is 1.0; otherwise it is 0.0.


In [None]:
def display_score(model_name, y_train, y_train_pred, y_test, y_test_pred):
    '''
    Function to print the specified model's Accuracy score & Confusion Matrix for both Train and Test data 
    '''
    print(f"==============================={model_name}===========================================")
    print(f"========Train data============ :\n=>Accuracy Score {accuracy_score(y_train, y_train_pred):.4f}")
    print(f"=>Confusion Matrix :\n{confusion_matrix(y_train, y_train_pred)}")

    print(f"=========Test data============ :\n=>Accuracy Score {accuracy_score(y_test, y_test_pred):.4f}")
    print(f"=>Confusion Matrix :\n{confusion_matrix(y_test, y_test_pred)}")

### 1. Logistic Regression 

Some useful understanding:--

### numpy.ravel() :
numpy.ravel(array, order = ‘C’) : returns contiguous flattened array(1D array with all the input-array elements and with the same type as it). A copy is made only if needed.

  x = np.array([[1, 2, 3], [4, 5, 6]])    
  np.ravel(x)

Output:
array([1, 2, 3, 4, 5, 6])




In [None]:
log_reg = LogisticRegression(solver='liblinear')
score = cross_val_score(log_reg, X, y.ravel(), cv=10)
score.mean()

### 2. Decision Tree Classifier

In [1]:
params = {"criterion":("gini", "entropy"), 
          "splitter":("best", "random"), 
          "max_depth":(list(range(1, 20))), 
          "min_samples_split":(list(range(10,500,20))), 
          "min_samples_leaf":list(range(1, 20)), 
          "max_features":(None, "auto", "log2") 
          }

model = DecisionTreeClassifier(random_state=42)
grid_search_cv = GridSearchCV(model, params, scoring="accuracy", n_jobs=-1, verbose=1, cv=3, iid=True)

NameError: name 'DecisionTreeClassifier' is not defined