## Content
The dataset consists of several medical predictor variables and one target variable, **Outcome**. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

## Context
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## Inspiration
Can you build a machine learning model to accurately predict whether or not the patients in the dataset have diabetes?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the diabetes dataset from a CSV file in the working directory
data = pd.read_csv('diabetes.csv')
# Show the loaded DataFrame as the cell output
data

In [None]:
# Column dtypes and non-null counts. info() prints its report and returns None,
# so it is called on its own rather than inside a displayed tuple (which would
# show a stray `None` next to the missing-value counts).
data.info()
# Number of missing values per column (displayed as the cell output)
data.isna().sum()

In [None]:
# Bar chart of how many rows fall into each Outcome class
data["Outcome"].value_counts().plot.bar(color=['salmon', 'lightblue'])

## Distribution of the Age column

In [None]:
# Histogram of the patients' ages. The raw Age column is plotted; passing
# Age.value_counts() would instead histogram how often each count occurs.
fig, ax = plt.subplots(figsize=(20, 5))
ax.hist(data.Age, bins=20)
ax.set(title="Age distribution", xlabel="Age", ylabel="Number of patients");

In [None]:
# Count the rows for every (Age, Outcome) pair and draw them as grouped bars
age_outcome_counts = pd.crosstab(index=data.Age, columns=data.Outcome)
age_outcome_counts.plot.bar(figsize=(20, 10))
plt.title('Diabetes frequency by Age')
plt.legend(['No Diabetes', 'Diabetic']);

In [None]:
# Make a correlation matrix: pairwise correlation between the DataFrame's columns
data.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap so it is easier to read
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap="YlGnBu",
    ax=ax,
)
ax.set(title='Correlation between different labels');

## Choosing a model

Try different Machine Learning models
* RandomForestClassifier
* LogisticRegression

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score


# Candidate classifiers to compare, keyed by the label used in the score table/plot.
# - LinearRegression is intentionally not included: it is a regressor, so its
#   .score() is R^2 rather than accuracy and cannot be compared with the others.
# - SVC() uses its default (non-linear) kernel, so it is labelled "SVC".
# - max_iter is raised for LogisticRegression because the default solver may not
#   converge within the default iteration budget on unscaled features.
models = {
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC": SVC(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "KNearestNeighbors": KNeighborsClassifier(),
}

In [None]:
# Summarise predictions with Accuracy, Precision, Recall and F1
def eval_prediction(y_test, y_preds):
    """
        Returns a dict with the accuracy, precision, recall and F1 of the
        predictions, each rounded to two decimal places.

        y_test: True labels
        y_preds: Predicted labels
    """
    metric_functions = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
    }
    return {label: round(metric(y_test, y_preds), 2)
            for label, metric in metric_functions.items()}

# Fit every model and record its test score
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
        Fits each model on the training split and scores it on the test split.

        models: A dict mapping a display name to an estimator with fit/score
        X_train: Training data (no labels)
        X_test: Testing data (no labels)
        y_train: Training labels
        y_test: Testing labels

        Returns a dict mapping each name to its score as a percentage,
        rounded to two decimal places.
    """
    results = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        percentage = estimator.score(X_test, y_test) * 100
        results[label] = round(percentage, 2)
    return results

In [None]:
# Separate the predictor columns from the target column
X = data.drop(columns='Outcome')
y = data.Outcome

# Seed NumPy's global RNG before splitting, then hold out 20% of the rows for testing
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit every candidate model and collect its test-set score (in percent)
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

# Model Comparison

In [None]:
# One-row table of scores (one column per model), transposed so each model gets a bar
model_comparison = pd.DataFrame(data=model_scores, index=['accuracy'])
model_comparison.transpose().plot(kind="bar");

**Note**: A model's accuracy with default settings is not the whole story, so we are going to tune and evaluate it further to try to improve it.

**What I'm going to look into:**
1. Hyperparameter Tuning
2. Feature Importance
3. Confusion matrix
4. Cross validation
5. Precision
6. Recall
7. F1 score
8. Classification Report
9. ROC curve
10. AUC score

### Hyperparameter Tuning (By Hand)

In [None]:
# Tune KNN by hand: try every n_neighbors from 1 to 20
neighbors = range(1, 21)

# A single estimator is re-parameterised and re-fitted on each iteration
knn = KNeighborsClassifier()

# Scores collected per n_neighbors value, in the same order as `neighbors`
train_scores, test_scores = [], []

for k in neighbors:
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Record how the fitted model does on both splits
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# Plot train and test scores against the number of neighbours
for curve, legend_label in ((train_scores, "Train Scores"), (test_scores, "Test Scores")):
    plt.plot(neighbors, curve, label=legend_label)
plt.xlabel("Number of Neighbors")
plt.ylabel("Model Score")
plt.legend()
plt.show()

# Report the best test score reached over all tried neighbour counts
best_test_pct = max(test_scores) * 100
print(f"The maximum Test score is {best_test_pct:.2f}%")

## Hyperparameter Tuning with RandomizedSearchCV

I'm going to tune:
* LogisticRegression 
* RandomForestClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for LogisticRegression: 50 log-spaced C values, liblinear solver only
logistic_grid = dict(
    C=np.logspace(-4, 4, num=50),
    solver=['liblinear'],
)

# Search space for RandomForestClassifier
rs_grid = dict(
    n_estimators=np.arange(10, 500, 50),
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 40, 2),
    min_samples_leaf=np.arange(1, 25, 2),
)

# Randomized search over the logistic-regression space:
# 20 sampled candidates, each evaluated with 5-fold cross-validation
logistic_rs_model = RandomizedSearchCV(
    LogisticRegression(),
    logistic_grid,
    n_iter=20,
    cv=5,
    verbose=2,
)
logistic_rs_model.fit(X_train, y_train)

# Show the best hyperparameter combination that was found
logistic_rs_model.best_params_

In [None]:
# Score the randomized-search logistic regression on the held-out test split
logistic_rs_model.score(X_test, y_test)

### Hyperparameter Tuning for RandomForestClassifier

In [None]:
# Randomized search over the random-forest space:
# 20 sampled candidates, each evaluated with 5-fold cross-validation
rf_rs_model = RandomizedSearchCV(
    RandomForestClassifier(),
    rs_grid,
    n_iter=20,
    cv=5,
    verbose=2,
)
rf_rs_model.fit(X_train, y_train);

In [None]:
# Score the randomized-search RandomForestClassifier on the held-out test split
rf_rs_model.score(X_test, y_test)

## Hyperparameter Tuning using the GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid for LogisticRegression: 50 log-spaced C values, liblinear solver only
logistic_grid = dict(
    C=np.logspace(-4, 4, num=50),
    solver=['liblinear'],
)

# Smaller grid for RandomForestClassifier (every combination is tried by a grid search)
rs_grid = dict(
    n_estimators=np.arange(10, 200, 50),
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 10, 2),
    min_samples_leaf=np.arange(1, 10, 2),
)

# Exhaustive search over the logistic-regression grid with 5-fold cross-validation
logistic_gs_model = GridSearchCV(
    LogisticRegression(),
    logistic_grid,
    cv=5,
    verbose=2,
)
logistic_gs_model.fit(X_train, y_train);

In [None]:
# Score the grid-search logistic regression on the held-out test split
logistic_gs_model.score(X_test,y_test)

In [None]:
# Hyperparameter tuning of RandomForestClassifier using GridSearchCV:
# every combination in rs_grid is evaluated with 5-fold cross-validation
rf_gs_model = GridSearchCV(
    RandomForestClassifier(),
    rs_grid,
    cv=5,
    verbose=2,
)
rf_gs_model.fit(X_train, y_train);

In [None]:
# Score the grid-search RandomForestClassifier on the held-out test split
rf_gs_model.score(X_test, y_test)

## Evaluating Our Tuned model Beyond Accuracy
* ROC curve
* Confusion matrix
* Classification Report
* Precision
* Recall
* F1 Score

In [None]:
# Make predictions with the tuned logistic-regression model
logistic_preds = logistic_rs_model.predict(X_test)

# Plot the ROC curve. `plot_roc_curve` no longer exists in current scikit-learn
# releases; `RocCurveDisplay.from_estimator` is its replacement. The old helper
# is kept as a fallback so the cell still runs on older installations.
try:
    from sklearn.metrics import RocCurveDisplay
    RocCurveDisplay.from_estimator(logistic_rs_model, X_test, y_test)
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve
    plot_roc_curve(logistic_rs_model, X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the tuned logistic-regression predictions against the true test labels
confusion_matrix(y_test, logistic_preds)

In [None]:
# Visualize confusion matrix using seaborn
sns.set(font_scale=0.5)
def plot_conf_matrix(y_test, y_preds):
    """
        Plots a confusion matrix using seaborn's heatmap.

        y_test: True labels
        y_preds: Predicted labels
    """
    fig, ax = plt.subplots(figsize=(10,5))
    # fmt="d" prints the cell counts as plain integers; without it seaborn's
    # default annotation format can show larger counts in scientific notation.
    # ax=ax draws on the figure created above instead of relying on the
    # implicit "current axes".
    sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt="d", cbar=False, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")

plot_conf_matrix(y_test, logistic_preds)

In [None]:
# Classification report
# (module name fixed: "sklean" does not exist and raised an ImportError)
from sklearn.metrics import classification_report
# The report is a multi-line string, so print it instead of displaying its repr
print(classification_report(y_test, logistic_preds))

In [None]:
# Cross validation score
def cross_val(model, scoring_param):
    """
        Returns the mean 5-fold cross validated score of a model, according to
        the scoring parameter passed, as a formatted string.

        model: working estimator
        scoring_param: Cross validation scoring parameter

        Uses the notebook-level X and y. Errors raised by cross_val_score
        (for example for an invalid scoring parameter) propagate to the caller.
    """
    scores = cross_val_score(model, X, y, cv=5, scoring=scoring_param)
    # cross_val_score returns one score per fold. They are averaged before
    # formatting because an array cannot be formatted with ":.2f" (the original
    # code always failed here and returned None).
    return f"The {scoring_param} score is: {scores.mean()*100:.2f}"

In [None]:
from sklearn.metrics import roc_curve
# NOTE(review): `clf` is not defined in any cell visible here — presumably a fitted
# classifier created elsewhere; confirm before running this cell.
y_probs = clf.predict_proba(X_test)
# y_probs
from sklearn.metrics import confusion_matrix
y_preds = clf.predict(X_test)
# This result is not assigned; in a notebook only a cell's last expression is displayed.
confusion_matrix(y_test, y_preds)
# roc_curve(y_test, y_preds)
# Actual-vs-predicted label counts as a labelled table (the cell's displayed output)
pd.crosstab(y_test, y_preds, rownames=['Actual label'], colnames=['Predicted label'])

In [None]:
# Hyperparameter tuning using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter values to sample from
grid ={
    'n_estimators': [100,200,500,1200],
    'max_depth': [2,3],
    # NOTE(review): 'auto' is not accepted as a max_features value by recent
    # scikit-learn forest estimators — confirm the installed version and clf's type.
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf':[2,5,7],
    'min_samples_split': [2,4]
}

# Sample 5 combinations from `grid`, each evaluated with 5-fold cross-validation.
# NOTE(review): `clf` is not defined in the visible cells — confirm where it comes from.
rs = RandomizedSearchCV(estimator=clf, param_distributions=grid, n_iter=5, cv=5, verbose=2)
rs.fit(X_train, y_train)

In [None]:
# Predictions are computed here but not stored or displayed
rs.predict(X_test)
# Test-split scores, as percentages, of the tuned search and of `clf`
rs.score(X_test,y_test)*100, clf.score(X_test, y_test)*100