# Setup

### Basic

In [1]:
import pandas as pd
import numpy as np

### Plots

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go

### Data Preprocessing

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

### Machine Learning 

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

## Data & EDA

In [5]:
df = pd.read_csv("../clean_out.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6.0,148.0,72.0,35.0,209.76,33.6,0.627,50.0,1.0
1,1,1.0,85.0,66.0,29.0,72.4,26.6,0.351,31.0,0.0
2,2,8.0,183.0,64.0,27.96,193.04,23.3,0.672,32.0,1.0
3,3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,4,5.0,116.0,74.0,24.64,96.6,25.6,0.201,30.0,0.0


In [6]:
df.shape

(753, 10)

In [10]:
df = df.drop(['Unnamed: 0'], axis = 'columns')

In [11]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.00,209.76,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.00,72.40,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,27.96,193.04,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.00,94.00,28.1,0.167,21.0,0.0
4,5.0,116.0,74.0,24.64,96.60,25.6,0.201,30.0,0.0
...,...,...,...,...,...,...,...,...,...
748,10.0,101.0,76.0,48.00,180.00,32.9,0.171,63.0,0.0
749,2.0,122.0,70.0,27.00,143.68,36.8,0.340,27.0,0.0
750,5.0,121.0,72.0,23.00,112.00,26.2,0.245,30.0,0.0
751,1.0,126.0,60.0,26.88,176.64,30.1,0.349,47.0,1.0


# Machine Learning

In [12]:
x = df.drop(['Outcome'], axis=1)
y = df['Outcome']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y, shuffle = True)

# Balancing

In [14]:
oversample = SMOTE(random_state = 42, k_neighbors = 10)

x_smote, y_smote = oversample.fit_resample(x_train, y_train)
x_train, y_train = x_smote, y_smote

In [15]:
y_smote.value_counts()

1.0    397
0.0    397
Name: Outcome, dtype: int64

## Classifiers

In [16]:
# models 
classifiers = [
    DecisionTreeClassifier(max_depth = 3, random_state = 42),
    AdaBoostClassifier(DecisionTreeClassifier(random_state = 42)),
    RandomForestClassifier(max_depth = 5, random_state = 42),
    GradientBoostingClassifier(random_state = 42),
    LogisticRegression(random_state = 42, solver='lbfgs', max_iter=10000),
    SVC(random_state = 42, probability = True),        
    KNeighborsClassifier(n_neighbors = 5, algorithm = "kd_tree"),
    GaussianNB(),
    MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state = 1),
    BaggingClassifier(SVC(random_state = 42, probability = True), max_samples = 0.5, max_features = 0.7),
    ExtraTreesClassifier(n_estimators = 10, max_depth = None, min_samples_split = 2, random_state = 0),
    XGBClassifier(learning_rate= 0.01,max_depth = 3,n_estimators = 1000)
]

In [17]:
def make_classification(classifiers, x_train, x_test, y_train, y_test) :
    
    # metrics
    acc, f1, AUC, recall, cross_val, prec = [ ], [ ], [ ], [ ], [ ], [ ]
    models = [ ]
    
    for classifier in classifiers : 
        clf = classifier
        clf.fit(x_train, y_train)
        
        y_pred = clf.predict(x_test)
        y_prob = clf.predict_proba(x_test)
        
        acc.append(((accuracy_score(y_test,y_pred))) * 100)
        cross_val.append(sum(cross_val_score(clf, x_train, y_train, cv = 10, scoring = "accuracy"))/10)
        f1.append(((f1_score(y_test,y_pred))) * 100)
        AUC.append(((roc_auc_score(y_test,y_prob[:, 1]))) * 100)
        recall.append(((recall_score(y_test,y_pred))) * 100)
        prec.append(((precision_score(y_test,y_pred))) * 100)
        models.append(clf.__class__.__name__)
        
    res = pd.DataFrame({
        "Accuracy" : acc,
        "Cross Val" : cross_val,
        "F1" : f1,
        "ROC" : AUC,
        "Recall" : recall,
        "Precision" : prec,
        "ML Models" : models,
    })
    
    res = (res.sort_values(by = ['ROC','F1'], ascending = False).reset_index(drop =  True))
    return res

In [18]:
res = make_classification(classifiers, x_train, x_test, y_train, y_test)
res


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number 































































Unnamed: 0,Accuracy,Cross Val,F1,ROC,Recall,Precision,ML Models
0,85.430464,0.866661,79.62963,88.228438,82.692308,76.785714,GradientBoostingClassifier
1,86.092715,0.860348,80.373832,88.092463,82.692308,78.181818,XGBClassifier
2,82.781457,0.859003,77.192982,87.645688,84.615385,70.967742,RandomForestClassifier
3,80.794702,0.828766,75.213675,87.247475,84.615385,67.692308,KNeighborsClassifier
4,85.430464,0.829953,79.245283,86.159674,80.769231,77.777778,DecisionTreeClassifier
5,76.15894,0.799684,71.428571,85.324398,86.538462,60.810811,SVC
6,75.496689,0.803513,69.421488,83.974359,80.769231,60.869565,BaggingClassifier
7,80.13245,0.877991,72.727273,82.614608,76.923077,68.965517,ExtraTreesClassifier
8,77.483444,0.768275,71.666667,82.566045,82.692308,63.235294,MLPClassifier
9,75.496689,0.772009,67.826087,81.177156,75.0,61.904762,LogisticRegression


In [19]:
def cross_val(classifiers, x_train, y_train) :
    cv_train, cv_test, diff, models =  [], [], [], []
    
    for classifier in classifiers : 
        clf = classifier
        #clf.fit(x_train, y_train)
        
        cv = cross_validate(clf, x_train, y_train, cv = 5, scoring = "accuracy", return_train_score = True)
        
        cv_train.append(cv['train_score'].mean() * 100)
        cv_test.append(cv['test_score'].mean() * 100)
        diff.append((cv['train_score'].mean() - cv['test_score'].mean()) * 100)
        models.append(clf.__class__.__name__)
        
        
    res = pd.DataFrame({
        "CV Train" : cv_train,
        "CV Test" : cv_test,
        "Diff" : diff,
        "ML Model" : models
    })
    
    res = (res.sort_values(by = ['CV Test', 'CV Train'], ascending = False).reset_index(drop =  True))
    
    return res

# Fine Tuning

### 1) GradientBoostingClassifier

In [21]:
gb_clf = GradientBoostingClassifier( 
    random_state = 42, 
    min_samples_split = 100,
    min_samples_leaf = 20,
    max_depth = 2,
    max_features = 3,
    learning_rate = 0.05,
    subsample = 0.65,
)

In [22]:
cross_val([gb_clf], x_train, y_train)

Unnamed: 0,CV Train,CV Test,Diff,ML Model
0,87.657257,84.881777,2.775481,GradientBoostingClassifier


### 2) ExtraTreesClassifier

In [23]:
ex_clf = ExtraTreesClassifier(
    random_state = 42,
    max_depth = 6,
    min_samples_split = 20,
    max_features = "log2",
)

In [24]:
cross_val([ex_clf], x_train, y_train)

Unnamed: 0,CV Train,CV Test,Diff,ML Model
0,87.657307,84.256827,3.40048,ExtraTreesClassifier


### 3) SVC

In [25]:
svm_clf = SVC(
    random_state = 42,
    probability = True,
    C = 1,
    kernel = "rbf",
    gamma = "scale",
)

In [26]:
cross_val([svm_clf], x_train, y_train)

Unnamed: 0,CV Train,CV Test,Diff,ML Model
0,80.698807,80.220524,0.478283,SVC


### 4) Ensemble

In [27]:
voting_clf = VotingClassifier(
    estimators = [('gb', gb_clf), ('ex', ex_clf), ('svm', svm_clf)],
    voting = 'soft'
)

In [28]:
cross_val([voting_clf], x_train, y_train)

Unnamed: 0,CV Train,CV Test,Diff,ML Model
0,86.334918,83.873895,2.461023,VotingClassifier


In [29]:
make_classification([gb_clf, voting_clf], x_train, x_test, y_train, y_test)

Unnamed: 0,Accuracy,Cross Val,F1,ROC,Recall,Precision,ML Models
0,79.470199,0.841345,72.072072,87.043512,76.923077,67.79661,VotingClassifier
1,81.456954,0.850206,74.074074,86.907537,76.923077,71.428571,GradientBoostingClassifier
