In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import parser
%matplotlib inline


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import GridSearchCV
import pickle
from lightgbm import LGBMClassifier
print('Library Loaded')

Library Loaded


In [4]:
import io
import requests
def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    class IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = None
        def readable(self):
            return True
        def readinto(self, b):
            try:
                l = len(b)
                chunk = self.leftover or next(iterable)
                output, self.leftover = chunk[:l], chunk[l:] # chunk, up to limit
                b[:len(output)] = output 
                return len(output)
            except StopIteration:
                return 0
    return io.BufferedReader(IterStream(), buffer_size=buffer_size)


In [5]:
url = "https://raw.githubusercontent.com/cosmicudemy/ML_Casestudies/master/diabetes/diabetes.csv"

In [38]:
response = requests.get(url, stream=True)
next(response.iter_content())
df = pd.read_csv(iterable_to_stream(response.iter_content()), sep=',')
df.rename(columns={"regnancies":"Pregnancies"}, inplace=True)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [44]:
# for sagemaker use: move outcome column to position 0
def move_column_inplace(df, col, pos):
    col = df.pop(col)
    df.insert(pos, col.name, col)
    return df

df2 = move_column_inplace(df, "Outcome", 0)
df2.to_csv("./diabetes.csv", index=False, header=True)

RangeIndex(start=0, stop=768, step=1)

In [7]:
X = df.drop('Outcome',axis=1) # predictor feature coloumns
y = df.Outcome

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size = 0.20, random_state = 11)

print('Training Set :',len(X_train))
print('Test Set :',len(X_test))
print('Training labels :',len(y_train))
print('Test Labels :',len(y_test))

Training Set : 614
Test Set : 154
Training labels : 614
Test Labels : 154


In [20]:
def FitModel(X_train,y_train,X_test,y_test,algo_name,algorithm,gridSearchParams,cv):
    np.random.seed(10)
   
#     x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#     train = pd.concat([y_train, x_train], axis=1)
#     train.to_csv("./train.csv", index=False, header=False)
#     y_train.to_csv("./Y-train.csv", index=False, header=False)

#     test = pd.concat([y_test, x_test], axis=1)
#     test.to_csv("./test.csv", index=False, header=False)
#     y_test.to_csv("./Y-test.csv", index=False, header=False)

    
    grid = GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv, scoring='accuracy', verbose=1, n_jobs=-1)
    
    
    grid_result = grid.fit(X_train, y_train)
    best_params = grid_result.best_params_
    pred = grid_result.predict(X_test)
    cm = confusion_matrix(y_test, pred)
   # metrics =grid_result.gr
    print(pred)
    pickle.dump(grid_result,open(algo_name+'.pkl','wb'))
   
    print('Best Params :',best_params)
    print('Classification Report :',classification_report(y_test,pred))
    print('Accuracy Score : ' + str(accuracy_score(y_test,pred)))
    print('Confusion Matrix : \n', cm)

In [21]:
# SMOTE works by resampling x and y with replacement
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=867)
X_res_OS, Y_res_OS = sm.fit_resample(X, y)

In [22]:
X_train , X_test , y_train , y_test = train_test_split(X_res_OS, Y_res_OS, test_size = 0.20, random_state = 11)

print('Training Set :',len(X_train))
print('Test Set :',len(X_test))
print('Training labels :',len(y_train))
print('Test Labels :',len(y_test))

Training Set : 800
Test Set : 200
Training labels : 800
Test Labels : 200


In [23]:
from sklearn.impute import SimpleImputer
fill = SimpleImputer(missing_values = 0, strategy="mean")

X_train = fill.fit_transform(X_train)
X_test = fill.fit_transform(X_test)

### Logit regression after oversampling

In [24]:
# Logit
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)

In [25]:
FitModel(X_train,y_train,X_test,y_test,'LR_OS',LogisticRegression(),hyperparameters,cv=5)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1
 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 1 1 1
 0 0 0 0 1 1 0 0 1 1 1 1 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0
 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0 1
 0 0 1 0 1 1 0 1 1 0 1 0 1 1 0]
Best Params : {'C': 10000.0, 'penalty': 'l2'}
Classification Report :               precision    recall  f1-score   support

           0       0.74      0.72      0.73       109
           1       0.68      0.69      0.68        91

    accuracy                           0.71       200
   macro avg       0.71      0.71      0.71       200
weighted avg       0.71      0.71      0.71       200

Accuracy Score : 0.71
Confusion Matrix : 
 [[79 30]
 [28 63]]


 0.73125     nan 0.725       nan 0.72875     nan 0.73125     nan 0.73125
     nan 0.73375]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Logit
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
FitModel(X_train,y_train,X_test,y_test,'LogisticRegression',LogisticRegression(),hyperparameters,cv=5)

In [27]:
# To fit XGboost: grid search over this dict of param values.
param = {
    "n_estimators": [100, 600, 1200],  # why should this cross 2 orders of magnitude?
    "max_depth": [2, 3, 4, 5],  # depth of sequential classifiers
    "learning_rate": np.arange(0.01, 0.1, 0.01).tolist(),  # controls the greediness
}
FitModel(X_train,y_train,X_test,y_test,'XGBoost_OS',XGBClassifier(),param,cv=5)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




[0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1
 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1
 0 1 1 1 0 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0
 0 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 1 0 0
 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 1 1 1 0 0 1
 0 1 0 0 1 1 0 1 1 0 0 0 1 0 0]
Best Params : {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1200}
Classification Report :               precision    recall  f1-score   support

           0       0.80      0.79      0.80       109
           1       0.75      0.77      0.76        91

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.78      0.78      0.78       200

Accuracy Score : 0.78
Confusion Matrix : 
 [[86 23]
 [21 70]]


In [None]:
# To fit random forest:
param = {
    "n_estimators": [100, 500, 1000, 1500, 2000]
}
FitModel(X_train,y_train,X_test,y_test,'RandomForest',RandomForestClassifier(),param,cv=5)