In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold


In [3]:
df_init = pd.read_csv("data/diabetes.csv")

### Exploratory Data Analysis (EDA)

In [4]:
print("len(df_init) = ", len(df_init))

len(df_init) =  768


#### Check missing values

In [6]:
print(df_init.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
print("Outcome unique values : ", df_init.Outcome.unique())

Outcome unique values :  [1 0]


#### Check global diabetes rate

In [9]:
df_init.Outcome.value_counts(normalize=True)

Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64

#### Check data types

In [10]:
print(df_init.dtypes)

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


There are no categorical values; every column is numeric.

In [20]:
# Create a summary DataFrame with max, min, and mean
summary = pd.DataFrame({
    'Min': df_init.min(),
    'Max': df_init.max(),
    'Mean': df_init.mean(),
    'Median': df_init.median()
})

print("Min, Max, Mean and Median values for each column:")
print(summary)

Min, Max, Mean and Median values for each column:
                             Min     Max        Mean    Median
Pregnancies                0.000   17.00    3.845052    3.0000
Glucose                    0.000  199.00  120.894531  117.0000
BloodPressure              0.000  122.00   69.105469   72.0000
SkinThickness              0.000   99.00   20.536458   23.0000
Insulin                    0.000  846.00   79.799479   30.5000
BMI                        0.000   67.10   31.992578   32.0000
DiabetesPedigreeFunction   0.078    2.42    0.471876    0.3725
Age                       21.000   81.00   33.240885   29.0000
Outcome                    0.000    1.00    0.348958    0.0000


### Feature importance

In [32]:
df_init[df_init["Pregnancies"] > 3].Outcome.mean()

0.45058139534883723

The number of pregnancies is important

In [13]:
df_init[df_init["Age"] > 40].Outcome.mean()

0.5257731958762887

Age is highly important

In [21]:
df_init[df_init["Glucose"] > 117].Outcome.mean()

0.5411140583554377

Glucose level is highly important

In [24]:
df_init[df_init["Insulin"] > 30.5].Outcome.mean()

0.3333333333333333

Insulin is not important

In [26]:
df_init[df_init["BMI"] > 32].Outcome.mean()

0.468586387434555

Body mass index is important

In [31]:
df_init[df_init["DiabetesPedigreeFunction"] > 0.3725].Outcome.mean()


0.4036458333333333

DiabetesPedigreeFunction is important

In [30]:
df_init[df_init["BloodPressure"] > 72].Outcome.mean()

0.40974212034383956

BloodPressure is important

#### Correlations

In [40]:
df_init.corrwith(df_init.Outcome).sort_values(ascending=False)

Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
dtype: float64

The most important features are __Glucose__, __BMI__, __Age__, and __Pregnancies__

### Data preparation

#### Lowercase the column names and add underscores between words

In [97]:
def add_underscore(string):
    """Prefix every uppercase character of ``string`` with an underscore.

    A single leading underscore is removed from the result, so a name that
    starts with a capital letter does not end up starting with ``_``
    (``"BloodPressure"`` becomes ``"Blood_Pressure"``). Each uppercase
    character is handled on its own, so ``"BMI"`` becomes ``"B_M_I"``.

    Parameters
    ----------
    string : str
        The text to transform.

    Returns
    -------
    str
        The text with underscores inserted; the letter case is unchanged.
    """
    separated = "".join(
        "_" + char if char.isupper() else char
        for char in string
    )
    # Only one leading underscore is dropped, whether it was inserted here or
    # was already the first character of the input.
    return separated[1:] if separated.startswith("_") else separated

df_init.rename(columns=lambda col: add_underscore(col), inplace=True)

df_init.columns = df_init.columns.str.lower()

df_init.rename(columns={"b_m_i": "bmi"}, inplace=True)

print(df_init.columns)

Index(['pregnancies', 'glucose', 'blood_pressure', 'skin_thickness', 'insulin',
       'bmi', 'diabetes_pedigree_function', 'age', 'outcome'],
      dtype='object')


In [20]:
df_init.head().T

Unnamed: 0,0,1,2,3,4
pregnancies,6.0,1.0,8.0,1.0,0.0
glucose,148.0,85.0,183.0,89.0,137.0
blood_pressure,72.0,66.0,64.0,66.0,40.0
skin_thickness,35.0,29.0,0.0,23.0,35.0
insulin,0.0,0.0,0.0,94.0,168.0
bmi,33.6,26.6,23.3,28.1,43.1
diabetes_pedigree_function,0.627,0.351,0.672,0.167,2.288
age,50.0,31.0,32.0,21.0,33.0
outcome,1.0,0.0,1.0,0.0,1.0


### Split data

Do train/validation/test split with 60%/20%/20% distribution.

Use the train_test_split function and set the random_state parameter to 42.

In [131]:
df_full_train, df_test = train_test_split(df_init, test_size = 0.2, random_state = 42)

In [12]:
len(df_full_train), len(df_test)

(614, 154)

In [132]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [14]:
len(df_train), len(df_val), len(df_test)

(460, 154, 154)

In [28]:
df_train.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome
335,0,165,76,43,255,47.9,0.259,26,0
467,0,97,64,36,100,36.8,0.6,25,0
51,1,101,50,15,36,24.2,0.526,26,0
131,9,122,56,0,0,33.3,1.114,33,1
649,0,107,60,25,0,26.4,0.133,23,0


In [133]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [134]:
y_train = df_train.outcome.values
y_val = df_val.outcome.values
y_test = df_test.outcome.values

In [135]:
del df_train["outcome"]
del df_val["outcome"]
del df_test["outcome"]

In [136]:
df_full_train.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome
60,2,84,0,0,0,0.0,0.304,21,0
618,9,112,82,24,0,28.2,1.282,50,1
346,1,139,46,19,83,28.7,0.654,22,0
294,0,161,50,0,0,21.9,0.254,65,0
231,6,134,80,37,370,46.2,0.238,46,1


In [137]:
y_full_train = df_full_train.outcome.values

In [138]:
del df_full_train["outcome"]

### Logistic regression

In [109]:
def train(df_train, y_train, C=10):
    """Fit a DictVectorizer and a logistic-regression model on the given data.

    Parameters
    ----------
    df_train : object exposing ``to_dict(orient='records')``
        Feature table (a pandas DataFrame in this notebook); each row is
        turned into one dict record.
    y_train : array-like
        Target values passed to ``LogisticRegression.fit``.
    C : number, default 10
        Value forwarded to the ``C`` argument of ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    dv = DictVectorizer(sparse=False)
    # One dict per row; fit_transform both learns the feature layout and
    # produces the dense training matrix.
    feature_matrix = dv.fit_transform(df_train.to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(feature_matrix, y_train)

    return dv, model

In [99]:
def predict(df, dv, model):
    """Score every row of ``df`` with an already fitted vectorizer and model.

    Parameters
    ----------
    df : object exposing ``to_dict(orient='records')``
        Feature table (a pandas DataFrame in this notebook).
    dv : object exposing ``transform``
        Vectorizer used to turn the row dicts into a feature matrix; it is
        only applied here, never refitted.
    model : object exposing ``predict_proba``
        Classifier whose ``predict_proba`` output is indexed with ``[:, 1]``.

    Returns
    -------
    The second column of ``model.predict_proba`` (one value per row).
    """
    records = df.to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    # Keep only column 1 of the probability matrix.
    return probabilities[:, 1]

In [114]:
n_splits = 5

In [118]:
len(y_train)

460

In [119]:
len(df_train)

491

In [122]:
C=10

In [127]:
# K-fold cross-validation over the full training set.
# df_full_train holds only feature columns at this point (its "outcome"
# column was extracted into y_full_train and then deleted), so the fold
# targets are sliced from y_full_train by position.
# Fold-local names are used so the hold-out split kept in
# df_train / df_val / y_train / y_val is not overwritten by the last fold.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_fold_train = df_full_train.iloc[train_idx]
    df_fold_val = df_full_train.iloc[val_idx]

    y_fold_train = y_full_train[train_idx]
    y_fold_val = y_full_train[val_idx]

    # Train with the configured C so the model matches the value printed below.
    dv, model = train(df_fold_train, y_fold_train, C=C)
    y_pred = predict(df_fold_val, dv, model)

    auc = roc_auc_score(y_fold_val, y_pred)
    scores.append(auc)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=10 0.831 +- 0.027


In [128]:
scores

[0.8089359200470312,
 0.84622248661511,
 0.7980599647266313,
 0.827710843373494,
 0.8727477477477478]

In [110]:
dv, model = train(df_full_train, y_full_train)
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.8159779614325069

### Save the model

In [111]:
import pickle

In [112]:
output_file = f'model_C=10.bin'

In [113]:
# Persist the vectorizer and the model together as one tuple. The context
# manager closes the file even if pickling raises.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [129]:
input_file = f'model_C=10.bin'


In [130]:
with open(input_file, 'rb') as f_in: 
    dv, model = pickle.load(f_in)

In [26]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [61]:
dicts_train = df_train.to_dict(orient='records')

In [62]:
X_train = dv.fit_transform(dicts_train)


In [63]:
model.fit(X_train, y_train)

In [64]:
dicts_val = df_val.to_dict(orient='records')
X_val = dv.transform(dicts_val)


In [65]:
y_pred_val = model.predict_proba(X_val)[:, 1]

In [38]:
#diabetes_prediction = (y_pred >= 0.5)

In [39]:
#(y_val == diabetes_prediction.astype(int)).mean()

0.7532467532467533

In [67]:
auc_pred_val = roc_auc_score(y_val, y_pred_val)
print(f'AUC for y_pred_val: {round(auc_pred_val,3)}')

AUC for y_pred_val: 0.84


### Using the model

In [80]:
model_c_10 = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [68]:
dicts_full_train = df_full_train.to_dict(orient='records')

In [69]:
X_full_train = dv.fit_transform(dicts_full_train)

In [81]:
model_c_10.fit(X_full_train, y_full_train)

In [139]:
dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)

In [87]:
y_pred_test = model_c_10.predict_proba(X_test)[:, 1]

In [88]:
auc_pred_test = roc_auc_score(y_test, y_pred_test)
print(f'AUC for y_pred_test: {round(auc_pred_test,3)}')

AUC for y_pred_test: 0.816


In [79]:
c_arr = [0.01, 0.1, 1, 10, 100]

for c in c_arr:
    print('c=',c)
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred_val_c = model.predict_proba(X_val)[:, 1]

    auc_pred_val_c = roc_auc_score(y_val, y_pred_val_c)
    print(f'AUC for y_pred_val: {round(auc_pred_val_c,3)}')
    print('----------------')
    

c= 0.01
AUC for y_pred_val: 0.618
----------------
c= 0.1
AUC for y_pred_val: 0.705
----------------
c= 1
AUC for y_pred_val: 0.84
----------------
c= 10
AUC for y_pred_val: 0.86
----------------
c= 100
AUC for y_pred_val: 0.862
----------------


### Transform to dict

In [152]:
trial_participant = dicts_test[11]

In [158]:
X_small = dv.transform([trial_participant])

In [159]:
trial_participant

{'pregnancies': 10,
 'glucose': 111,
 'blood_pressure': 70,
 'skin_thickness': 27,
 'insulin': 0,
 'bmi': 27.5,
 'diabetes_pedigree_function': 0.141,
 'age': 40}

In [154]:
model.predict_proba(pd.DataFrame(X_small)).round(3)[0,1]

0.265

In [155]:
model.predict(pd.DataFrame(X_small))[0]

0

In [156]:
# Ground-truth label of the trial participant: same position (11) that was
# used to pick the record from dicts_test.
y_test[11]

1

### Random forest regressor

In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
rf = RandomForestRegressor(n_estimators=4, random_state=1, n_jobs=-1)

In [28]:
rf.fit(df_train, y_train)

In [31]:
y_pred_val_rf = rf.predict(df_val)

In [32]:
auc_pred_val_rf = roc_auc_score(y_val, y_pred_val_rf)
print(f'AUC for y_pred_val_rf: {round(auc_pred_val_rf,3)}')

AUC for y_pred_val_rf: 0.744


In [36]:
for estim in range(2, 50, 1):
    print(f"estim = {estim}")
    rf = RandomForestRegressor(n_estimators=estim, random_state=1, n_jobs=-1)
    rf.fit(df_train, y_train)
    y_pred_val_rf = rf.predict(df_val)
    auc_pred_val_rf = roc_auc_score(y_val, y_pred_val_rf)
    print(f'AUC for y_pred_val_rf: {round(auc_pred_val_rf,3)}')
    print("------------------------")

estim = 2
AUC for y_pred_val_rf: 0.732
------------------------
estim = 3
AUC for y_pred_val_rf: 0.738
------------------------
estim = 4
AUC for y_pred_val_rf: 0.744
------------------------
estim = 5
AUC for y_pred_val_rf: 0.747
------------------------
estim = 6
AUC for y_pred_val_rf: 0.748
------------------------
estim = 7
AUC for y_pred_val_rf: 0.763
------------------------
estim = 8
AUC for y_pred_val_rf: 0.78
------------------------
estim = 9
AUC for y_pred_val_rf: 0.791
------------------------
estim = 10
AUC for y_pred_val_rf: 0.809
------------------------
estim = 11
AUC for y_pred_val_rf: 0.819
------------------------
estim = 12
AUC for y_pred_val_rf: 0.816
------------------------
estim = 13
AUC for y_pred_val_rf: 0.821
------------------------
estim = 14
AUC for y_pred_val_rf: 0.826
------------------------
estim = 15
AUC for y_pred_val_rf: 0.826
------------------------
estim = 16
AUC for y_pred_val_rf: 0.821
------------------------
estim = 17
AUC for y_pred_val_rf: 

In [37]:
# Sweep max_depth with a fixed number of trees and report the validation AUC
# for each setting.
for md in range(2, 10):
    print(f"max_depth = {md}")

    rf = RandomForestRegressor(n_estimators=10, max_depth=md, random_state=1, n_jobs=-1)
    rf.fit(df_train, y_train)

    y_pred_val_rf = rf.predict(df_val)
    auc_pred_val_rf = roc_auc_score(y_val, y_pred_val_rf)
    print(f'AUC for y_pred_val_rf: {round(auc_pred_val_rf,3)}')

    print("------------------------")

max_depth = 2
AUC for y_pred_val_rf: 0.795
------------------------
max_depth = 3
AUC for y_pred_val_rf: 0.799
------------------------
max_depth = 4
AUC for y_pred_val_rf: 0.817
------------------------
max_depth = 5
AUC for y_pred_val_rf: 0.83
------------------------
max_depth = 6
AUC for y_pred_val_rf: 0.839
------------------------
max_depth = 7
AUC for y_pred_val_rf: 0.819
------------------------
max_depth = 8
AUC for y_pred_val_rf: 0.816
------------------------
max_depth = 9
AUC for y_pred_val_rf: 0.812
------------------------


### XGBoost model

In [54]:
#import xgboost as xgb

from xgboost import XGBClassifier


In [55]:
model = XGBClassifier(random_state=42)
model.fit(df_train, y_train)

In [57]:
y_pred_val_proba = model.predict_proba(df_val)[:, 1]

In [58]:
# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_val, y_pred_val_proba)

print(f"ROC AUC Score: {roc_auc:.3f}")

ROC AUC Score: 0.817


In [39]:
#dtrain = xgb.DMatrix(df_train, label=y_train, feature_names = list(df_train.columns.values))

In [40]:
#dval = xgb.DMatrix(df_val, label=y_val, feature_names = list(df_train.columns.values))

In [52]:
# xgb_params_1 = {
#         'eta': 0.3, 
#         'max_depth': 6,
#         'min_child_weight': 1,
        
#         'objective': 'binary:logistic',
#         'nthread': 8,
        
#         'seed': 1,
#         'verbosity': 1,
#     }

In [42]:
#watchlist = [(dtrain, 'train'), (dval, 'val')]

In [49]:
#from matplotlib import pyplot as plt

In [47]:
# def parse_xgb_output(output):
#     results = []

#     for line in output.stdout.strip().split('\n'):
#         it_line, train_line, val_line = line.split('\t')

#         it = int(it_line.strip('[]'))
#         train = float(train_line.split(':')[1])
#         val = float(val_line.split(':')[1])

#         results.append((it, train, val))
    
#     columns = ['num_iter', 'train_auc', 'val_auc']
#     df_results = pd.DataFrame(results, columns=columns)
#     return df_results

In [53]:
#%%capture output_1

#model_1 = xgb.train(xgb_params_1, dtrain, evals = watchlist, num_boost_round=100)

#s = output_1.stdout

[0]	train-logloss:0.52554	val-logloss:0.56498
[1]	train-logloss:0.44785	val-logloss:0.52744
[2]	train-logloss:0.39008	val-logloss:0.50714
[3]	train-logloss:0.34831	val-logloss:0.49492
[4]	train-logloss:0.32249	val-logloss:0.48837
[5]	train-logloss:0.28906	val-logloss:0.48391
[6]	train-logloss:0.27461	val-logloss:0.48530
[7]	train-logloss:0.25150	val-logloss:0.48980
[8]	train-logloss:0.23646	val-logloss:0.49376
[9]	train-logloss:0.22351	val-logloss:0.49989
[10]	train-logloss:0.21495	val-logloss:0.50964
[11]	train-logloss:0.20336	val-logloss:0.51430
[12]	train-logloss:0.18999	val-logloss:0.51440
[13]	train-logloss:0.18070	val-logloss:0.52322
[14]	train-logloss:0.17207	val-logloss:0.51692
[15]	train-logloss:0.16332	val-logloss:0.51712
[16]	train-logloss:0.15680	val-logloss:0.51290
[17]	train-logloss:0.14830	val-logloss:0.51904
[18]	train-logloss:0.14139	val-logloss:0.51915
[19]	train-logloss:0.13579	val-logloss:0.52249
[20]	train-logloss:0.13171	val-logloss:0.52608
[21]	train-logloss:0.12

In [50]:
# df_score = parse_xgb_output(output_1)

# plt.plot(df_score.num_iter, df_score.train_auc, label='train')
# plt.plot(df_score.num_iter, df_score.val_auc, label='val')
# plt.legend()

AttributeError: 'DataFrame' object has no attribute 'train_auc'