In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# NOTE(review): machine-specific absolute path -- point DATA_PATH at your own copy of the CSV.
DATA_PATH = "C:\\Users\\emman\\ML-deployment\\Captsone project\\diabetes_risk_prediction_dataset.csv"

df = pd.read_csv(DATA_PATH)

# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Detect text columns with is_string_dtype instead of `dtype == 'object'`.
# Newer pandas releases (3.0+) infer a dedicated string dtype for text, for which
# the equality check selects nothing; the lower-casing below would then be skipped
# and the label comparison against 'positive' would depend on the raw casing.
categorical_columns = [
    column
    for column, dtype in df.dtypes.items()
    if pd.api.types.is_string_dtype(dtype)
]

# Lower-case every text value so category labels compare consistently.
for c in categorical_columns:
    df[c] = df[c].str.lower()

# Binary target: 1 where the lower-cased label equals 'positive', 0 otherwise.
df['class'] = (df['class'] == 'positive').astype(int)

In [3]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Feature columns treated as categorical by train() and predict().
categorical_variables = [
    'gender',
    'polyuria',
    'polydipsia',
    'sudden_weight_loss',
    'weakness',
    'polyphagia',
    'genital_thrush',
    'visual_blurring',
    'itching',
    'irritability',
    'delayed_healing',
    'partial_paresis',
    'muscle_stiffness',
    'alopecia',
    'obesity',
]

# Feature columns treated as numeric by train() and predict().
numerical_variables = ['age']

In [5]:
# Training function

def train(df_train, y_train, C=1.0):
    """Fit the feature encoder and a logistic-regression classifier.

    Parameters
    ----------
    df_train : DataFrame
        Must provide the columns listed in ``categorical_variables`` and
        ``numerical_variables``; any other columns are ignored.
    y_train : array-like
        Target values handed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Forwarded unchanged to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    feature_columns = categorical_variables + numerical_variables
    records = df_train[feature_columns].to_dict(orient='records')

    # The vectorizer is fitted here so the same encoding can be reused at prediction time.
    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(dv.fit_transform(records), y_train)

    return dv, model

In [6]:
# function for prediction:

def predict(df, dv, model):
    """Score every row of ``df`` with an already fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns listed in ``categorical_variables`` and
        ``numerical_variables``.
    dv : fitted vectorizer
        Only ``dv.transform`` is called, so the training-time encoding is reused.
    model : fitted classifier
        Must expose ``predict_proba``.

    Returns
    -------
    array
        Second column of ``model.predict_proba`` (the positive class when the
        labels are 0/1), one value per row of ``df``.
    """
    feature_columns = categorical_variables + numerical_variables
    records = df[feature_columns].to_dict(orient='records')

    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [7]:
# Settings used for the model that is saved below; per the original author's note,
# these values gave the highest AUC score.
C = 10
n_splits = 5

In [9]:
# K-fold cross-validation on the training portion: fit on each fold's training rows,
# score its validation rows, and collect one ROC AUC per fold.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for fold_train_idx, fold_val_idx in kfold.split(df_full_train):
    df_train, df_val = df_full_train.iloc[fold_train_idx], df_full_train.iloc[fold_val_idx]
    y_train, y_val = df_train['class'].values, df_val['class'].values

    # The encoder and the model are refitted from scratch inside every fold.
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    scores.append(roc_auc_score(y_val, y_pred))

# Mean and standard deviation of the fold AUCs for the current C.
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=10 0.972 +- 0.011


In [10]:
# Per-fold ROC AUC values collected by the cross-validation loop.
scores

[0.9732142857142857,
 0.9527186761229313,
 0.9801920768307323,
 0.9693877551020409,
 0.9861635220125786]

In [12]:
# Retrain on the complete training portion, then score the held-out test set.
# C comes from the configuration cell instead of a repeated literal, so the fitted
# model always agrees with the `model_C={C}.bin` file name used when saving.
dv, model = train(df_full_train, df_full_train['class'].values, C=C)
y_pred = predict(df_test, dv, model)

# ROC AUC on the test rows split off before cross-validation.
y_test = df_test['class'].values
auc = roc_auc_score(y_test, y_pred)
auc

0.9760765550239234

### Save the model to a file

In [13]:
import pickle

In [14]:
# The file name embeds the value of C that the saved model was trained with.
output_file = f'model_C={C}.bin'
output_file

'model_C=10.bin'

In [15]:
# Persist the vectorizer together with the model as one tuple: new records must be
# encoded with the same fitted vectorizer before the model can score them.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

### Load the model (restart the kernel before running the cells below)

In [1]:
import pickle

In [2]:
# Must match the file name written by the save step above (C=10).
model_file = 'model_C=10.bin'

In [3]:
# Restore the (vectorizer, model) tuple in the order it was pickled.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [4]:
# Sanity check: display the restored objects and their parameters.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(C=10, max_iter=1000))

In [6]:
# Example record using the same feature names and lower-cased values as the training data.
patient = {
    # demographic
    'gender': 'male',
    # symptom flags ('yes' / 'no')
    'polyuria': 'no',
    'polydipsia': 'no',
    'sudden_weight_loss': 'no',
    'weakness': 'yes',
    'polyphagia': 'no',
    'genital_thrush': 'yes',
    'visual_blurring': 'no',
    'itching': 'yes',
    'irritability': 'no',
    'delayed_healing': 'yes',
    'partial_paresis': 'no',
    'muscle_stiffness': 'no',
    'alopecia': 'yes',
    'obesity': 'no',
    # numeric feature
    'age': 61,
}

In [7]:
# Encode the single patient record with the vectorizer restored from the pickle file.
X = dv.transform(patient)

In [9]:
# Predicted probability of the positive class for this patient. A value close to 0
# means a low predicted risk of early-stage diabetes (low, not strictly "no risk").
model.predict_proba(X)[0,1]

0.008195256268653259

In [18]:
# Second example record, with most symptom flags set to 'yes'.
patient_1 = {
    # demographic
    'gender': 'female',
    # symptom flags ('yes' / 'no')
    'polyuria': 'yes',
    'polydipsia': 'yes',
    'sudden_weight_loss': 'yes',
    'weakness': 'no',
    'polyphagia': 'yes',
    'genital_thrush': 'no',
    'visual_blurring': 'yes',
    'itching': 'no',
    'irritability': 'yes',
    'delayed_healing': 'no',
    'partial_paresis': 'yes',
    'muscle_stiffness': 'yes',
    'alopecia': 'no',
    'obesity': 'yes',
    # numeric feature
    'age': 30,
}

In [19]:
# Encode the second record; note that this overwrites the X built for the first patient.
X = dv.transform(patient_1)

In [21]:
# Predicted probability of the positive class for this patient. A value close to 1
# means a high predicted risk of early-stage diabetes.
model.predict_proba(X)[0,1]

0.9999997473775736