In [16]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
# import dataset

diabetes = pd.read_csv("diabetes_prediction_dataset.csv")
df = DataFrame(data=diabetes)
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [18]:
# feature preprocessing

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols_to_encode = ['smoking_history', 'gender'] 
encoders = {}

for column in cols_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le

print(encoders['gender'].classes_)
print(encoders['smoking_history'].classes_)
df['smoking_history'].value_counts()

['Female' 'Male' 'Other']
['No Info' 'current' 'ever' 'former' 'never' 'not current']


smoking_history
0    35816
4    35095
3     9352
1     9286
5     6447
2     4004
Name: count, dtype: int64

In [19]:
# normalizing dataset and extracting the target and features

from sklearn.preprocessing import StandardScaler

# Normalizing dataset and extracting the target and features
x = df.iloc[:, :-1]
scaler = StandardScaler()
standardized_data = scaler.fit_transform(x)
X = pd.DataFrame(standardized_data, columns=x.columns)
y = df.iloc[:,-1]
X = X.iloc[:, :-2]


# define oversampling strategy
oversample = SMOTE()

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

In [20]:
# split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=1)

In [21]:
# select model(Logistic Regression)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [22]:
# parameter tuning
# use random search to narrow down C

from sklearn.model_selection import RandomizedSearchCV

param_c = {
    "C": np.logspace(-5, 5, 100)
}

random_search = RandomizedSearchCV(lr, param_c, n_iter=10, cv=5)
random_search.fit(X_train, y_train)
c_range = random_search.best_params_["C"]
c_range

0.010722672220103232

In [23]:
lr = LogisticRegression(solver='liblinear')  # Added solver here

param_grid = {
    "penalty":["l1", "l2"],
    "C": np.linspace(c_range/10, c_range*10, 20)
}

grid = GridSearchCV(lr, param_grid, cv=10, scoring="accuracy")
grid.fit(X_train, y_train)


In [24]:
# use the best parameter to train the final model
best_lr = LogisticRegression(solver='liblinear', penalty=grid.best_params_["penalty"], C=grid.best_params_["C"])
best_lr.fit(X_train, y_train)


In [25]:
# Predict the values using the test data
y_pred = best_lr.predict(X_test)

# Calculate the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

print("The accuracy of the model is: ", accuracy)


The accuracy of the model is:  0.756912568306011
