### Univariate logistic regression 

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

import pickle

In [2]:
clean_df2 = pd.read_csv('/Users/yinglin/Desktop/NHIS/clean.csv')
clean_df2 

Unnamed: 0.1,Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [3]:
# a test df of the following variables
test_df = clean_df2.loc[:, ['Age',
                            'Gender',
                            'Prediabetes',
                            'Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Poverty_Ratio',
                            'Race']]
test_df.head()                        

Unnamed: 0,Age,Gender,Prediabetes,Diabetes,Categorical_BMI,Education,Poverty_Ratio,Race
0,50,Male,yes,no,Overweight,Grade_1-11,1.93,White_Only
1,53,Male,yes,yes,Overweight,Associates_Academic_Program,4.45,African_American_Only
2,56,Male,no,no,Overweight,Bachelor,5.94,White_Only
3,57,Female,no,no,Obese,Some_College_no_degree,3.7,White_Only
4,25,Male,no,no,Obese,High_School_Graduate,1.66,African_American_Only


In [4]:
# drop any null value
test_df1 = test_df.dropna()
test_df1.head()

Unnamed: 0,Age,Gender,Prediabetes,Diabetes,Categorical_BMI,Education,Poverty_Ratio,Race
0,50,Male,yes,no,Overweight,Grade_1-11,1.93,White_Only
1,53,Male,yes,yes,Overweight,Associates_Academic_Program,4.45,African_American_Only
2,56,Male,no,no,Overweight,Bachelor,5.94,White_Only
3,57,Female,no,no,Obese,Some_College_no_degree,3.7,White_Only
4,25,Male,no,no,Obese,High_School_Graduate,1.66,African_American_Only


In [5]:
# convert numeric categorical variables into indicators
test_df2 = pd.get_dummies(test_df1, columns=[
                            'Gender',
                            'Prediabetes',
                            'Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Race'])                              
test_df2

Unnamed: 0,Age,Poverty_Ratio,Gender_Female,Gender_Male,Gender_Refused,Gender_Unknown,Prediabetes_Dont_Know,Prediabetes_Refused,Prediabetes_no,Prediabetes_yes,...,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Dont_Know,Race_Not_Ascertained,Race_Other,Race_Refused,Race_White_Only
0,50,1.93,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,53,4.45,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,56,5.94,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,57,3.70,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,25,1.66,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,70,5.11,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
29478,35,3.03,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
29479,72,2.07,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
29480,58,2.05,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [6]:
test_df2.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Gender_Refused', 'Gender_Unknown', 'Prediabetes_Dont_Know',
       'Prediabetes_Refused', 'Prediabetes_no', 'Prediabetes_yes',
       'Diabetes_Dont_Know', 'Diabetes_Refused', 'Diabetes_no', 'Diabetes_yes',
       'Categorical_BMI_Healthy_Weight', 'Categorical_BMI_Obese',
       'Categorical_BMI_Overweight', 'Categorical_BMI_Underweight',
       'Categorical_BMI_Unknown', 'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_Dont_Know', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Refused', 'Education_Some_College_no_degree',
       'Race_AIAN_AND_other', 'Race_AIAN_Only', 'Race_African_American_Only',
       'Race_Asian_Only', 'Race_Dont_Know', 'Race_Not_Asce

In [7]:
test_df3 = test_df2.drop( columns={'Gender_Refused',
                                  'Gender_Unknown',
                                  'Prediabetes_Dont_Know', 
                                  'Prediabetes_Refused',
                                  'Diabetes_Dont_Know',
                                  'Diabetes_Refused',
                                  'Diabetes_no',
                                  'Categorical_BMI_Unknown',
                                  'Education_Refused',
                                  'Education_Dont_Know',
                                  'Race_Dont_Know',
                                  'Race_Not_Ascertained',
                                  'Race_Refused'
                                } )
test_df3

Unnamed: 0,Age,Poverty_Ratio,Gender_Female,Gender_Male,Prediabetes_no,Prediabetes_yes,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,...,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Other,Race_White_Only
0,50,1.93,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,53,4.45,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,56,5.94,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,57,3.70,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,25,1.66,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,70,5.11,1,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
29478,35,3.03,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
29479,72,2.07,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
29480,58,2.05,0,1,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1


In [8]:
# create features by dropping our target
#X = test_df3.drop(columns=['Diabetes_yes'])
X = test_df3.drop(columns=['Diabetes_yes'])
X

Unnamed: 0,Age,Poverty_Ratio,Gender_Female,Gender_Male,Prediabetes_no,Prediabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,...,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Other,Race_White_Only
0,50,1.93,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,53,4.45,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,56,5.94,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,57,3.70,1,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
4,25,1.66,0,1,1,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,70,5.11,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
29478,35,3.03,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
29479,72,2.07,1,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
29480,58,2.05,0,1,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [9]:
X.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Prediabetes_no', 'Prediabetes_yes', 'Categorical_BMI_Healthy_Weight',
       'Categorical_BMI_Obese', 'Categorical_BMI_Overweight',
       'Categorical_BMI_Underweight', 'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')

In [10]:
# create our target
#y = test_df3['Diabetes_yes']
y = test_df3['Diabetes_yes']
y

0        0
1        1
2        0
3        0
4        0
        ..
29477    0
29478    1
29479    0
29480    0
29481    1
Name: Diabetes_yes, Length: 29482, dtype: uint8

In [11]:
# check the balance of the target value
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

In [12]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [13]:
# create a scaler instance
X_scaler = skl.preprocessing.StandardScaler()

In [14]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [15]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 19761, 1: 19761})

In [17]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
model

LogisticRegressionCV(cv=5, max_iter=10000, penalty='l1', random_state=1,
                     solver='saga')

In [18]:
# fit (train) model using the training data
model.fit(X_resampled, y_resampled)

LogisticRegressionCV(cv=5, max_iter=10000, penalty='l1', random_state=1,
                     solver='saga')

In [19]:
# predict outcomes for the test data set
#predictions = model.predict(X_test_scaled)
predictions = model.predict(X_test)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
10675,1,0
14080,0,0
7111,0,0
2101,0,0
22333,0,0
...,...,...
2592,0,0
4787,0,0
16708,0,0
17847,1,0


In [20]:
# cal. the balanced accuracy score
#y_pred = model.predict(X_test_scaled)
y_pred = model.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print('Accuracy score: ', score)

Accuracy score:  0.7920498167385976


In [21]:
# show a text report of the main classification metrics 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      6587
           1       0.34      0.76      0.47       784

    accuracy                           0.82      7371
   macro avg       0.65      0.79      0.68      7371
weighted avg       0.90      0.82      0.84      7371



In [22]:
# list the model coefficient into a df
df_logistic = pd.DataFrame( 
        data=model.coef_[0], 
        index=X.columns.to_numpy(), 
        columns=['coef'] )

df_logistic

Unnamed: 0,coef
Age,0.045128
Poverty_Ratio,-0.08021
Gender_Female,-0.006308
Gender_Male,0.302793
Prediabetes_no,-1.249282
Prediabetes_yes,1.332099
Categorical_BMI_Healthy_Weight,-1.023268
Categorical_BMI_Obese,0.164604
Categorical_BMI_Overweight,-0.432885
Categorical_BMI_Underweight,-1.368392


In [23]:
X.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Prediabetes_no', 'Prediabetes_yes', 'Categorical_BMI_Healthy_Weight',
       'Categorical_BMI_Obese', 'Categorical_BMI_Overweight',
       'Categorical_BMI_Underweight', 'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')

In [24]:
# save the model to curent directory
filename = 'model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [25]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
loaded_model

LogisticRegressionCV(cv=5, max_iter=10000, penalty='l1', random_state=1,
                     solver='saga')