### Univariate logistic regression 

In [1]:
from sqlalchemy import create_engine
import psycopg2
from config import db_password

import pandas as pd
import numpy as np
from collections import Counter
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report


In [2]:
# localserver, the connection string
dbEngine = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Drops_of_Jupyter"

# create the database engine
engine = create_engine(dbEngine)
conn = engine.connect()

In [3]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ghs_df = pd.read_sql("select * from general_health_status", conn);
ghs_df

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI
0,H056808,Very_Good,no,yes,199.0,Overweight
1,H018779,Very_Good,yes,yes,205.0,Overweight
2,H049265,Very_Good,no,no,160.0,Overweight
3,H007699,Fair,no,no,190.0,Obese
4,H066034,Good,no,no,250.0,Obese
...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight
29478,H052160,Fair,yes,yes,220.0,Obese
29479,H051563,Very_Good,no,no,130.0,Overweight
29480,H058432,Good,no,no,168.0,Healthy_Weight


In [4]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ind_df = pd.read_sql("select * from individual", conn)
ind_df

Unnamed: 0,ID,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,South,56,Male,Bachelor,White_Only,5.94
3,H007699,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...
29477,H012375,West,70,Female,Masters,White_Only,5.11
29478,H052160,West,35,Female,Associates_Academic_Program,,3.03
29479,H051563,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,West,58,Male,Some_College_no_degree,White_Only,2.05


In [5]:
# merge two dfs
clean_df2 = ghs_df.merge(ind_df, how='inner', on='ID')
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,,3.03
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [6]:
# columns in df
clean_df2.columns

Index(['ID', 'General_Health_Status', 'Diabetes', 'Prediabetes', 'Weight_Lbs',
       'Categorical_BMI', 'Region', 'Age', 'Gender', 'Education', 'Race',
       'Poverty_Ratio'],
      dtype='object')

In [7]:
clean_df2.dropna(inplace=True)
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29476,H044531,Good,yes,yes,160.0,Overweight,West,69,Female,Masters,White_Only,7.67
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [8]:
# a test df of the following variables
test_df = clean_df2.loc[:, ['Age',
                            'Gender',
                            'Prediabetes',
                            'Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Poverty_Ratio',
                            'Race']]
test_df.head()                        

Unnamed: 0,Age,Gender,Prediabetes,Diabetes,Categorical_BMI,Education,Poverty_Ratio,Race
0,50,Male,yes,no,Overweight,Grade_1-11,1.93,White_Only
1,53,Male,yes,yes,Overweight,Associates_Academic_Program,4.45,African_American_Only
2,56,Male,no,no,Overweight,Bachelor,5.94,White_Only
3,57,Female,no,no,Obese,Some_College_no_degree,3.7,White_Only
4,25,Male,no,no,Obese,High_School_Graduate,1.66,African_American_Only


In [9]:
# convert numeric categorical variables into indicators
test_df2 = pd.get_dummies(test_df, columns=[
                            'Gender',
                            'Prediabetes',
                            'Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Race'])                              
test_df2

Unnamed: 0,Age,Poverty_Ratio,Gender_Female,Gender_Male,Prediabetes_no,Prediabetes_yes,Diabetes_no,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,...,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Other,Race_White_Only
0,50,1.93,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,53,4.45,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,56,5.94,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,57,3.70,1,0,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,1
4,25,1.66,0,1,1,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29476,69,7.67,1,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
29477,70,5.11,1,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
29479,72,2.07,1,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
29480,58,2.05,0,1,1,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1


In [10]:
test_df2.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Prediabetes_no', 'Prediabetes_yes', 'Diabetes_no', 'Diabetes_yes',
       'Categorical_BMI_Healthy_Weight', 'Categorical_BMI_Obese',
       'Categorical_BMI_Overweight', 'Categorical_BMI_Underweight',
       'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')

In [11]:
# create features by dropping our target
#X = test_df3.drop(columns=['Diabetes_yes'])
X = test_df2.drop(columns=['Diabetes_yes'])
X

Unnamed: 0,Age,Poverty_Ratio,Gender_Female,Gender_Male,Prediabetes_no,Prediabetes_yes,Diabetes_no,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,...,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Other,Race_White_Only
0,50,1.93,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,53,4.45,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,56,5.94,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,57,3.70,1,0,1,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,25,1.66,0,1,1,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29476,69,7.67,1,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
29477,70,5.11,1,0,1,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
29479,72,2.07,1,0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
29480,58,2.05,0,1,1,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1


In [12]:
X.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Prediabetes_no', 'Prediabetes_yes', 'Diabetes_no',
       'Categorical_BMI_Healthy_Weight', 'Categorical_BMI_Obese',
       'Categorical_BMI_Overweight', 'Categorical_BMI_Underweight',
       'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')

In [13]:
# create our target
#y = test_df3['Diabetes_yes']
y = test_df2['Diabetes_yes']
y

0        0
1        1
2        0
3        0
4        0
        ..
29476    1
29477    0
29479    0
29480    0
29481    1
Name: Diabetes_yes, Length: 25338, dtype: uint8

In [14]:
# check the balance of the target value
y.value_counts()

0    22773
1     2565
Name: Diabetes_yes, dtype: int64

In [15]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [16]:
# create a scaler instance
X_scaler = skl.preprocessing.StandardScaler()

In [17]:
# fit/train the scaler
X_scaler.fit(X_train)

In [18]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 17079, 1: 17079})

In [20]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
model

In [21]:
# fit (train) model using the training data
model.fit(X_resampled, y_resampled)

In [22]:
# predict outcomes for the test data set
#predictions = model.predict(X_test_scaled)
predictions = model.predict(X_test)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
18182,0,0
10582,1,1
14494,0,0
18926,0,0
8687,0,0
...,...,...
2281,0,0
29050,0,0
17935,0,0
16500,0,0


In [23]:
# cal. the balanced accuracy score
#y_pred = model.predict(X_test_scaled)
y_pred = model.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print('Accuracy score: ', score)

Accuracy score:  1.0


In [24]:
# show a text report of the main classification metrics 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5694
           1       1.00      1.00      1.00       641

    accuracy                           1.00      6335
   macro avg       1.00      1.00      1.00      6335
weighted avg       1.00      1.00      1.00      6335



In [25]:
# list the model coefficient into a df
df_logistic = pd.DataFrame( 
        data=model.coef_[0], 
        index=X.columns.to_numpy(), 
        columns=['coef'] )

df_logistic

Unnamed: 0,coef
Age,0.027973
Poverty_Ratio,0.0
Gender_Female,0.0
Gender_Male,0.0
Prediabetes_no,0.0
Prediabetes_yes,0.0
Diabetes_no,-4.783887
Categorical_BMI_Healthy_Weight,0.0
Categorical_BMI_Obese,0.0
Categorical_BMI_Overweight,0.0


In [26]:
X.columns

Index(['Age', 'Poverty_Ratio', 'Gender_Female', 'Gender_Male',
       'Prediabetes_no', 'Prediabetes_yes', 'Diabetes_no',
       'Categorical_BMI_Healthy_Weight', 'Categorical_BMI_Obese',
       'Categorical_BMI_Overweight', 'Categorical_BMI_Underweight',
       'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')