# Logistic regression 

Classification, the process of predicting which the two categories an event belongs to.  Here, we are using classification to predict whether an individual would have diabetes or not. 

In [1]:
from sqlalchemy import create_engine
import psycopg2
from config import db_password

import pandas as pd
import numpy as np
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# localserver, the connection string
dbEngine = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Drops_of_Jupyter"

# create the database engine
engine = create_engine(dbEngine)
conn = engine.connect()

In [3]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ghs_df = pd.read_sql("select * from general_health_status", conn);
ghs_df

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI
0,H056808,Very_Good,no,yes,199.0,Overweight
1,H018779,Very_Good,yes,yes,205.0,Overweight
2,H049265,Very_Good,no,no,160.0,Overweight
3,H007699,Fair,no,no,190.0,Obese
4,H066034,Good,no,no,250.0,Obese
...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight
29478,H052160,Fair,yes,yes,220.0,Obese
29479,H051563,Very_Good,no,no,130.0,Overweight
29480,H058432,Good,no,no,168.0,Healthy_Weight


In [4]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ind_df = pd.read_sql("select * from individual", conn)
ind_df

Unnamed: 0,ID,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,South,56,Male,Bachelor,White_Only,5.94
3,H007699,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...
29477,H012375,West,70,Female,Masters,White_Only,5.11
29478,H052160,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,H051563,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,West,58,Male,Some_College_no_degree,White_Only,2.05


In [5]:
# merge two dfs
clean_df2 = ghs_df.merge(ind_df, how='inner', on='ID')
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [6]:
# columns in df
clean_df2.columns

Index(['ID', 'General_Health_Status', 'Diabetes', 'Prediabetes', 'Weight_Lbs',
       'Categorical_BMI', 'Region', 'Age', 'Gender', 'Education', 'Race',
       'Poverty_Ratio'],
      dtype='object')

In [7]:
#clean_df2 = pd.read_csv('/Users/yinglin/Desktop/NHIS/clean.csv')
clean_df2 

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05



## Objective:  to determine the correlation bet. "Diabetes" and "Education"

In [8]:
# a new df of 'Diabetes" & 'Education'
clean_df3 = clean_df2.loc[:, ['Diabetes', 'Education'] ]
clean_df3

Unnamed: 0,Diabetes,Education
0,no,Grade_1-11
1,yes,Associates_Academic_Program
2,no,Bachelor
3,no,Some_College_no_degree
4,no,High_School_Graduate
...,...,...
29477,no,Masters
29478,yes,Associates_Academic_Program
29479,no,High_School_Graduate
29480,no,Some_College_no_degree


In [9]:
# drop any null value
clean_df4 = clean_df3.dropna()
clean_df4

Unnamed: 0,Diabetes,Education
0,no,Grade_1-11
1,yes,Associates_Academic_Program
2,no,Bachelor
3,no,Some_College_no_degree
4,no,High_School_Graduate
...,...,...
29477,no,Masters
29478,yes,Associates_Academic_Program
29479,no,High_School_Graduate
29480,no,Some_College_no_degree


This df has no null values.  We start and end with almost 30k rows.  

In [10]:
# convert categorical variables into indicators
clean_df5 = pd.get_dummies(clean_df4, columns=['Diabetes',
                                              'Education'])
clean_df5

Unnamed: 0,Diabetes_Dont_Know,Diabetes_Refused,Diabetes_no,Diabetes_yes,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_Dont_Know,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Refused,Education_Some_College_no_degree
0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
29478,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
29479,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
29480,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [11]:
# drop the unnessary dummies
clean_df6 = clean_df5.drop(columns={'Diabetes_Dont_Know',
                                    'Diabetes_Refused',
                                    'Diabetes_no',
                                    'Education_Refused',
                                    'Education_Dont_Know'} )                        
clean_df6

Unnamed: 0,Diabetes_yes,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,0,0,0,0,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
29477,0,0,0,0,0,0,0,0,0,1,0
29478,1,0,1,0,0,0,0,0,0,0,0
29479,0,0,0,0,0,0,0,0,1,0,0
29480,0,0,0,0,0,0,0,0,0,0,1


In [12]:
# create our features by dropping our target
X = clean_df6.drop(columns=['Diabetes_yes'])
X.head()

Unnamed: 0,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0


In [13]:
# create our target
y = clean_df6['Diabetes_yes']
y[:5]

0    0
1    1
2    0
3    0
4    0
Name: Diabetes_yes, dtype: uint8

In [14]:
# check the balance of the target value
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

'0' codes for 'No diabetes'.

'1' codes for 'Yes'.

About 10% of the population are diabetic.


We don't want  any imbalance:  one class has too few or too much instances in the training set.  Need to do some balancing. 

### split the data into train and test set

In [15]:
# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [16]:
y_test.value_counts()

0    6587
1     784
Name: Diabetes_yes, dtype: int64

### scale the data

In [17]:
# create a scaler instance
import sklearn as skl
X_scaler = skl.preprocessing.StandardScaler()

In [18]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [19]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
print (X_train_scaled)

[[-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 ...
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]]


In [21]:
print(X_test_scaled)

[[-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 ...
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
   2.39993886]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]]


In [22]:
# 75% train, 25% test for X
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(22111, 10)
(7371, 10)


In [23]:
# 75% train, 25% test for y
print(y_train.shape)
print(y_test.shape)

(22111,)
(7371,)


### random oversampling

Use more of the rare class records in the classification model, aka upsample. 

In [24]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)

Counter({0: 19761, 1: 19761})

### logistic regression

In [25]:
# create a logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)
model

LogisticRegression(random_state=1)

In [26]:
# fit (train) model using the training data
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

### make predictions

In [27]:
# predict outcomes for the test data set
predictions = model.predict(X_test_scaled)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
10675,1,0
14080,0,0
7111,0,0
2101,0,0
22333,0,0
...,...,...
2592,0,0
4787,1,0
16708,0,0
17847,1,0


In [28]:
# cal. the balanced accuracy score
y_pred = model.predict(X_test_scaled)
score = balanced_accuracy_score(y_test, y_pred)

print('Accuracy score: ', score)

Accuracy score:  0.5783720369125334


63% of prediction and acutal match.  

It would take a lot of time to determine accuracy one-by-one for all 36 variables.  Therefore, we will create a loop to automate this process.

### Confusion matrix

In [29]:
matrix = confusion_matrix (y_test, y_pred)
print(matrix)

[[2570 4017]
 [ 183  601]]


### Classification report

In [30]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.39      0.55      6587
           1       0.13      0.77      0.22       784

    accuracy                           0.43      7371
   macro avg       0.53      0.58      0.39      7371
weighted avg       0.85      0.43      0.52      7371



Precision measures the accuracy of a predicted positive outcome.  The precision of 0s (not diabetes) is 0.96, high.  The precision of 1s (diabetes) is 0.16, low. 

The recall, also known sensitivity, measures how robust this model predict a positive outcome -- the precentage of 1s that is correctly identified.  

The recall of 0s vs. 1s are more or less similar.

# Looping over  variables to determine their accuracy

In [31]:
# remove the indexes (that made the matrix too big to run)
#clean_df2 = clean_df2.drop(columns=['ID'])
#clean_df2

In [32]:
import sklearn as skl

target_column = "Diabetes"

target_column_onehot = 'yes'

for column in clean_df2.columns.values:
    if target_column != column:
        XY = clean_df2[ [column,"Diabetes"] ].copy()
        XY = XY.dropna()
        if len( XY ) > 0:
            if len( XY[column] ) == len( XY[target_column] ):
                X = pd.get_dummies( XY[ column ].astype('str') )
                y = pd.get_dummies( XY[ target_column ].astype('str') ) 
                y = y[ target_column_onehot ]
                print(column,X.shape,y.shape)
                X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
                X_scaler = skl.preprocessing.StandardScaler()
                X_scaler.fit(X_train)
                X_train_scaled = X_scaler.transform(X_train)
                X_test_scaled = X_scaler.transform(X_test)
                
                # implement random oversampling
                from imblearn.over_sampling import RandomOverSampler
                ros = RandomOverSampler(random_state=1)
                X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
                Counter(y_resampled)
                
                model = LogisticRegression(solver='lbfgs', random_state=1)
                model.fit(X_resampled, y_resampled)
                y_pred = model.predict(X_test)
                print( column, "score:  ",  balanced_accuracy_score(y_test, y_pred) )
    #break

ID (29482, 29482) (29482,)
ID score:   0.5
General_Health_Status (29482, 7) (29482,)
General_Health_Status score:   0.6946637315925308
Prediabetes (29482, 4) (29482,)
Prediabetes score:   0.7764021123858682
Weight_Lbs (26887, 198) (26887,)
Weight_Lbs score:   0.5874681270429185
Categorical_BMI (29482, 5) (29482,)
Categorical_BMI score:   0.6010879112537683
Region (29482, 4) (29482,)
Region score:   0.5271503394131297
Age (29482, 70) (29482,)
Age score:   0.6594292871239888
Gender (29482, 4) (29482,)
Gender score:   0.5203342894012015
Education (29482, 12) (29482,)
Education score:   0.5783720369125334
Race (29482, 9) (29482,)
Race score:   0.5401656943329936
Poverty_Ratio (29482, 956) (29482,)
Poverty_Ratio score:   0.5732930393198725


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


General_health_status, prediabetes, and BMI, education, income have high scores.


### Univariate logistic regression with the following variables:

1. Categorical_BMI
2. Education
3. Poverty_ratio
4. Diabetes


In [33]:
import sklearn as skl

target_column = "Diabetes"
target_column_onehot = 'yes'

columns=[
         'Categorical_BMI',
         'Education',
         'Poverty_Ratio'
         ]
         
columnsCAT=[
         'Categorical_BMI',
         'Education'
         ]

XYcolumns = columns+[target_column]
print(XYcolumns)
XY = clean_df2.loc[ :, XYcolumns ].copy()
XY = XY.dropna()

X = pd.get_dummies( XY[ columns ].astype('str'), columns=columnsCAT )
y_ = pd.get_dummies( XY[ target_column ].astype('str') ) 
y = y_[ target_column_onehot ]
print(columns,X.shape,y.shape)
print(XY.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_scaler = skl.preprocessing.StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test_scaled)
print( "Score", balanced_accuracy_score(y_test, y_pred) )
    #break

['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Diabetes']
['Categorical_BMI', 'Education', 'Poverty_Ratio'] (29482, 18) (29482,)
  Categorical_BMI                    Education  Poverty_Ratio Diabetes
0      Overweight                   Grade_1-11           1.93       no
1      Overweight  Associates_Academic_Program           4.45      yes
2      Overweight                     Bachelor           5.94       no
3           Obese       Some_College_no_degree           3.70       no
4           Obese         High_School_Graduate           1.66       no
Score 0.6295700134463988


### Break down the logistic regression code above  to see whether it gives the same result (accuracy score: 0.63)

This is to mainly check if the codes are correctly written and to look at the data closely.

In [34]:
# a test df of the following variables
test_df = clean_df2.loc[:, ['Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Poverty_Ratio']]
test_df                         

Unnamed: 0,Diabetes,Categorical_BMI,Education,Poverty_Ratio
0,no,Overweight,Grade_1-11,1.93
1,yes,Overweight,Associates_Academic_Program,4.45
2,no,Overweight,Bachelor,5.94
3,no,Obese,Some_College_no_degree,3.70
4,no,Obese,High_School_Graduate,1.66
...,...,...,...,...
29477,no,Overweight,Masters,5.11
29478,yes,Obese,Associates_Academic_Program,3.03
29479,no,Overweight,High_School_Graduate,2.07
29480,no,Healthy_Weight,Some_College_no_degree,2.05


In [35]:
# drop any null value
test_df1 = test_df.dropna()
test_df1

Unnamed: 0,Diabetes,Categorical_BMI,Education,Poverty_Ratio
0,no,Overweight,Grade_1-11,1.93
1,yes,Overweight,Associates_Academic_Program,4.45
2,no,Overweight,Bachelor,5.94
3,no,Obese,Some_College_no_degree,3.70
4,no,Obese,High_School_Graduate,1.66
...,...,...,...,...
29477,no,Overweight,Masters,5.11
29478,yes,Obese,Associates_Academic_Program,3.03
29479,no,Overweight,High_School_Graduate,2.07
29480,no,Healthy_Weight,Some_College_no_degree,2.05


In [36]:
# convert categorical variables into indicators
test_df2 = pd.get_dummies(test_df1, columns=['Diabetes',
                                             'Categorical_BMI', 
                                             'Education'])
test_df2

Unnamed: 0,Poverty_Ratio,Diabetes_Dont_Know,Diabetes_Refused,Diabetes_no,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,...,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_Dont_Know,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Refused,Education_Some_College_no_degree
0,1.93,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,4.45,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.94,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3.70,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.66,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,5.11,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
29478,3.03,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29479,2.07,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
29480,2.05,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Please note that 'poverty_ratio' is a continuous variable. 

In [37]:
# drop the unnessary dummies
test_df3 = test_df2.drop(columns= { 'Diabetes_Dont_Know',
                                    'Diabetes_Refused',
                                    'Diabetes_no',
                                    'Education_Refused',
                                    'Education_Dont_Know'} )
test_df3                        

Unnamed: 0,Poverty_Ratio,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,1.93,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,4.45,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,5.94,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
3,3.70,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1.66,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,5.11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
29478,3.03,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
29479,2.07,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
29480,2.05,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [38]:
# create our feature by dropping our target
X = test_df3.drop(columns=['Diabetes_yes'])
X.head()

Unnamed: 0,Poverty_Ratio,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,1.93,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,4.45,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,5.94,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
3,3.7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1.66,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [39]:
# create our target
y = clean_df6['Diabetes_yes']
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Diabetes_yes, dtype: uint8

In [40]:
# check the balance of the target value
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

In [41]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [42]:
# create a scaler instance
import sklearn as skl
X_scaler = skl.preprocessing.StandardScaler()

In [43]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [44]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 19761, 1: 19761})

In [46]:
# create a logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)

model

LogisticRegression(random_state=1)

In [47]:
# fit (train) model using the training data
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [48]:
# predict outcomes for the test data set
predictions = model.predict(X_test_scaled)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )



Unnamed: 0,Prediction,Acutal
10675,1,0
14080,0,0
7111,0,0
2101,1,0
22333,0,0
...,...,...
2592,0,0
4787,0,0
16708,0,0
17847,1,0


In [49]:
# cal. the balanced accuracy score
y_pred = model.predict(X_test_scaled)
score = balanced_accuracy_score(y_test, y_pred)

print('Accuracy score: ', score)

Accuracy score:  0.6270467804550088





### Univariate logistic regression with the following variables:

1. Categorical_BMI
2. Education
3. Poverty_ratio
4. Diabetes
5. Race

In [50]:
import sklearn as skl

target_column = "Diabetes"
target_column_onehot = 'yes'

columns=[
         'Categorical_BMI',
         'Education',
         'Poverty_Ratio',
         'Race'
         ]
         
columnsCAT=[
         'Categorical_BMI',
         'Education',
         'Race'
         ]

XYcolumns = columns+[target_column]
print(XYcolumns)
XY = clean_df2.loc[ :, XYcolumns ].copy()
XY = XY.dropna()

X = pd.get_dummies( XY[ columns ].astype('str'), columns=columnsCAT )
y_ = pd.get_dummies( XY[ target_column ].astype('str') ) 
y = y_[ target_column_onehot ]
print(columns,X.shape,y.shape)
print(XY.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_scaler = skl.preprocessing.StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test_scaled)
print( "Score", balanced_accuracy_score(y_test, y_pred) )
    #break

['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Race', 'Diabetes']
['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Race'] (29482, 27) (29482,)
  Categorical_BMI                    Education  Poverty_Ratio  \
0      Overweight                   Grade_1-11           1.93   
1      Overweight  Associates_Academic_Program           4.45   
2      Overweight                     Bachelor           5.94   
3           Obese       Some_College_no_degree           3.70   
4           Obese         High_School_Graduate           1.66   

                    Race Diabetes  
0             White_Only       no  
1  African_American_Only      yes  
2             White_Only       no  
3             White_Only       no  
4  African_American_Only       no  
Score 0.6324843034982324


The accuracy score is 0.63. 


### Univariate logistic regression with the following variables:

1. Categorical_BMI
2. Education
3. Poverty_ratio
4. Diabetes
5. Race
6. Gender
7. Region
8. Age

In [52]:
import sklearn as skl

target_column = "Diabetes"
target_column_onehot = 'yes'

columns=[
         'Categorical_BMI',
         'Education',
         'Poverty_Ratio',
         'Race',
         'Gender',
         'Region', 
         'Age',
         ]
         
columnsCAT=[
         'Categorical_BMI',
         'Education',
         'Race',
         'Gender',
         'Region',
         ]

XYcolumns = columns+[target_column]
print(XYcolumns)
XY = clean_df2.loc[ :, XYcolumns ].copy()
XY = XY.dropna()

X = pd.get_dummies( XY[ columns ].astype('str'), columns=columnsCAT )
y_ = pd.get_dummies( XY[ target_column ].astype('str') ) 
y = y_[ target_column_onehot ]
print(columns,X.shape,y.shape)
print(XY.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_scaler = skl.preprocessing.StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

# model = LogisticRegression(solver='lbfgs', random_state=1)
#model.fit(X_resampled, y_resampled)
#y_pred = model.predict(X_test_scaled)
#print( "Score", balanced_accuracy_score(y_test, y_pred) )

from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(
            cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=0
        )
model.fit(X_resampled, y_resampled)
y_pred = model.predict( X_test_scaled )
print( "Score", model.score(X_test_scaled, y_test) )

    #break

['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Race', 'Gender', 'Region', 'Age', 'Diabetes']
['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Race', 'Gender', 'Region', 'Age'] (29482, 36) (29482,)
  Categorical_BMI                    Education  Poverty_Ratio  \
0      Overweight                   Grade_1-11           1.93   
1      Overweight  Associates_Academic_Program           4.45   
2      Overweight                     Bachelor           5.94   
3           Obese       Some_College_no_degree           3.70   
4           Obese         High_School_Graduate           1.66   

                    Race  Gender Region  Age Diabetes  
0             White_Only    Male  South   50       no  
1  African_American_Only    Male  South   53      yes  
2             White_Only    Male  South   56       no  
3             White_Only  Female  South   57       no  
4  African_American_Only    Male  South   25       no  
Score 0.6773843440510107


In [53]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.67      0.79      6587
           1       0.21      0.74      0.33       784

    accuracy                           0.68      7371
   macro avg       0.58      0.71      0.56      7371
weighted avg       0.88      0.68      0.74      7371



In [54]:
# list the model coefficient into a df
df_logistic_regularized = pd.DataFrame( 
        data=model.coef_[0], 
        index=X.columns.to_numpy(), 
        columns=['coef'] )

df_logistic_regularized

Unnamed: 0,coef
Poverty_Ratio,-0.195712
Age,0.999671
Categorical_BMI_Healthy_Weight,-0.336264
Categorical_BMI_Obese,0.36029
Categorical_BMI_Overweight,0.0
Categorical_BMI_Underweight,-0.164397
Categorical_BMI_Unknown,0.039941
Education_12th_Grade_no_diploma,0.07398
Education_Associates_Academic_Program,-0.001433
Education_Associates_Occupational_Technical_Vocational,0.011241
