In [1]:
from sqlalchemy import create_engine
import psycopg2

from config import db_password

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

In [2]:
# localserver, the connection string
dbEngine = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Drops_of_Jupyter"

# create the database engine
engine = create_engine(dbEngine)
conn = engine.connect()

In [3]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ghs_df = pd.read_sql("select * from general_health_status", conn);
ghs_df

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI
0,H056808,Very_Good,no,yes,199.0,Overweight
1,H018779,Very_Good,yes,yes,205.0,Overweight
2,H049265,Very_Good,no,no,160.0,Overweight
3,H007699,Fair,no,no,190.0,Obese
4,H066034,Good,no,no,250.0,Obese
...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight
29478,H052160,Fair,yes,yes,220.0,Obese
29479,H051563,Very_Good,no,no,130.0,Overweight
29480,H058432,Good,no,no,168.0,Healthy_Weight


In [4]:
# Read data from PostgreSQL database table and load into a DataFrame instance
ind_df = pd.read_sql("select * from individual", conn)
ind_df

Unnamed: 0,ID,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,South,56,Male,Bachelor,White_Only,5.94
3,H007699,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...
29477,H012375,West,70,Female,Masters,White_Only,5.11
29478,H052160,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,H051563,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,West,58,Male,Some_College_no_degree,White_Only,2.05


In [5]:
# merge two dfs
clean_df2 = ghs_df.merge(ind_df, how='inner', on='ID')
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [6]:
# columns in df
clean_df2.columns

Index(['ID', 'General_Health_Status', 'Diabetes', 'Prediabetes', 'Weight_Lbs',
       'Categorical_BMI', 'Region', 'Age', 'Gender', 'Education', 'Race',
       'Poverty_Ratio'],
      dtype='object')

# Model 1: Univariate logistic regression
## Objective:  to determine the correlation bet. "Ever had Diabetes?" and "Education of sample adult"

In [7]:
# a new df of 'Diabetes" & 'Education'
clean_df3 = clean_df2.loc[:, ['Diabetes', 'Education'] ]
clean_df3

Unnamed: 0,Diabetes,Education
0,no,Grade_1-11
1,yes,Associates_Academic_Program
2,no,Bachelor
3,no,Some_College_no_degree
4,no,High_School_Graduate
...,...,...
29477,no,Masters
29478,yes,Associates_Academic_Program
29479,no,High_School_Graduate
29480,no,Some_College_no_degree


In [8]:
# drop any null value
clean_df4 = clean_df3.dropna()
clean_df4

Unnamed: 0,Diabetes,Education
0,no,Grade_1-11
1,yes,Associates_Academic_Program
2,no,Bachelor
3,no,Some_College_no_degree
4,no,High_School_Graduate
...,...,...
29477,no,Masters
29478,yes,Associates_Academic_Program
29479,no,High_School_Graduate
29480,no,Some_College_no_degree


This df has no null values.  We started with almost 30k rows, and we still have them all. 

In [9]:
# convert categorical variables into indicators
clean_df5 = pd.get_dummies(clean_df4, columns=['Diabetes',
                                              'Education'])
clean_df5

Unnamed: 0,Diabetes_Dont_Know,Diabetes_Refused,Diabetes_no,Diabetes_yes,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_Dont_Know,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Refused,Education_Some_College_no_degree
0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
29478,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
29479,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
29480,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [10]:
# drop the unnessary dummies
clean_df6 = clean_df5.drop(columns={'Diabetes_Dont_Know',
                                    'Diabetes_Refused',
                                    'Diabetes_no',
                                    'Education_Refused',
                                    'Education_Dont_Know'} )
                        
clean_df6

Unnamed: 0,Diabetes_yes,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,0,0,0,0,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
29477,0,0,0,0,0,0,0,0,0,1,0
29478,1,0,1,0,0,0,0,0,0,0,0
29479,0,0,0,0,0,0,0,0,1,0,0
29480,0,0,0,0,0,0,0,0,0,0,1


In [11]:
# create our features by dropping our target
X = clean_df6.drop(columns=['Diabetes_yes'])
X.head()

Unnamed: 0,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0


In [12]:
# create our target
y = clean_df6['Diabetes_yes']
y[:5]

0    0
1    1
2    0
3    0
4    0
Name: Diabetes_yes, dtype: uint8

In [13]:
# check the balance of the target value
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

'0' codes for 'No diabetes'.

'1' codes for 'Yes'.

About 10% of the population are diabetic.


We don't want  any imbalance:  one class has too few or too much instances in the training set.  We may need to do some balancing. 

### Split the data into train and test set

In [14]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [15]:
y_test.value_counts()

0    6587
1     784
Name: Diabetes_yes, dtype: int64

In [16]:
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

In [17]:
# create a scaler instance
import sklearn as skl
X_scaler = skl.preprocessing.StandardScaler()

In [18]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [19]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
print (X_train_scaled)

[[-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 ...
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]]


In [21]:
print(X_test_scaled)

[[-0.12792206 -0.3123107  -0.19822953 ...  1.865352   -0.35768646
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 ...
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185  2.79574466
  -0.41667728]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
   2.39993886]
 [-0.12792206 -0.3123107  -0.19822953 ... -0.53609185 -0.35768646
  -0.41667728]]


In [22]:
# 75% train, 25% test for X
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(22111, 10)
(7371, 10)


In [23]:
# 75% train, 25% test for y
print(y_train.shape)
print(y_test.shape)

(22111,)
(7371,)


In [24]:
# create a logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)
model

LogisticRegression(random_state=1)

In [25]:
# fit (train) model using the training data
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

### make predictions

In [26]:
# predict outcomes for the test data set
predictions = model.predict(X_test_scaled)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
10675,0,0
14080,0,0
7111,0,0
2101,0,0
22333,0,0
...,...,...
2592,0,0
4787,0,0
16708,0,0
17847,0,0


In [27]:
# cal. the balanced accuracy score
y_pred = model.predict(X_test_scaled)
score = balanced_accuracy_score(y_test, y_pred)

print('Accuracy score: ', score)

Accuracy score:  0.5


50% of prediction and acutal match.  

It would take a lot of time to determine accuracy one-by-one for all 36 variables.  Therefore, we will create a loop to automate this process.

# Looping over  variables to determine their accuracy

In [28]:
# remove the indexes (that made the matrix too big to run)
clean_df2 = clean_df2.drop(columns=['ID'])
clean_df2

Unnamed: 0,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...
29477,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,Not_Ascertained,3.03
29479,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [29]:
import sklearn as skl

target_column = "Diabetes"

target_column_onehot = 'yes'

for column in clean_df2.columns.values:
    if target_column != column:
        XY = clean_df2[ [column,"Diabetes"] ].copy()
        XY = XY.dropna()
        if len( XY ) > 0:
            if len( XY[column] ) == len( XY[target_column] ):
                X = pd.get_dummies( XY[ column ].astype('str') )
                y = pd.get_dummies( XY[ target_column ].astype('str') ) 
                y = y[ target_column_onehot ]
                print(column,X.shape,y.shape)
                X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
                X_scaler = skl.preprocessing.StandardScaler()
                X_scaler.fit(X_train)
                X_train_scaled = X_scaler.transform(X_train)
                X_test_scaled = X_scaler.transform(X_test)
                model = LogisticRegression(solver='lbfgs', random_state=1)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                print( column, "score:  ",  balanced_accuracy_score(y_test, y_pred) )
    #break

General_Health_Status (29482, 7) (29482,)
General_Health_Status score:   0.5
Prediabetes (29482, 4) (29482,)
Prediabetes score:   0.5
Weight_Lbs (26887, 198) (26887,)
Weight_Lbs score:   0.5
Categorical_BMI (29482, 5) (29482,)
Categorical_BMI score:   0.5
Region (29482, 4) (29482,)
Region score:   0.5
Age (29482, 70) (29482,)
Age score:   0.5
Gender (29482, 4) (29482,)
Gender score:   0.5
Education (29482, 12) (29482,)
Education score:   0.5
Race (29482, 9) (29482,)
Race score:   0.5
Poverty_Ratio (29482, 956) (29482,)
Poverty_Ratio score:   0.5


Most variables only have 0.5 accuracy rate, suggesting that not a single variable is strongly correlated with diabetes.

An analysis (not shown here, done previously) shows that the two variables with the high accuracy values are:
1. Taking diabetic pills:  0.77
2. Taking insulin:  0.65

This is very interesting, suggesting that people that are taking diabetic pills or  insulin are not all diabetic.  

# Univariate logistic regression with the following variables:

1. Categorical_BMI
2. Education
3. Poverty_ratio
4. Diabetes


In [30]:
import sklearn as skl

target_column = "Diabetes"
target_column_onehot = 'yes'

columns=[
         'Categorical_BMI',
         'Education',
         'Poverty_Ratio'
         ]
         
columnsCAT=[
         'Categorical_BMI',
         'Education'
         ]

XYcolumns = columns+[target_column]
print(XYcolumns)
XY = clean_df2.loc[ :, XYcolumns ].copy()
XY = XY.dropna()
#X = pd.get_dummies( XY[ columnsCAT ].astype('str') )
X = pd.get_dummies( XY[ columns ].astype('str'), columns=columnsCAT )
y_ = pd.get_dummies( XY[ target_column ].astype('str') ) 
y = y_[ target_column_onehot ]
print(columns,X.shape,y.shape)
print(XY.head() )
X_train, X_test, y_train, y_test = train_test_split(X, y) #, random_state=1, stratify=y)
X_scaler = skl.preprocessing.StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
model = LogisticRegression(solver='lbfgs', random_state=1)
#model.fit(X_train, y_train)
model.fit(X_train_scaled, y_train)
#y_pred = model.predict(X_test)
y_pred = model.predict(X_test_scaled)
print( "Score", balanced_accuracy_score(y_test, y_pred) )
    #break

['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Diabetes']
['Categorical_BMI', 'Education', 'Poverty_Ratio'] (29482, 18) (29482,)
  Categorical_BMI                    Education  Poverty_Ratio Diabetes
0      Overweight                   Grade_1-11           1.93       no
1      Overweight  Associates_Academic_Program           4.45      yes
2      Overweight                     Bachelor           5.94       no
3           Obese       Some_College_no_degree           3.70       no
4           Obese         High_School_Graduate           1.66       no
Score 0.5


Result:  The accuracy ratio: 0.5 

### Break down the logistic regression code above  to see whether it gives the same result (accuracy score: 0.5)

This is to mainly check if the codes are correctly written and to look at the data closely.

In [31]:
# a test df of the following variables
test_df = clean_df2.loc[:, ['Diabetes',
                            'Categorical_BMI',
                            'Education',
                            'Poverty_Ratio']]
test_df                         

Unnamed: 0,Diabetes,Categorical_BMI,Education,Poverty_Ratio
0,no,Overweight,Grade_1-11,1.93
1,yes,Overweight,Associates_Academic_Program,4.45
2,no,Overweight,Bachelor,5.94
3,no,Obese,Some_College_no_degree,3.70
4,no,Obese,High_School_Graduate,1.66
...,...,...,...,...
29477,no,Overweight,Masters,5.11
29478,yes,Obese,Associates_Academic_Program,3.03
29479,no,Overweight,High_School_Graduate,2.07
29480,no,Healthy_Weight,Some_College_no_degree,2.05


In [32]:
# drop any null value
test_df1 = test_df.dropna()
test_df1

Unnamed: 0,Diabetes,Categorical_BMI,Education,Poverty_Ratio
0,no,Overweight,Grade_1-11,1.93
1,yes,Overweight,Associates_Academic_Program,4.45
2,no,Overweight,Bachelor,5.94
3,no,Obese,Some_College_no_degree,3.70
4,no,Obese,High_School_Graduate,1.66
...,...,...,...,...
29477,no,Overweight,Masters,5.11
29478,yes,Obese,Associates_Academic_Program,3.03
29479,no,Overweight,High_School_Graduate,2.07
29480,no,Healthy_Weight,Some_College_no_degree,2.05


In [33]:
# convert categorical variables into indicators
test_df2 = pd.get_dummies(test_df1, columns=['Diabetes',
                                             'Categorical_BMI', 
                                             'Education'])
test_df2

Unnamed: 0,Poverty_Ratio,Diabetes_Dont_Know,Diabetes_Refused,Diabetes_no,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,...,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_Dont_Know,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Refused,Education_Some_College_no_degree
0,1.93,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,4.45,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.94,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3.70,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.66,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,5.11,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
29478,3.03,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29479,2.07,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
29480,2.05,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Please note that 'SA family poverty ratio' is a continuous variable. 

In [34]:
# drop the unnessary dummies
test_df3 = test_df2.drop(columns= { 'Diabetes_Dont_Know',
                                    'Diabetes_Refused',
                                    'Diabetes_no',
                                    'Education_Refused',
                                    'Education_Dont_Know'} )
test_df3                        

Unnamed: 0,Poverty_Ratio,Diabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,1.93,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,4.45,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,5.94,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
3,3.70,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1.66,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,5.11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
29478,3.03,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
29479,2.07,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
29480,2.05,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [35]:
# create our feature by dropping our target
X = test_df3.drop(columns=['Diabetes_yes'])
X.head()

Unnamed: 0,Poverty_Ratio,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Categorical_BMI_Unknown,Education_12th_Grade_no_diploma,Education_Associates_Academic_Program,Education_Associates_Occupational_Technical_Vocational,Education_Bachelor,Education_GED_Equivalent,Education_Grade_1-11,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree
0,1.93,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,4.45,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,5.94,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
3,3.7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1.66,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [36]:
# create our target
y = clean_df6['Diabetes_yes']
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Diabetes_yes, dtype: uint8

In [37]:
# check the balance of the target value
y.value_counts()

0    26348
1     3134
Name: Diabetes_yes, dtype: int64

In [38]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [39]:
# create a scaler instance
import sklearn as skl
X_scaler = skl.preprocessing.StandardScaler()

In [40]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [41]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [42]:
# create a logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)
model

LogisticRegression(random_state=1)

In [43]:
# fit (train) model using the training data
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [44]:
# predict outcomes for the test data set
predictions = model.predict(X_test_scaled)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
10675,0,0
14080,0,0
7111,0,0
2101,0,0
22333,0,0
...,...,...
2592,0,0
4787,0,0
16708,0,0
17847,0,0


In [45]:
# cal. the balanced accuracy score
y_pred = model.predict(X_test_scaled)
score = balanced_accuracy_score(y_test, y_pred)

print('Accuracy score: ', score)

Accuracy score:  0.5


# Model 2:  Univariate linear regression

In [46]:
import sklearn as skl
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

target_column = "Diabetes"
target_column_onehot = 'yes'

columns=[
        'Categorical_BMI',
         'Education',
#         'Ratio of family income to poverty threshold for SA’s family',
         'Poverty_Ratio'
         ]
         
columnsCAT=[
         'Categorical_BMI',
         'Education',
#         'Ratio of family income to poverty threshold for SA’s family'
         ]

XYcolumns = columns+[target_column]
print(XYcolumns)
XY = clean_df2.loc[ :, XYcolumns ].copy()
XY = XY.dropna()
#X = pd.get_dummies( XY[ columnsCAT ].astype('str') )
#X = pd.get_dummies( XY[ columns ].astype('str'), columns=columnsCAT )
X = pd.get_dummies( XY[ columns ], columns=columnsCAT )
#X = XY[ columns ]
y_ = pd.get_dummies( XY[ target_column ].astype('str') ) 
y = y_[ target_column_onehot ]
print(columns,X.shape,y.shape)
print(XY.head() )
X_train, X_test, y_train, y_test = train_test_split(X, y) #, random_state=1, stratify=y)
X_scaler = skl.preprocessing.StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
#regr.fit(X_train, y_train)
regr.fit(X_train_scaled, y_train)

# Make predictions using the testing set
#y_pred = regr.predict(X_test)
y_pred = regr.predict(X_test_scaled)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

##model = LogisticRegression(solver='lbfgs', random_state=1)
###model.fit(X_train, y_train)
##model.fit(X_train_scaled, y_train)
###y_pred = model.predict(X_test)
##y_pred = model.predict(X_test_scaled)
##print( "Score", balanced_accuracy_score(y_test, y_pred) )
    #break

['Categorical_BMI', 'Education', 'Poverty_Ratio', 'Diabetes']
['Categorical_BMI', 'Education', 'Poverty_Ratio'] (29482, 18) (29482,)
  Categorical_BMI                    Education  Poverty_Ratio Diabetes
0      Overweight                   Grade_1-11           1.93       no
1      Overweight  Associates_Academic_Program           4.45      yes
2      Overweight                     Bachelor           5.94       no
3           Obese       Some_College_no_degree           3.70       no
4           Obese         High_School_Graduate           1.66       no
Coefficients: 
 [-1.51137335e-02  4.23890181e+12  4.24162074e+12  4.33574301e+12
  1.16152875e+12  1.42871930e+12 -7.50859855e+09 -1.67462672e+10
 -1.12510971e+10 -2.52109545e+10 -3.60638708e+09 -8.61976891e+09
 -1.52155913e+10 -1.14583421e+10 -2.47327909e+10 -1.86564025e+10
 -2.52121000e+09 -2.12280619e+10]
Mean squared error: 0.09
Coefficient of determination: 0.04


In [47]:
XY['Categorical_BMI'].value_counts()

Overweight        9917
Obese             9225
Healthy_Weight    9144
Unknown            733
Underweight        463
Name: Categorical_BMI, dtype: int64

In [48]:
np.unique(y_pred, return_counts=True )

(array([-0.0311298 , -0.02429386, -0.02424045, ...,  0.2521954 ,
         0.25268368,  0.25317196]),
 array([2, 2, 3, ..., 2, 3, 3]))

In [49]:
np.unique(y_test, return_counts=True )

(array([0, 1], dtype=uint8), array([6583,  788]))

In [50]:
y.value_counts()

0    26348
1     3134
Name: yes, dtype: int64

In [51]:
clean_df2['Poverty_Ratio']

0        1.93
1        4.45
2        5.94
3        3.70
4        1.66
         ... 
29477    5.11
29478    3.03
29479    2.07
29480    2.05
29481    4.89
Name: Poverty_Ratio, Length: 29482, dtype: float64

In [52]:
y_['yes'].value_counts()

0    26348
1     3134
Name: yes, dtype: int64

In [53]:
X_train_scaled.shape

(22111, 18)

In [54]:
X_test_scaled.shape

(7371, 18)