<a href="https://colab.research.google.com/github/amyylin1/machine-learning/blob/main/ML_Env_Supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

In [2]:
# load your file
clean_df2 = pd.read_csv('clean.csv')
clean_df2.head()

Unnamed: 0.1,Unnamed: 0,Unique ID,Household region,Age of sample adult,Sex of sample adult,Education of sample adult,Hispanic ethnicity of SA,Single + Mult race gps w Hispanic origin,Hispanic origin detail,Single and multiple race groups,...,Medicaid HDHP.1,Medicaid reassignment flag,Paid for by Medicare - plan 1,Paid for by Medicare - plan 2,"Not eligible for Medicaid, CHIP, or other public coverage",SA family poverty ratio,Ratio of family income to poverty threshold for SA’s family,income from wages,income from SSI SSDI,income from retirement
0,0,H056808,3,50,1,1,2,2,3,1,...,,,2.0,,,1.93,7,1,2.0,2.0
1,1,H018779,3,53,1,7,2,3,3,2,...,,,2.0,,,4.45,12,1,2.0,2.0
2,2,H049265,3,56,1,8,2,2,3,1,...,,,2.0,,,5.94,14,1,2.0,2.0
3,3,H007699,3,57,2,5,2,2,3,1,...,,,,,,3.7,11,1,1.0,1.0
4,4,H066034,3,25,1,4,2,3,3,2,...,,,2.0,,,1.66,6,8,,


In [3]:
# columns in df
clean_df2.columns

Index(['Unnamed: 0', 'Unique ID', 'Household region', 'Age of sample adult',
       'Sex of sample adult', 'Education of sample adult',
       'Hispanic ethnicity of SA', 'Single + Mult race gps w Hispanic origin',
       'Hispanic origin detail', 'Single and multiple race groups',
       'General health status', 'Ever had Diabetes?', 'Ever had pre-diabetes?',
       'Taking diabetic pills', 'Taking insuliin', 'Diabetes type',
       'Ever had weak/failing kidneys', 'Weight without shoes (pounds)',
       'Categorical Body Mass Index', 'Health insurance hierarchy under 65',
       'Health insurance hierarchy under 65.1', 'Type of Medicare coverage',
       'Enrolled in Medicare Advantage Plan', 'Medicare HMO',
       'Medicare Advantage Plan', 'Medicare Part D',
       'Medicaid through Marketplace ', 'Medicaid premium',
       'Medicaid deductible', 'Medicaid HDHP', 'Medicaid HDHP.1',
       'Medicaid reassignment flag', 'Paid for by Medicare - plan 1',
       'Paid for by Medicare - 

# Model 1: Univariate logistic regression
## Objective:  to determine the correlation bet. "Ever had Diabetes?" and "Education of sample adult"

In [4]:
# a new df of 'Ever had Diabetes?" & 'Education of sample adult'
clean_df3 = clean_df2.loc[:, ['Ever had Diabetes?', 'Education of sample adult'] ]
clean_df3

Unnamed: 0,Ever had Diabetes?,Education of sample adult
0,2,1
1,1,7
2,2,8
3,2,5
4,2,4
...,...,...
29477,2,9
29478,1,7
29479,2,4
29480,2,5


In [5]:
# drop any null value
clean_df4 = clean_df3.dropna()
clean_df4

Unnamed: 0,Ever had Diabetes?,Education of sample adult
0,2,1
1,1,7
2,2,8
3,2,5
4,2,4
...,...,...
29477,2,9
29478,1,7
29479,2,4
29480,2,5


This df has no null values.  We started with almost 30k rows, and we still have them all. 

In [6]:
# convert categorical variables into indicators
clean_df5 = pd.get_dummies(clean_df4, columns=['Ever had Diabetes?',
                                              'Education of sample adult'])
clean_df5

Unnamed: 0,Ever had Diabetes?_1,Ever had Diabetes?_2,Ever had Diabetes?_7,Ever had Diabetes?_9,Education of sample adult_1,Education of sample adult_2,Education of sample adult_3,Education of sample adult_4,Education of sample adult_5,Education of sample adult_6,Education of sample adult_7,Education of sample adult_8,Education of sample adult_9,Education of sample adult_10,Education of sample adult_97,Education of sample adult_99
0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
29478,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
29479,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
29480,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


'Ever had Diabetes?_7' codes for 'Refused' and it has frequency of 17 of 30k. 

'Ever had Diabetes?_9' codes for 'Don't know' and it has frequency of 12 out 30k. 

These are low frequency counts and can be dropped. Do the same for the 'Education of sample adult' variable. 




In [7]:
# drop the unnessary dummies
clean_df6 = clean_df5.drop(columns={'Ever had Diabetes?_2',
                                    'Ever had Diabetes?_7',
                                    'Ever had Diabetes?_9',
                                    'Education of sample adult_97',
                                    'Education of sample adult_99'} )
                        
clean_df6

Unnamed: 0,Ever had Diabetes?_1,Education of sample adult_1,Education of sample adult_2,Education of sample adult_3,Education of sample adult_4,Education of sample adult_5,Education of sample adult_6,Education of sample adult_7,Education of sample adult_8,Education of sample adult_9,Education of sample adult_10
0,0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
29477,0,0,0,0,0,0,0,0,0,1,0
29478,1,0,0,0,0,0,0,1,0,0,0
29479,0,0,0,0,1,0,0,0,0,0,0
29480,0,0,0,0,0,1,0,0,0,0,0


In [8]:
# create our features by dropping our target
X = clean_df6.drop(columns=['Ever had Diabetes?_1'])
X.head()

Unnamed: 0,Education of sample adult_1,Education of sample adult_2,Education of sample adult_3,Education of sample adult_4,Education of sample adult_5,Education of sample adult_6,Education of sample adult_7,Education of sample adult_8,Education of sample adult_9,Education of sample adult_10
0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0


In [9]:
# create our target
y = clean_df6['Ever had Diabetes?_1']
y[:5]

0    0
1    1
2    0
3    0
4    0
Name: Ever had Diabetes?_1, dtype: uint8

In [10]:
# check the balance of the target value
y.value_counts()

# result: good, 1:1 ratio of taking vs. not taking, no need to resample the data

0    26348
1     3134
Name: Ever had Diabetes?_1, dtype: int64

'0' codes for 'No diabetes'.

'1' codes for 'Yes'.

About 10% of the population are diabetic.


We don't want  any imbalance:  one class has too few or too much instances in the training set.  We may need to do some balancing. 

### Split the data into train and test set

In [11]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [12]:
# create a scaler instance
import sklearn as skl
X_scaler = skl.preprocessing.StandardScaler()

In [13]:
# fit/train the scaler
X_scaler.fit(X_train)

StandardScaler()

In [14]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
print (X_train_scaled)

[[-0.27179185 -0.1299162   6.64956908 ... -0.55884112 -0.35543045
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ...  1.78941735 -0.35543045
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112 -0.35543045
  -0.20128996]
 ...
 [ 3.67928616 -0.1299162  -0.15038568 ... -0.55884112 -0.35543045
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112  2.81348997
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112  2.81348997
  -0.20128996]]


In [16]:
print(X_test_scaled)

[[-0.27179185 -0.1299162  -0.15038568 ... -0.55884112  2.81348997
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112 -0.35543045
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112  2.81348997
  -0.20128996]
 ...
 [-0.27179185 -0.1299162  -0.15038568 ... -0.55884112  2.81348997
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ...  1.78941735 -0.35543045
  -0.20128996]
 [-0.27179185 -0.1299162  -0.15038568 ...  1.78941735 -0.35543045
  -0.20128996]]


In [17]:
# 75% train, 25% test for X
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(22111, 10)
(7371, 10)


In [18]:
# 75% train, 25% test for y
print(y_train.shape)
print(y_test.shape)

(22111,)
(7371,)


In [19]:
# create a logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)
model

LogisticRegression(random_state=1)

In [20]:
# fit (train) model using the training data
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

### make predictions

In [21]:
# predict outcomes for the test data set
predictions = model.predict(X_test_scaled)
pd.DataFrame( {'Prediction': predictions, 'Acutal': y_test} )

Unnamed: 0,Prediction,Acutal
10676,0,0
14080,0,0
7111,0,0
2101,0,0
22333,0,0
...,...,...
2592,0,0
4787,0,0
16708,0,0
17847,0,0


In [22]:
# cal. the balanced accuracy score
y_pred = model.predict(X_test_scaled)
score = balanced_accuracy_score(y_test, y_pred)

print('Accuracy score: ', score)

Accuracy score:  0.5


Almost 60% of prediction and acutal DO match.  

# looping over 38 categorical variables to determine their accuracy

In [None]:
import sklearn as skl

target_column = "Ever had Diabetes?"

target_column_onehot = '1'

for column in clean_df2.columns.values:
    if target_column != column:
        XY = clean_df2[ [column,"Ever had Diabetes?"] ].copy()
        XY = XY.dropna()
        if len( XY ) > 0:
            if len( XY[column] ) == len( XY[target_column] ):
                X = pd.get_dummies( XY[ column ].astype('str') )
                y = pd.get_dummies( XY[ target_column ].astype('str') ) 
                y = y[ target_column_onehot ]
                print(column,X.shape,y.shape)
                X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
                X_scaler = skl.preprocessing.StandardScaler()
                X_scaler.fit(X_train)
                X_train_scaled = X_scaler.transform(X_train)
                X_test_scaled = X_scaler.transform(X_test)
                model = LogisticRegression(solver='lbfgs', random_state=1)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                print( column, "score", balanced_accuracy_score(y_test, y_pred) )
    #break

Unnamed: 0 (29482, 29482) (29482,)


Take variables that have accuracy score > 0.53?