<a href="https://colab.research.google.com/github/DaniliukK95/Drops_Of_Jupyter/blob/Amy-Ying-Lin/un_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

In [51]:
# load your file
clean_df2 = pd.read_csv('clean.csv')
clean_df2

Unnamed: 0.1,Unnamed: 0,Unique ID,Household region,Age of sample adult,Sex of sample adult,Education of sample adult,Hispanic ethnicity of SA,Single + Mult race gps w Hispanic origin,Hispanic origin detail,Single and multiple race groups,...,Medicaid HDHP.1,Medicaid reassignment flag,Paid for by Medicare - plan 1,Paid for by Medicare - plan 2,"Not eligible for Medicaid, CHIP, or other public coverage",SA family poverty ratio,Ratio of family income to poverty threshold for SA’s family,income from wages,income from SSI SSDI,income from retirement
0,0,H056808,3,50,1,1,2,2,3,1,...,,,2.0,,,1.93,7,1,2.0,2.0
1,1,H018779,3,53,1,7,2,3,3,2,...,,,2.0,,,4.45,12,1,2.0,2.0
2,2,H049265,3,56,1,8,2,2,3,1,...,,,2.0,,,5.94,14,1,2.0,2.0
3,3,H007699,3,57,2,5,2,2,3,1,...,,,,,,3.70,11,1,1.0,1.0
4,4,H066034,3,25,1,4,2,3,3,2,...,,,2.0,,,1.66,6,8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,29477,H012375,4,70,2,9,2,2,3,1,...,,,2.0,,,5.11,14,2,2.0,2.0
29478,29478,H052160,4,35,2,7,1,1,2,8,...,,,2.0,,,3.03,10,1,2.0,2.0
29479,29479,H051563,4,72,2,4,2,2,3,1,...,,,,,,2.07,8,2,2.0,2.0
29480,29480,H058432,4,58,1,5,2,2,3,1,...,,,,,,2.05,8,2,1.0,2.0


In [52]:
clean_df2.columns

Index(['Unnamed: 0', 'Unique ID', 'Household region', 'Age of sample adult',
       'Sex of sample adult', 'Education of sample adult',
       'Hispanic ethnicity of SA', 'Single + Mult race gps w Hispanic origin',
       'Hispanic origin detail', 'Single and multiple race groups',
       'General health status', 'Ever had Diabetes?', 'Ever had pre-diabetes?',
       'Taking diabetic pills', 'Taking insuliin', 'Diabetes type',
       'Ever had weak/failing kidneys', 'Weight without shoes (pounds)',
       'Categorical Body Mass Index', 'Health insurance hierarchy under 65',
       'Health insurance hierarchy under 65.1', 'Type of Medicare coverage',
       'Enrolled in Medicare Advantage Plan', 'Medicare HMO',
       'Medicare Advantage Plan', 'Medicare Part D',
       'Medicaid through Marketplace ', 'Medicaid premium',
       'Medicaid deductible', 'Medicaid HDHP', 'Medicaid HDHP.1',
       'Medicaid reassignment flag', 'Paid for by Medicare - plan 1',
       'Paid for by Medicare - 

In [53]:
# remove the unnamed: 0 and unique ID
clean_u = clean_df2.drop(['Unnamed: 0', 'Unique ID'], axis='columns')
clean_u.head()

Unnamed: 0,Household region,Age of sample adult,Sex of sample adult,Education of sample adult,Hispanic ethnicity of SA,Single + Mult race gps w Hispanic origin,Hispanic origin detail,Single and multiple race groups,General health status,Ever had Diabetes?,...,Medicaid HDHP.1,Medicaid reassignment flag,Paid for by Medicare - plan 1,Paid for by Medicare - plan 2,"Not eligible for Medicaid, CHIP, or other public coverage",SA family poverty ratio,Ratio of family income to poverty threshold for SA’s family,income from wages,income from SSI SSDI,income from retirement
0,3,50,1,1,2,2,3,1,2,2,...,,,2.0,,,1.93,7,1,2.0,2.0
1,3,53,1,7,2,3,3,2,2,1,...,,,2.0,,,4.45,12,1,2.0,2.0
2,3,56,1,8,2,2,3,1,2,2,...,,,2.0,,,5.94,14,1,2.0,2.0
3,3,57,2,5,2,2,3,1,4,2,...,,,,,,3.7,11,1,1.0,1.0
4,3,25,1,4,2,3,3,2,3,2,...,,,2.0,,,1.66,6,8,,


In [54]:
# null values for each column
for column in clean_u.columns:
  print( f'Column {column} has {clean_u[column].isnull().sum()} null values. ')

Column Household region has 0 null values. 
Column Age of sample adult has 0 null values. 
Column Sex of sample adult has 0 null values. 
Column Education of sample adult has 0 null values. 
Column Hispanic ethnicity of SA has 0 null values. 
Column Single + Mult race gps w Hispanic origin has 0 null values. 
Column Hispanic origin detail has 0 null values. 
Column Single and multiple race groups has 0 null values. 
Column General health status has 0 null values. 
Column Ever had Diabetes? has 0 null values. 
Column Ever had pre-diabetes? has 0 null values. 
Column Taking diabetic pills has 23780 null values. 
Column Taking insuliin has 23780 null values. 
Column Diabetes type has 26348 null values. 
Column Ever had weak/failing kidneys has 0 null values. 
Column Weight without shoes (pounds) has 0 null values. 
Column Categorical Body Mass Index has 0 null values. 
Column Health insurance hierarchy under 65 has 8908 null values. 
Column Health insurance hierarchy under 65.1 has 20575 

Columns with lots of null values:
1. taking diabeti pills
2. taking insulin
3. diabetes type
4. medicare adv plan
5. not eligible for medicaid

What should we do, drop them or some?

The age of the sample adult col. is on a different scale than the other columns, it needs to have a similar scale on all the variables in order to use unsupervised learning algorithms.  So the age column should be rescaled.  Divide the col. by 10?

In [55]:
# transform the age col.
clean_u['Age of sample adult'] = clean_u['Age of sample adult'] / 10
clean_u.head()

Unnamed: 0,Household region,Age of sample adult,Sex of sample adult,Education of sample adult,Hispanic ethnicity of SA,Single + Mult race gps w Hispanic origin,Hispanic origin detail,Single and multiple race groups,General health status,Ever had Diabetes?,...,Medicaid HDHP.1,Medicaid reassignment flag,Paid for by Medicare - plan 1,Paid for by Medicare - plan 2,"Not eligible for Medicaid, CHIP, or other public coverage",SA family poverty ratio,Ratio of family income to poverty threshold for SA’s family,income from wages,income from SSI SSDI,income from retirement
0,3,5.0,1,1,2,2,3,1,2,2,...,,,2.0,,,1.93,7,1,2.0,2.0
1,3,5.3,1,7,2,3,3,2,2,1,...,,,2.0,,,4.45,12,1,2.0,2.0
2,3,5.6,1,8,2,2,3,1,2,2,...,,,2.0,,,5.94,14,1,2.0,2.0
3,3,5.7,2,5,2,2,3,1,4,2,...,,,,,,3.7,11,1,1.0,1.0
4,3,2.5,1,4,2,3,3,2,3,2,...,,,2.0,,,1.66,6,8,,


In [56]:
# datetypes
clean_u.dtypes

Household region                                                 int64
Age of sample adult                                            float64
Sex of sample adult                                              int64
Education of sample adult                                        int64
Hispanic ethnicity of SA                                         int64
Single + Mult race gps w Hispanic origin                         int64
Hispanic origin detail                                           int64
Single and multiple race groups                                  int64
General health status                                            int64
Ever had Diabetes?                                               int64
Ever had pre-diabetes?                                           int64
Taking diabetic pills                                          float64
Taking insuliin                                                float64
Diabetes type                                                  float64
Ever h

In [57]:
#
import pandas as pd
clean_u1 = pd.get_dummies(clean_u, columns=None)
clean_u1

Unnamed: 0,Household region,Age of sample adult,Sex of sample adult,Education of sample adult,Hispanic ethnicity of SA,Single + Mult race gps w Hispanic origin,Hispanic origin detail,Single and multiple race groups,General health status,Ever had Diabetes?,...,Medicaid HDHP.1,Medicaid reassignment flag,Paid for by Medicare - plan 1,Paid for by Medicare - plan 2,"Not eligible for Medicaid, CHIP, or other public coverage",SA family poverty ratio,Ratio of family income to poverty threshold for SA’s family,income from wages,income from SSI SSDI,income from retirement
0,3,5.0,1,1,2,2,3,1,2,2,...,,,2.0,,,1.93,7,1,2.0,2.0
1,3,5.3,1,7,2,3,3,2,2,1,...,,,2.0,,,4.45,12,1,2.0,2.0
2,3,5.6,1,8,2,2,3,1,2,2,...,,,2.0,,,5.94,14,1,2.0,2.0
3,3,5.7,2,5,2,2,3,1,4,2,...,,,,,,3.70,11,1,1.0,1.0
4,3,2.5,1,4,2,3,3,2,3,2,...,,,2.0,,,1.66,6,8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29477,4,7.0,2,9,2,2,3,1,2,2,...,,,2.0,,,5.11,14,2,2.0,2.0
29478,4,3.5,2,7,1,1,2,8,4,1,...,,,2.0,,,3.03,10,1,2.0,2.0
29479,4,7.2,2,4,2,2,3,1,2,2,...,,,,,,2.07,8,2,2.0,2.0
29480,4,5.8,1,5,2,2,3,1,3,2,...,,,,,,2.05,8,2,1.0,2.0


In [58]:
# feature
X = clean_u.drop()

ValueError: ignored