In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Label Prep

In [45]:
demographics_raw = pd.read_sas('Data/Demographics/DEMO_I.XPT')
diabetes_raw = pd.read_sas('Data/Questionnaire/DIQ_I.XPT')

# Eliminate people who are under 17 next time
# demographics_raw = demographics_raw[demographics_raw.RIDAGEYR > 17]
# diabetes_raw = diabetes_raw[diabetes_raw.DIQ010 != 9].replace({'DIQ010': 3.0}, {'DIQ010': 2.0})

In [14]:
diabetes = diabetes_raw[diabetes_raw.DIQ010 != 9].replace({'DIQ010': 3.0}, {'DIQ010': 2.0})

2.0    8715
1.0     856
Name: DIQ010, dtype: int64

# Feature Prep

## A1C1 Levels

In [2]:
glycohemoglobin_raw = pd.read_sas('Data/Laboratory/GHB_I.XPT')

In [3]:
glycohemoglobin_raw.head()

Unnamed: 0,SEQN,LBXGH
0,83732.0,7.0
1,83733.0,5.5
2,83734.0,5.8
3,83735.0,5.6
4,83736.0,5.6


## Blood Pressure

In [4]:
blood_pressure_raw = pd.read_sas('Data/Examination/BPX_I.XPT')

In [34]:
# Systolic blood pressure
bp_sys = blood_pressure_raw[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys = bp_sys.fillna(0)
bp_sys = bp_sys.assign(bloodp_sys = lambda df: (df.BPXSY1 + df.BPXSY2 + df.BPXSY3 + df.BPXSY4) / 3)
bp_sys = bp_sys[bp_sys.bloodp_sys != 0]
bp_sys.head()

Unnamed: 0,SEQN,BPXSY1,BPXSY2,BPXSY3,BPXSY4,bloodp_sys
0,83732.0,128.0,124.0,116.0,0.0,122.666667
1,83733.0,146.0,140.0,134.0,0.0,140.0
2,83734.0,138.0,132.0,136.0,0.0,135.333333
3,83735.0,132.0,134.0,136.0,0.0,134.0
4,83736.0,100.0,114.0,98.0,0.0,104.0


In [35]:
# Diastolic blood pressure
bp_dia = blood_pressure_raw[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_dia = bp_dia.fillna(0)
bp_dia = bp_dia.assign(bloodp_dia = lambda df: (df.BPXDI1 + df.BPXDI2 + df.BPXDI3 + df.BPXDI4) / 3)
bp_dia = bp_dia[bp_dia.bloodp_dia != 0]
bp_dia.head()

Unnamed: 0,SEQN,BPXDI1,BPXDI2,BPXDI3,BPXDI4,bloodp_dia
0,83732.0,70.0,64.0,62.0,0.0,65.333333
1,83733.0,88.0,88.0,82.0,0.0,86.0
2,83734.0,46.0,44.0,46.0,0.0,45.333333
3,83735.0,72.0,68.0,70.0,0.0,70.0
4,83736.0,70.0,54.0,56.0,0.0,60.0


In [67]:
blood_pressure = pd.merge(bp_sys, bp_dia, how='inner', on='SEQN')
blood_pressure = blood_pressure[['SEQN', 'bloodp_sys', 'bloodp_dia']]
blood_pressure.head()

Unnamed: 0,SEQN,bloodp_sys,bloodp_dia
0,83732.0,122.666667,65.333333
1,83733.0,140.0,86.0
2,83734.0,135.333333,45.333333
3,83735.0,134.0,70.0
4,83736.0,104.0,60.0


## Glucose Levels (Serum)

In [24]:
serum_raw = pd.read_sas('Data/Laboratory/BIOPRO_I.XPT')

In [36]:
glucose_ser = serum_raw[['SEQN', 'LBXSGL']]
glucose_ser.head()

Unnamed: 0,SEQN,LBXSGL
0,83732.0,94.0
1,83733.0,94.0
2,83734.0,103.0
3,83735.0,63.0
4,83736.0,83.0


# BMI

In [30]:
body_raw = pd.read_sas('Data/Examination/BMX_I.XPT')

In [37]:
bmi = body_raw[['SEQN', 'BMXBMI']]
bmi.head()

Unnamed: 0,SEQN,BMXBMI
0,83732.0,27.8
1,83733.0,30.8
2,83734.0,28.8
3,83735.0,42.4
4,83736.0,20.3


# Creatinine

In [32]:
creatinine_raw = pd.read_sas('Data/Laboratory/ALB_CR_I.XPT')

In [38]:
creatinine = creatinine_raw[['SEQN', 'URXUCR']]
creatinine.head()

Unnamed: 0,SEQN,URXUCR
0,83732.0,41.0
1,83733.0,181.0
2,83734.0,70.0
3,83735.0,102.0
4,83736.0,315.0


# HDL

In [39]:
hdl_raw = pd.read_sas('Data/Laboratory/HDL_I.XPT')

In [40]:
hdl = hdl_raw[['SEQN', 'LBDHDD']]
hdl.head()

Unnamed: 0,SEQN,LBDHDD
0,83732.0,46.0
1,83733.0,63.0
2,83734.0,30.0
3,83735.0,61.0
4,83736.0,53.0


# Triglycerides

In [42]:
trigly_raw = pd.read_sas('Data/Laboratory/TRIGLY_I.XPT')

In [43]:
trigly = trigly_raw[['SEQN', 'LBXTR']]
trigly.head()

Unnamed: 0,SEQN,LBXTR
0,83733.0,147.0
1,83734.0,269.0
2,83736.0,47.0
3,83737.0,46.0
4,83741.0,68.0


# Race/Gender/Age

In [50]:
race_gender_age = demographics_raw[['SEQN', 'RIDRETH1', 'RIAGENDR', 'RIDAGEYR']]
race_gender_age.head()

Unnamed: 0,SEQN,RIDRETH1,RIAGENDR,RIDAGEYR
0,83732.0,3.0,1.0,62.0
1,83733.0,3.0,1.0,53.0
2,83734.0,3.0,1.0,78.0
3,83735.0,3.0,2.0,56.0
4,83736.0,4.0,2.0,42.0


In [51]:
race_gender_age['RIDRETH1'].value_counts()

3.0    3066
4.0    2129
1.0    1921
5.0    1547
2.0    1308
Name: RIDRETH1, dtype: int64

In [52]:
race_gender_age['RIAGENDR'].value_counts()

2.0    5079
1.0    4892
Name: RIAGENDR, dtype: int64

In [53]:
race_gender_age['RIDAGEYR'].value_counts()

5.397605e-79    396
8.000000e+01    376
1.000000e+00    293
2.000000e+00    291
7.000000e+00    238
               ... 
7.400000e+01     56
7.800000e+01     49
7.700000e+01     48
7.600000e+01     46
7.900000e+01     37
Name: RIDAGEYR, Length: 81, dtype: int64

In [58]:
race_gender_age['RIDAGEYR'] = race_gender_age['RIDAGEYR'].apply(lambda x: round(x, 2))
race_gender_age['RIDAGEYR'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0.0     396
80.0    376
1.0     293
2.0     291
7.0     238
       ... 
74.0     56
78.0     49
77.0     48
76.0     46
79.0     37
Name: RIDAGEYR, Length: 81, dtype: int64

# Merge Features

In [71]:
features = pd.merge(glycohemoglobin_raw, race_gender_age, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')
features = pd.merge(features, blood_pressure, how='inner', on='SEQN')