In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/Hypertension_Final_Project/cleaned_file/2013

/content/drive/MyDrive/Hypertension_Final_Project/cleaned_file/2013


In [6]:
import pandas as pd
import numpy as np

In [7]:
pd.options.display.float_format = '{:20,.2f}'.format

In [8]:
year = 2013

In [9]:
BASE_PATH = "/content/drive/MyDrive/Hypertension_Final_Project/data_" + str(year) + '/'

In [10]:
BASE_PATH

'/content/drive/MyDrive/Hypertension_Final_Project/data_2013/'

# Hypertension Identification


## Blood Pressumre

In [11]:
# read sas file
blood_pressure = pd.read_sas(filepath_or_buffer = BASE_PATH + 'BPX.XPT')
# obtain Systolic_BP and Diastolic_BP data
blood_pressure_columns = ['SEQN', 'BPXPLS', 'BPXSY1', 'BPXDI1', 'BPXSY2', 'BPXDI2', 'BPXSY3', 'BPXDI3', 'BPXSY4', 'BPXDI4']
blood_pressure = blood_pressure[blood_pressure_columns]
# calculate average Systolic_BP and Diastolic_BP (average all 4 tests)
blood_pressure['Systolic_BP'] = blood_pressure.loc[:,['BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']].mean(axis = 1)
blood_pressure['Diastolic_BP'] = blood_pressure.loc[:,['BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']].mean(axis = 1)
# obtain clean the BP data
blood_pressure_final = blood_pressure.loc[:,['SEQN', 'Systolic_BP', 'Diastolic_BP']]

## Hypertension (Questionnaire)

In [12]:
# read questionnaire data
Hypertension = pd.read_sas(filepath_or_buffer = BASE_PATH + 'BPQ.XPT')
# preprocess the data
Hypertension_columns = ['SEQN', 'BPQ020', 'BPQ050A']
Hypertension = Hypertension[Hypertension_columns]

Told_High_blood_pressure
- 0：not been told that s/he has hypertension, also called high blood pressure
- 1：been told that s/he has hypertension, also called high blood pressure

In [13]:
Hypertension.loc[Hypertension['BPQ020'] == 2, 'Told_High_blood_pressure_dummy'] = 0
Hypertension.loc[Hypertension['BPQ020'] == 1, 'Told_High_blood_pressure_dummy'] = 1

In [14]:
# Taking Medicine for HBP
Hypertension.loc[Hypertension['BPQ050A'] == 2, 'Taking_Medicine_for_HBP_dummy'] = 0
Hypertension.loc[Hypertension['BPQ050A'] == 1, 'Taking_Medicine_for_HBP_dummy'] = 1

In [15]:
Hypertension.drop(columns = ['BPQ020', 'BPQ050A'], inplace = True)
Hypertension_combine = Hypertension.merge(blood_pressure_final, on = 'SEQN', how = 'outer')

In [16]:
Hypertension_final = Hypertension_combine[~((Hypertension_combine['Taking_Medicine_for_HBP_dummy'].isnull()) & (Hypertension_combine['Systolic_BP'].isnull()))]

In [17]:
Hypertension_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7653 entries, 0 to 10010
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   SEQN                            7653 non-null   float64
 1   Told_High_blood_pressure_dummy  6162 non-null   float64
 2   Taking_Medicine_for_HBP_dummy   1814 non-null   float64
 3   Systolic_BP                     7531 non-null   float64
 4   Diastolic_BP                    7531 non-null   float64
dtypes: float64(5)
memory usage: 358.7 KB


Hypertension
- Systolic_BP > 130
- Diastolic_BP > 80
- Taking Medicine for HBP

In [18]:
Hypertension_final.loc[(Hypertension_final['Systolic_BP'] > 130) | (Hypertension_final['Diastolic_BP'] > 80) | (Hypertension_final['Taking_Medicine_for_HBP_dummy']) == 1, 'Hypertension_dummy'] = 1
Hypertension_final.loc[(Hypertension_final['Systolic_BP'] <= 130) | (Hypertension_final['Diastolic_BP'] <= 80), 'Hypertension_dummy'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [19]:
Hypertension_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7653 entries, 0 to 10010
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   SEQN                            7653 non-null   float64
 1   Told_High_blood_pressure_dummy  6162 non-null   float64
 2   Taking_Medicine_for_HBP_dummy   1814 non-null   float64
 3   Systolic_BP                     7531 non-null   float64
 4   Diastolic_BP                    7531 non-null   float64
 5   Hypertension_dummy              7642 non-null   float64
dtypes: float64(6)
memory usage: 418.5 KB


In [20]:
Hypertension_final = Hypertension_final[~Hypertension_final['Hypertension_dummy'].isnull()]

In [21]:
Hypertension_final['Hypertension_dummy'].value_counts()

0.00    6965
1.00     677
Name: Hypertension_dummy, dtype: int64

In [22]:
Hypertension_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7642 entries, 0 to 10010
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   SEQN                            7642 non-null   float64
 1   Told_High_blood_pressure_dummy  6151 non-null   float64
 2   Taking_Medicine_for_HBP_dummy   1803 non-null   float64
 3   Systolic_BP                     7531 non-null   float64
 4   Diastolic_BP                    7531 non-null   float64
 5   Hypertension_dummy              7642 non-null   float64
dtypes: float64(6)
memory usage: 417.9 KB


In [23]:
Hypertension_final[Hypertension_final['Told_High_blood_pressure_dummy'] == 0]['Hypertension_dummy'].value_counts()

0.00    3781
1.00     225
Name: Hypertension_dummy, dtype: int64

In [24]:
Hypertension_final[Hypertension_final['Told_High_blood_pressure_dummy'] == 1]['Hypertension_dummy'].value_counts()

0.00    1694
1.00     451
Name: Hypertension_dummy, dtype: int64

In [25]:
Hypertension_final.to_csv('Hypertension_final.csv')

In [26]:
Hypertension_SEQN = Hypertension_final['SEQN']

In [27]:
Hypertension_SEQN.to_csv('Hyper_SEQN.csv')

# demographic

In [28]:
demographic = pd.read_sas(filepath_or_buffer = BASE_PATH + 'DEMO.XPT')

# respondents that were both interviewed and examined
demographic  = demographic[demographic['RIDSTATR'] == 2]

demographic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 10174
Data columns (total 47 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      9813 non-null   float64
 1   SDDSRVYR  9813 non-null   float64
 2   RIDSTATR  9813 non-null   float64
 3   RIAGENDR  9813 non-null   float64
 4   RIDAGEYR  9813 non-null   float64
 5   RIDAGEMN  642 non-null    float64
 6   RIDRETH1  9813 non-null   float64
 7   RIDRETH3  9813 non-null   float64
 8   RIDEXMON  9813 non-null   float64
 9   RIDEXAGM  4213 non-null   float64
 10  DMQMILIZ  6066 non-null   float64
 11  DMQADFC   523 non-null    float64
 12  DMDBORN4  9813 non-null   float64
 13  DMDCITZN  9809 non-null   float64
 14  DMDYRSUS  1852 non-null   float64
 15  DMDEDUC3  2703 non-null   float64
 16  DMDEDUC2  5588 non-null   float64
 17  DMDMARTL  5588 non-null   float64
 18  RIDEXPRG  1262 non-null   float64
 19  SIALANG   9813 non-null   float64
 20  SIAPROXY  9812 non-null   flo

In [29]:
demo_columns = ['SEQN', 'SDDSRVYR', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'DMDBORN4', 'DMDMARTL', 'DMDEDUC3', 'DMDEDUC2', 'DMDHREDU', 'INDFMIN2', 'INDFMPIR']
demographic = demographic[demo_columns]

## Marital_Status
- 1: Married/Living with partner
- 2: Widowed/Divorced/Separated
- 3: Never Married

In [30]:
demographic.loc[((demographic['DMDMARTL'] == 1) | (demographic['DMDMARTL'] == 6)), 'Marital_Status_dummy'] = 1
demographic.loc[((demographic['DMDMARTL'] == 2) | (demographic['DMDMARTL'] == 3) | (demographic['DMDMARTL'] == 4)), 'Marital_Status_dummy'] = 2
demographic.loc[demographic['DMDMARTL'] == 5 , 'Marital_Status_dummy'] = 3

In [31]:
demographic['Marital_Status_dummy'].value_counts()

1.00    3293
2.00    1227
3.00    1065
Name: Marital_Status_dummy, dtype: int64

In [32]:
demographic.drop(columns = ['DMDMARTL'], inplace = True)

## Demographic: Education
- 1: Less than 9th grade
- 2: 9-11th grade(includes 12th grade with no diploma)
- 3: High school graduate/GED or equipment
- 4: some college or AA degreee
- 5: college graduate or above

In [33]:
demographic['Education_dummy'] = demographic['DMDEDUC2']
demographic.loc[(demographic['DMDEDUC3'] < 9) | (demographic['DMDEDUC3'] == 66), 'Education_dummy'] = 1
demographic.loc[(demographic['DMDEDUC3'] > 9) & (demographic['DMDEDUC3'] < 13), 'Education_dummy'] = 2
demographic.loc[(demographic['DMDEDUC3'] == 14) | (demographic['DMDEDUC3'] == 13), 'Education_dummy'] = 3
demographic.loc[demographic['DMDEDUC3'] == 15, 'Education_dummy'] = 4

In [34]:
demographic['Education_dummy'].replace({7: np.nan, 9:np.nan}, inplace = True)

In [35]:
demographic['Education_dummy'].value_counts()

1.00    2346
4.00    1799
5.00    1400
3.00    1399
2.00    1148
Name: Education_dummy, dtype: int64

In [36]:
demographic = demographic.drop(columns = ['DMDEDUC3', 'DMDEDUC2'])

## Income & Income_to_Proverty_Ratio
Income_to_Proverty_Ratio
- 1: less than 5 (proverty)
- 2: >= 5


In [37]:
demographic['Income_dummy'] = demographic['INDFMIN2']
demographic.loc[demographic['INDFMIN2'] == 12, 'Income_dummy'] = 6
demographic.loc[demographic['INDFMIN2'] == 13, 'Income_dummy'] = 4
demographic.loc[demographic['INDFMIN2'] == 14, 'Income_dummy'] = 11
demographic.loc[demographic['INDFMIN2'] == 15, 'Income_dummy'] = 12
demographic['Income_dummy'].replace({77: np.nan, 99:np.nan}, inplace = True)

In [38]:
demographic['Income_dummy'].value_counts()

12.00    1633
 6.00    1358
 7.00     895
 5.00     873
11.00     812
 4.00     801
 8.00     716
 3.00     656
 9.00     493
 2.00     450
10.00     368
 1.00     359
Name: Income_dummy, dtype: int64

In [39]:
demographic.loc[demographic['INDFMPIR'] <= 1, 'Poverty_dummy'] = 1
demographic.loc[demographic['INDFMPIR'] > 1, 'Poverty_dummy'] = 0

In [40]:
demographic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 10174
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SEQN                  9813 non-null   float64
 1   SDDSRVYR              9813 non-null   float64
 2   RIAGENDR              9813 non-null   float64
 3   RIDAGEYR              9813 non-null   float64
 4   RIDRETH1              9813 non-null   float64
 5   DMDBORN4              9813 non-null   float64
 6   DMDHREDU              9551 non-null   float64
 7   INDFMIN2              9710 non-null   float64
 8   INDFMPIR              9083 non-null   float64
 9   Marital_Status_dummy  5585 non-null   float64
 10  Education_dummy       8092 non-null   float64
 11  Income_dummy          9414 non-null   float64
 12  Poverty_dummy         9083 non-null   float64
dtypes: float64(13)
memory usage: 1.0 MB


In [41]:
demographic['DMDBORN4'].replace({7: np.nan, 9:np.nan}, inplace = True)

In [42]:
# demographic.loc[demographic['DMDBORN2'] == 1, 'Country_of_birth_dummy'] = 1
# demographic.loc[(demographic['DMDBORN2'] > 1) & (demographic['DMDBORN2'] < 7), 'Country_of_birth_dummy'] = 2


In [43]:
demographic.rename(columns={'RIAGENDR': 'gender_dummy', 'RIDAGEYR': 'age', 'RIDRETH1': 'race_dummy', 'DMDBORN4': 'Country_of_birth_dummy', 'DMDHREDU': 'Household_reference_education_dummy'}, inplace = True)

In [44]:
demographic.drop(columns = ['INDFMIN2', 'INDFMPIR'], inplace = True)

In [45]:
demographic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 10174
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   SEQN                                 9813 non-null   float64
 1   SDDSRVYR                             9813 non-null   float64
 2   gender_dummy                         9813 non-null   float64
 3   age                                  9813 non-null   float64
 4   race_dummy                           9813 non-null   float64
 5   Country_of_birth_dummy               9813 non-null   float64
 6   Household_reference_education_dummy  9551 non-null   float64
 7   Marital_Status_dummy                 5585 non-null   float64
 8   Education_dummy                      8092 non-null   float64
 9   Income_dummy                         9414 non-null   float64
 10  Poverty_dummy                        9083 non-null   float64
dtypes: float64(11)
memory usage: 

In [46]:
len(demographic.columns) == 11

True

In [47]:
demographic.to_csv('cleaned_demographic.csv')

# Body Measure

In [48]:
# read body measure file from Examination Data
body_measure = pd.read_sas(filepath_or_buffer = BASE_PATH + 'BMX.XPT')

# select columns
body_columns = ['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXARMC', 'BMXWAIST']
body_measure = body_measure[body_columns]

# rename the columns
body_measure.rename(columns={'BMXWT': 'Weight(kg)', 'BMXHT': 'Height(cm)', 'BMXBMI': 'BMI', 'BMXARMC': 'Arm_Circumference', 'BMXWAIST': 'Waist_Circumference'}, inplace = True)

In [49]:
body_measure.head()

Unnamed: 0,SEQN,Weight(kg),Height(cm),BMI,Arm_Circumference,Waist_Circumference
0,73557.0,78.3,171.3,26.7,35.3,100.0
1,73558.0,89.5,176.8,28.6,34.7,107.6
2,73559.0,88.9,175.3,28.9,33.5,109.2
3,73560.0,32.2,137.3,17.1,21.0,61.0
4,73561.0,52.0,162.4,19.7,25.2,


In [50]:
len(body_measure.columns) == 6

True

In [51]:
body_measure.to_csv('cleaned_body_measure.csv')

# Other factors

## Consumer Behavior
The ratio is more important than the absolute value as the total spending amounts are different, therefore, the ratio is calcuated

In [52]:
# read sas file
consumer = pd.read_sas(filepath_or_buffer = BASE_PATH + 'CBQ.XPT')

consumer_col = ['SEQN', 'CBD070', 'CBD090', 'CBD110', 'CBD120', 'CBD130']

consumer = consumer[consumer_col]

# get the ratio of each spending category
ratio_cal = consumer.iloc[:,1:].div(consumer.iloc[:,1:].sum(axis=1),axis=0)

ratio_cal['Spending_on_food'] = ratio_cal['CBD070'] + ratio_cal['CBD110']
ratio_cal.rename(columns = {'CBD090': 'Spending_on_nonfood', 'CBD120': 'Spending_on_eating_out', 'CBD130': 'Spending_on_carryout_deliverred'}, inplace = True)

consumer_final = pd.concat([consumer, ratio_cal], axis=1)
consumer_final = consumer_final.drop(columns = ['CBD070', 'CBD090', 'CBD110', 'CBD120', 'CBD130'])

In [53]:
consumer_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10175 entries, 0 to 10174
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   SEQN                             10175 non-null  float64
 1   Spending_on_nonfood              10043 non-null  float64
 2   Spending_on_eating_out           10052 non-null  float64
 3   Spending_on_carryout_deliverred  10052 non-null  float64
 4   Spending_on_food                 10052 non-null  float64
dtypes: float64(5)
memory usage: 397.6 KB


In [54]:
print(len(consumer_final.columns) == 5) 

True


In [55]:
consumer_final.to_csv('cleaned_consumer_behavior.csv')

## Alcohol drink

In [56]:
alcohol = pd.read_sas(filepath_or_buffer = BASE_PATH + 'ALQ.XPT')

In [57]:
alcohol['ALQ120Q'].replace({777: np.nan, 999:np.nan}, inplace = True)

In [58]:
alcohol[alcohol['ALQ120U'].isnull()]['ALQ120Q'].value_counts()

0.00    882
Name: ALQ120Q, dtype: int64

In [59]:
alcohol.loc[alcohol['ALQ120Q'] < 0.01, 'alcohol_drink_frequency_dummy'] = 0
alcohol.loc[((alcohol['ALQ120Q'] <= 12) | (alcohol['ALQ120Q'] >= 1)) & (alcohol['ALQ120U'] == 3), 'alcohol_drink_frequency_dummy'] = 1
alcohol.loc[(alcohol['ALQ120Q'] == 1)  & (alcohol['ALQ120U'] == 2), 'alcohol_drink_frequency_dummy'] = 1
alcohol.loc[(~(alcohol['ALQ120Q'].isnull())) & (~(alcohol['ALQ120U'].isnull())) & (alcohol['alcohol_drink_frequency_dummy'].isnull()), 'alcohol_drink_frequency_dummy'] = 2


In [60]:
alcohol[alcohol['alcohol_drink_frequency_dummy'].isnull()]['ALQ120Q'].value_counts()

Series([], Name: ALQ120Q, dtype: int64)

In [61]:
alcohol.head()

Unnamed: 0,SEQN,ALQ101,ALQ110,ALQ120Q,ALQ120U,ALQ130,ALQ141Q,ALQ141U,ALQ151,ALQ160,alcohol_drink_frequency_dummy
0,73557.0,1.0,,1.0,3.0,1.0,0.0,,1.0,,1.0
1,73558.0,1.0,,7.0,1.0,4.0,2.0,1.0,1.0,0.0,2.0
2,73559.0,1.0,,0.0,,,,,2.0,,0.0
3,73561.0,1.0,,0.0,,,,,2.0,,0.0
4,73562.0,1.0,,5.0,3.0,1.0,0.0,,2.0,0.0,1.0


In [62]:
alcohol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5924 entries, 0 to 5923
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SEQN                           5924 non-null   float64
 1   ALQ101                         5421 non-null   float64
 2   ALQ110                         1631 non-null   float64
 3   ALQ120Q                        4475 non-null   float64
 4   ALQ120U                        3593 non-null   float64
 5   ALQ130                         3596 non-null   float64
 6   ALQ141Q                        3595 non-null   float64
 7   ALQ141U                        1464 non-null   float64
 8   ALQ151                         4477 non-null   float64
 9   ALQ160                         1866 non-null   float64
 10  alcohol_drink_frequency_dummy  4475 non-null   float64
dtypes: float64(11)
memory usage: 509.2 KB


In [63]:
alcohol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5924 entries, 0 to 5923
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SEQN                           5924 non-null   float64
 1   ALQ101                         5421 non-null   float64
 2   ALQ110                         1631 non-null   float64
 3   ALQ120Q                        4475 non-null   float64
 4   ALQ120U                        3593 non-null   float64
 5   ALQ130                         3596 non-null   float64
 6   ALQ141Q                        3595 non-null   float64
 7   ALQ141U                        1464 non-null   float64
 8   ALQ151                         4477 non-null   float64
 9   ALQ160                         1866 non-null   float64
 10  alcohol_drink_frequency_dummy  4475 non-null   float64
dtypes: float64(11)
memory usage: 509.2 KB


In [64]:
alcohol['alcohol_drink_per_day'] = alcohol['ALQ130']
alcohol['alcohol_drink_per_day'].replace({777: np.nan, 999:np.nan}, inplace = True)

In [65]:
alcohol_columns = ['SEQN', 'alcohol_drink_frequency_dummy', 'alcohol_drink_per_day']

In [66]:
alcohol_final = alcohol[alcohol_columns]

In [67]:
len(alcohol_final.columns) == 3

True

In [68]:
alcohol_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5924 entries, 0 to 5923
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SEQN                           5924 non-null   float64
 1   alcohol_drink_frequency_dummy  4475 non-null   float64
 2   alcohol_drink_per_day          3593 non-null   float64
dtypes: float64(3)
memory usage: 139.0 KB


In [69]:
alcohol_final.to_csv('cleaned_alcohol.csv')

## Diet behavior

In [70]:
Diet_behavior = pd.read_sas(filepath_or_buffer = BASE_PATH + 'DBQ.XPT')

Diet_behavior_cols = ['SEQN', 'DBQ197', 'DBQ223A',  'DBQ223B', 'DBQ223C', 'DBQ223D', 'DBQ223E', 'DBQ223U', 'DBD895', 'DBD905', 'DBD910']
Diet_behavior = Diet_behavior[Diet_behavior_cols]

# get the milk type by assigning row-wise minimum value of all milk type columns
Diet_behavior['milk_type_dummy'] = Diet_behavior[['DBQ223A',  'DBQ223B', 'DBQ223C', 'DBQ223D', 'DBQ223E', 'DBQ223U']].min(axis=1)
Diet_behavior.loc[Diet_behavior['milk_type_dummy'] == 99, 'milk_type_dummy'] = np.nan


Diet_behavior.drop(columns = ['DBQ223A',  'DBQ223B', 'DBQ223C', 'DBQ223D', 'DBQ223E', 'DBQ223U'], inplace = True)

Diet_behavior['milk_product_consumption_freq_dummy'] = Diet_behavior['DBQ197']
Diet_behavior.loc[Diet_behavior['DBQ197'] == 4, 'milk_product_consumption_freq_dummy'] = 2

# number of meals not home prepared
Diet_behavior['eat_outside'] = Diet_behavior['DBD895']
Diet_behavior.loc[Diet_behavior['DBD895'] == 5555, 'eat_outside'] = 22
Diet_behavior.loc[Diet_behavior['DBD895'] > 5555, 'eat_outside'] = np.nan

# number of ready-to-eat foods in past 30 days
Diet_behavior['ready_to_eat_food'] = Diet_behavior['DBD905']
Diet_behavior.loc[Diet_behavior['DBD905'] == 6666, 'ready_to_eat_food'] = 100
Diet_behavior.loc[Diet_behavior['DBD905'] > 6666, 'ready_to_eat_food'] = np.nan

# of frozen meals/pizza in past 30 days
Diet_behavior['frozen_food'] = Diet_behavior['DBD910']
Diet_behavior.loc[Diet_behavior['DBD910'] == 6666, 'frozen_food'] = 100
Diet_behavior.loc[Diet_behavior['DBD910'] > 6666, 'frozen_food'] = np.nan

Diet_behavior.drop(columns = ['DBQ197', 'DBD895', 'DBD905', 'DBD910'], inplace = True)

In [71]:
len(Diet_behavior.columns) == 6

True

In [72]:
Diet_behavior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10175 entries, 0 to 10174
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   SEQN                                 10175 non-null  float64
 1   milk_type_dummy                      8448 non-null   float64
 2   milk_product_consumption_freq_dummy  9769 non-null   float64
 3   eat_outside                          9687 non-null   float64
 4   ready_to_eat_food                    9668 non-null   float64
 5   frozen_food                          9669 non-null   float64
dtypes: float64(6)
memory usage: 477.1 KB


In [73]:
Diet_behavior.to_csv('cleaned_diet_behavior.csv')

## Mental Health

In [74]:
mental_health = pd.read_sas(filepath_or_buffer = BASE_PATH + 'DPQ.XPT')

mental_health_cols = ['SEQN', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060']
mental_health = mental_health[mental_health_cols]

mental_health_cols = ['DPQ030', 'DPQ040', 'DPQ050', 'DPQ060']

for col in mental_health_cols:
  mental_health.loc[(mental_health[col] < 1) , col + '_1'] = 'low'
  mental_health.loc[(mental_health[col] == 1) | (mental_health[col] == 2), col + '_1'] = 'mild'
  mental_health.loc[(mental_health[col] == 3) , col + '_1'] = 'severe'

mental_health = mental_health.rename(columns = {'DPQ030_1': 'trouble_sleeping_dummy', 'DPQ040_1': 'feeling_tired_dummy', 'DPQ050_1':'poor_appetite_dummy', 'DPQ060_1':'feeling_bad_dummy'})
mental_health.drop(columns=['DPQ030', 'DPQ040', 'DPQ050', 'DPQ060'], inplace = True)


In [75]:
len(mental_health.columns) == 5

True

In [76]:
mental_health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5924 entries, 0 to 5923
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SEQN                    5924 non-null   float64
 1   trouble_sleeping_dummy  5394 non-null   object 
 2   feeling_tired_dummy     5394 non-null   object 
 3   poor_appetite_dummy     5393 non-null   object 
 4   feeling_bad_dummy       5390 non-null   object 
dtypes: float64(1), object(4)
memory usage: 231.5+ KB


In [77]:
mental_health.to_csv('cleaned_mental_health.csv')

## diabetes

In [78]:
diabetes = pd.read_sas(filepath_or_buffer = BASE_PATH + 'DIQ.XPT')
diabetes.loc[(diabetes['DIQ010']==9),'DIQ010']=np.nan
diabetes.loc[(diabetes['DIQ050']==9),'DIQ010']=np.nan
diabetes.loc[(diabetes['DIQ160']==9),'DIQ010']=np.nan
diabetes=diabetes[['SEQN','DIQ010','DIQ050','DIQ160']]

dia_cols=['SEQN','have_diabetes_1_dummy','Taking_insulin_now_1_dummy','have_prediabetes_1_dummy']
diabetes.columns=dia_cols
clean_diabetes=diabetes

In [79]:
clean_diabetes.to_csv('cleaned_diabetes.csv')

## Health Insurance

In [80]:
insurance = pd.read_sas(filepath_or_buffer = BASE_PATH + 'HIQ.XPT')

insurance.loc[(insurance['HIQ011']==9),'HIQ011']=np.nan
insurance.loc[(insurance['HIQ011']==7),'HIQ011']=np.nan
insurance.loc[(insurance['HIQ270']==9),'HIQ270']=np.nan
insurance.loc[(insurance['HIQ270']==7),'HIQ270']=np.nan

insurance=insurance[['SEQN','HIQ011','HIQ270']]

insurance_cols=['SEQN','Covered_by_health_insurance_1_dummy','plans_cover_prescriptions_1_dummy']
insurance.columns=insurance_cols
clean_insurance=insurance

In [81]:
clean_insurance.to_csv('cleaned_insurance.csv')

## Current_health_Status

In [82]:
health_status = pd.read_sas(filepath_or_buffer = BASE_PATH + 'HSQ.XPT')

health_status.loc[(health_status['HSQ500']==9),'HSQ500']=np.nan
health_status.loc[(health_status['HSQ500']==7),'HSQ500']=np.nan
health_status.loc[(health_status['HSQ510']==9),'HSQ510']=np.nan
health_status.loc[(health_status['HSQ510']==7),'HSQ510']=np.nan
health_status.loc[(health_status['HSQ520']==9),'HSQ520']=np.nan
health_status=health_status[['SEQN','HSQ500','HSQ510','HSQ520']]

health_status_cols=['SEQN','head_cold_or_chest_cold_dummy','stomach_or_intestinal_illness_dummy','flu_pneumonia_ear_infection_dummy']
health_status.columns=health_status_cols

In [83]:
health_status.to_csv('cleaned_health_status.csv')

## Occupation

In [84]:
occupation = pd.read_sas(filepath_or_buffer = BASE_PATH + 'OCQ.XPT')

occupation=occupation[['SEQN','OCD150']]
occupation.loc[(occupation['OCD150']!=1),'OCD150']=0
occupation['OCD150'].value_counts()
occ_cols=['SEQN','have_job_1_dummy']
occupation.columns=occ_cols

In [85]:
occupation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6464 entries, 0 to 6463
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SEQN              6464 non-null   float64
 1   have_job_1_dummy  6464 non-null   float64
dtypes: float64(2)
memory usage: 101.1 KB


In [86]:
occupation.to_csv('cleaned_occupation.csv')

## Medical Condition

In [87]:
med_condition = pd.read_sas(filepath_or_buffer = BASE_PATH + 'MCQ.XPT')

med_condition=med_condition[['SEQN','MCQ010','MCQ053','MCQ092']]

med_condition.loc[(med_condition['MCQ010']==9),'MCQ010']=np.nan
med_condition.loc[(med_condition['MCQ053']==7),'MCQ053']=np.nan
med_condition.loc[(med_condition['MCQ053']==9),'MCQ053']=np.nan
med_condition.loc[(med_condition['MCQ092']==7),'MCQ092']=np.nan
med_condition.loc[(med_condition['MCQ092']==9),'MCQ092']=np.nan

med_cols=['SEQN','have_asthma_1_dummy','treatment_for_anemia_1_dummy','blood_transfusion_1_dummy']
med_condition.columns=med_cols

In [88]:
med_condition.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9770 entries, 0 to 9769
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   SEQN                          9770 non-null   float64
 1   have_asthma_1_dummy           9761 non-null   float64
 2   treatment_for_anemia_1_dummy  9762 non-null   float64
 3   blood_transfusion_1_dummy     8514 non-null   float64
dtypes: float64(4)
memory usage: 305.4 KB


In [89]:
med_condition.to_csv('cleaned_med_condition.csv')

## Physical Activity

In [90]:
physical_act = pd.read_sas(filepath_or_buffer = BASE_PATH + 'PAQ.XPT')

physical_act=physical_act[['SEQN','PAQ605','PAQ620','PAQ635','PAQ650','PAQ665']]
physical_act.loc[(physical_act['PAQ605']==9),'PAQ605']=np.nan
physical_act.loc[(physical_act['PAQ620']==9),'PAQ620']=np.nan

physical_act_cols=['SEQN','Vigorous_work_activity_1_dummy','Moderate work activity_1_dummy','Walk_or_bicycle_1_dummy','Vigorous_recreational_activities_1_dummy','Moderate_recreational_activities_1_dummy']
physical_act.columns=physical_act_cols

In [91]:
physical_act.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9484 entries, 0 to 9483
Data columns (total 6 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   SEQN                                      9484 non-null   float64
 1   Vigorous_work_activity_1_dummy            7148 non-null   float64
 2   Moderate work activity_1_dummy            7146 non-null   float64
 3   Walk_or_bicycle_1_dummy                   7147 non-null   float64
 4   Vigorous_recreational_activities_1_dummy  7147 non-null   float64
 5   Moderate_recreational_activities_1_dummy  7145 non-null   float64
dtypes: float64(6)
memory usage: 444.7 KB


In [92]:
physical_act.to_csv('cleaned_physical_act.csv')

## Sleeping disorder

In [93]:
sleep_disorder = pd.read_sas(filepath_or_buffer = BASE_PATH + 'SLQ.XPT')


sleep_disorder.loc[(sleep_disorder['SLQ050']==9),'SLQ050']=np.nan

sleep_disorder=sleep_disorder[['SEQN', 'SLD010H', 'SLQ050']]
sleep_cols=['SEQN', 'Sleep_hours', 'had_trouble_sleeping_1_dummy']
sleep_disorder.columns=sleep_cols

In [94]:
sleep_disorder.head()

Unnamed: 0,SEQN,Sleep_hours,had_trouble_sleeping_1_dummy
0,73557.0,7.0,1.0
1,73558.0,9.0,2.0
2,73559.0,8.0,2.0
3,73561.0,9.0,2.0
4,73562.0,5.0,2.0


In [95]:
# # sleep hour
# sleep_disorder['hour'] = pd.to_numeric(sleep_disorder['Usual_sleep_time'].str[:2])
# sleep_disorder['min'] = pd.to_numeric(sleep_disorder['Usual_sleep_time'].str[3:])
# sleep_disorder = sleep_disorder[~(sleep_disorder['hour'] > 30)]
# sleep_disorder['min_decimal'] = sleep_disorder['min'] / 60
# sleep_disorder['final_sleep_time'] = sleep_disorder['hour'] + sleep_disorder['min_decimal']


# # Wake hour
# sleep_disorder['wake_hour'] = pd.to_numeric(sleep_disorder['Usual_wake_time'].str[:2])
# sleep_disorder['wake_min'] = pd.to_numeric(sleep_disorder['Usual_wake_time'].str[3:])
# sleep_disorder = sleep_disorder[~(sleep_disorder['wake_hour'] > 30)]
# sleep_disorder['wake_min_decimal'] = sleep_disorder['wake_min'] / 60
# sleep_disorder['final_wake_time'] = sleep_disorder['wake_hour'] + sleep_disorder['wake_min_decimal']

In [96]:
sleep_disorder.columns

Index(['SEQN', 'Sleep_hours', 'had_trouble_sleeping_1_dummy'], dtype='object')

In [97]:
# sleep_disorder.drop(columns = ['Usual_sleep_time', 'Usual_wake_time', 'hour', 'min', 'min_decimal',\
#                                'wake_hour', 'wake_min', 'wake_min_decimal'], inplace = True)

In [98]:
# sleep_disorder.head()

In [99]:
sleep_disorder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6464 entries, 0 to 6463
Data columns (total 3 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   SEQN                          6464 non-null   float64
 1   Sleep_hours                   6461 non-null   float64
 2   had_trouble_sleeping_1_dummy  6462 non-null   float64
dtypes: float64(3)
memory usage: 151.6 KB


In [100]:
sleep_disorder.to_csv('cleaned_sleep_disorder.csv')

## Smoking

In [101]:
smoking = pd.read_sas(filepath_or_buffer = BASE_PATH + 'SMQ.XPT')

smoking=smoking[['SEQN','SMQ020']]
smoking_cols=['SEQN','>100_cigarettes_inlife_1_dummy']
smoking.columns=smoking_cols

In [102]:
smoking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7168 entries, 0 to 7167
Data columns (total 2 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   SEQN                            7168 non-null   float64
 1   >100_cigarettes_inlife_1_dummy  6113 non-null   float64
dtypes: float64(2)
memory usage: 112.1 KB


In [103]:
smoking.to_csv('cleaned_smoking.csv')

## Smoking - Household Smokers

In [104]:
# HH_smokers = pd.read_sas(filepath_or_buffer = BASE_PATH + 'SMQFAM.XPT')

# HH_smokers.loc[(HH_smokers['SMD460']==7.770000e+02),'SMD460']=np.nan
# HH_smokers.loc[(HH_smokers['SMD460']==9.990000e+02 ),'SMD460']=np.nan

# HH_smokers=HH_smokers[['SEQN','SMD460']]
# HH_smokers_cols=['SEQN','HH_smoke_number_dummy']
# HH_smokers.columns=HH_smokers_cols

In [105]:
# HH_smokers.info()

In [106]:
# HH_smokers.to_csv('cleaned_HH_smoker.csv')

# Diary Interview

In [None]:
d1 = pd.read_sas(filepath_or_buffer = BASE_PATH + 'DR1TOT.XPT')
d1

  df[x] = v


Unnamed: 0,SEQN,WTDRD1,WTDR2D,DR1DRSTZ,DR1EXMER,DRABF,DRDINT,DR1DBIH,DR1DAY,DR1LANG,...,DRD370QQ,DRD370R,DRD370RQ,DRD370S,DRD370SQ,DRD370T,DRD370TQ,DRD370U,DRD370UQ,DRD370V
0,73557.0,1.688833e+04,12930.890649,1.0,49.0,2.0,2.0,6.0,2.0,1.0,...,,,,,,,,,,
1,73558.0,1.793214e+04,12684.148869,1.0,59.0,2.0,2.0,4.0,1.0,1.0,...,,2.0,,2.0,,2.0,,2.0,,2.0
2,73559.0,5.964181e+04,39394.236709,1.0,49.0,2.0,2.0,18.0,6.0,1.0,...,,,,,,,,,,
3,73560.0,1.422031e+05,125966.366442,1.0,54.0,2.0,2.0,21.0,3.0,1.0,...,,,,,,,,,,
4,73561.0,5.905236e+04,39004.892993,1.0,63.0,2.0,2.0,18.0,1.0,1.0,...,,2.0,,2.0,,2.0,,2.0,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9808,83727.0,1.414678e+04,10520.302070,1.0,59.0,2.0,2.0,8.0,7.0,1.0,...,,2.0,,2.0,,2.0,,2.0,,2.0
9809,83728.0,3.875076e+04,51815.103262,1.0,25.0,2.0,2.0,9.0,5.0,1.0,...,,,,,,,,,,
9810,83729.0,1.527251e+04,10401.781170,1.0,59.0,2.0,2.0,1.0,6.0,1.0,...,,2.0,,2.0,,2.0,,2.0,,2.0
9811,83730.0,5.397605e-79,,5.0,,,,,,,...,,,,,,,,,,


In [None]:
d1['DRQSPREP'].value_counts()

4.0    3398
3.0    3078
2.0    1602
1.0     557
9.0     148
Name: DRQSPREP, dtype: int64

In [None]:
d1.loc[(d1['WTDRD1']<0.01),'WTDRD1']=np.nan
d1.loc[(d1['WTDR2D']<0.01),'WTDR2D']=np.nan
d1.loc[(d1['DR1DRSTZ']==2),'DR1DRSTZ']=np.nan
d1.loc[(d1['DR1DRSTZ']==5),'DR1DRSTZ']=np.nan
d1.loc[(d1['DR1LANG']==3),'DR1LANG']=2
d1.loc[(d1['DR1LANG']==5),'DR1LANG']=4
d1.loc[(d1['DR1LANG']==6),'DR1LANG']=4
d1.loc[(d1['DBQ095Z']==99),'DBQ095Z']=np.nan
d1.loc[(d1['DBD100']==9),'DBD100']=np.nan
d1.loc[(d1['DRQSPREP']==9),'DRQSPREP']=np.nan
d1.loc[(d1['DR1STY']==9),'DR1STY']=np.nan

In [None]:
d1=d1[['SEQN','WTDRD1','WTDR2D','DR1DRSTZ','DRABF','DR1LANG','DBQ095Z','DBD100','DRQSPREP','DRQSDIET']]
d1_cols=['SEQN','d1_sample_w','d2_sample_w','d_recall_status','Breast_fed_infant_1_dummy','language_dummy','salt_type_dummy','salt_freq_dummy','salt_inprep_dummy','special_diet_1_dummy']