In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from features import htn_name_mapping, replace_values_with_nan, replace_nan_with_x, replace_prescription

In [2]:
# Raw SAS transport (.XPT) tables. Each one is joined on SEQN in the next cell.
# NOTE(review): the P_* files look like NHANES questionnaire/lab/exam extracts —
# confirm provenance and cycle with the data owner.
df = pd.read_sas('P_BPQ.XPT')
df_lab_chol = pd.read_sas('P_TCHOL.XPT')
df_hdl = pd.read_sas('P_HDL.XPT')
df_bp_pys = pd.read_sas('P_BPXO.XPT')
df_smk = pd.read_sas('P_SMQ.XPT')

# Previously prepared frame, inner-joined on SEQN later in the notebook.
# SECURITY: unpickling executes arbitrary code; only load pickles from a trusted source.
diq_clean = pd.read_pickle('diq_clean.pkl')

In [3]:
# Left-join the lab, exam and smoking tables onto the questionnaire table using
# the shared SEQN key, so every questionnaire row is kept.
df = df.merge(df_lab_chol, on='SEQN', how='left')
df = df.merge(df_bp_pys, on='SEQN', how='left')
df = df.merge(df_hdl, on='SEQN', how='left')
df = df.merge(df_smk, on='SEQN', how='left')

# Rename the raw columns via htn_name_mapping, then keep only the renamed ones.
df = df.rename(columns=htn_name_mapping)
df = df.filter(htn_name_mapping.values(), axis=1)

df = df.astype(float)

total_rows = df.shape[0]
print("Total rows in the DataFrame:", total_rows)
print(df.isnull().sum().sort_values(ascending=False))

# Fill missing medication/prescription answers with the code 2.
# NOTE(review): the column list is nested ([[...]]) and is passed through as-is;
# replace_nan_with_x lives in `features` — confirm that is the shape it expects.
replace_column_x = [['medication_cholesterol', 'medication_htn','prescription_htn', 'prescription_cholesterol']]
df = replace_nan_with_x(df, replace_column_x, 2)

# Treat 777 and 999 in hypertension_age as missing.
# NOTE(review): presumably these are "refused" / "don't know" codes — confirm.
replace_column_nan = ['hypertension_age']
value_to_replace = [777, 999]
df = replace_values_with_nan(df, replace_column_nan, value_to_replace)

columns = ['bp_sys', 'bp_dis']

# Back-fill each blood-pressure reading from its sibling reading when one of the
# two is missing. Assign the result back to the column instead of calling
# fillna(..., inplace=True) on `df[...]`: the inplace form is chained assignment,
# which warns on pandas 2.x and does not modify `df` under Copy-on-Write.
for col in columns:
    first_reading, second_reading = f'{col}1', f'{col}2'
    df[first_reading] = df[first_reading].fillna(df[second_reading])
    df[second_reading] = df[second_reading].fillna(df[first_reading])

# Row-wise mean of the two (back-filled) readings.
df['mean_bp_sys'] = df[['bp_sys1', 'bp_sys2']].mean(axis=1)
df['mean_bp_dis'] = df[['bp_dis1', 'bp_dis2']].mean(axis=1)

# Keep only respondents that are also present in diq_clean.
df = df.merge(diq_clean, on='SEQN', how='inner')

df = df.astype(float)

# Despite its name, this list holds the columns that must be non-null:
# rows missing any of them are dropped.
columns_to_drop = ['bp_dis1', 'mean_bp_dis', 'bp_sys1', 'bp_sys2', 'bp_dis2', 'mean_bp_sys', 'lab_cholesterol', 'lab_hdl','retinopathy','seeing_specialist', 'medication' ]
df = df.dropna(subset=columns_to_drop)

total_rows = df.shape[0]
print("Total rows in the DataFrame after cleaning:", total_rows)

print(df.isnull().sum().sort_values(ascending=False))

Total rows in the DataFrame: 10195
medication_cholesterol      7447
medication_htn              6916
hypertension_age            6598
prescription_htn            6598
prescription_cholesterol    2646
bp_sys2                     1749
bp_dis2                     1749
bp_sys1                     1738
bp_dis1                     1738
lab_cholesterol             1469
lab_hdl                     1469
smoker                       502
SEQN                           0
hypertension                   0
cholesterol                    0
dtype: int64
Total rows in the DataFrame after cleaning: 7100
LengthStay_US               5187
hypertension_age            4433
fasting_glucose             3579
SEQN                           0
insulin_Duration               0
medication                     0
diabetes_specialist_year       0
seeing_specialist              0
retinopathy                    0
lab_A1c                        0
Interview_Status               0
Gender                         0
pre-diabetes

In [4]:
def replace_prescription(df, column_check, column_replace):
    """Overwrite 7/9 codes in ``column_replace`` with the 1/2 answer from ``column_check``.

    For every row where ``column_replace`` is 7 or 9 and ``column_check`` is 1
    (or 2), ``column_replace`` is set to 1 (or 2). All other rows are left alone.

    NOTE: this definition shadows the ``replace_prescription`` imported from
    ``features`` at the top of the notebook.

    Args:
        df: DataFrame to update. It is modified in place.
        column_check: Name of the column whose 1/2 value decides the replacement.
        column_replace: Name of the column whose 7/9 values are overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Rows eligible for replacement are determined once, before any write.
    needs_fix = df[column_replace].isin([7, 9])

    for code in (1, 2):
        matches_code = df[column_check] == code
        df.loc[matches_code & needs_fix, column_replace] = code

    return df


# Where `hypertension` holds a 7 or 9 code, take the 1/2 answer from
# `prescription_htn` instead.
df = replace_prescription(df, 'prescription_htn', 'hypertension')

In [5]:
# Inspect the rows whose hypertension code is 2.
df[df['hypertension'] == 2]

Unnamed: 0,SEQN,hypertension,hypertension_age,prescription_htn,medication_htn,cholesterol,prescription_cholesterol,medication_cholesterol,lab_cholesterol,bp_sys1,...,fasting_glucose,Gender,Interview_Status,Age_Years,Race,Birth_Country,LengthStay_US,Education_20+,Marital_Status,RatioIncome_Poverty
0,109266.0,2.0,,2.0,2.0,1.0,2.0,2.0,195.0,99.0,...,,2.0,2.0,29.0,6.0,2.0,2.0,5.0,3.0,5.000000
1,109271.0,2.0,,2.0,2.0,1.0,1.0,1.0,147.0,102.0,...,5.72,1.0,2.0,49.0,3.0,1.0,,2.0,3.0,1.024266
2,109273.0,2.0,,2.0,2.0,2.0,2.0,2.0,164.0,116.0,...,,1.0,2.0,36.0,3.0,1.0,,4.0,3.0,0.830000
10,109293.0,2.0,,2.0,2.0,2.0,2.0,2.0,189.0,126.0,...,,1.0,2.0,44.0,3.0,1.0,,3.0,3.0,0.020000
11,109295.0,2.0,,2.0,2.0,2.0,2.0,2.0,234.0,158.0,...,,2.0,2.0,54.0,1.0,2.0,4.0,1.0,1.0,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8111,124814.0,2.0,,2.0,2.0,2.0,2.0,2.0,225.0,141.0,...,5.83,1.0,2.0,64.0,4.0,1.0,,3.0,2.0,2.000000
8112,124815.0,2.0,,2.0,2.0,2.0,2.0,2.0,203.0,141.0,...,5.66,1.0,2.0,52.0,4.0,1.0,,4.0,2.0,5.000000
8113,124817.0,2.0,,2.0,2.0,1.0,1.0,1.0,200.0,111.0,...,,2.0,2.0,67.0,1.0,2.0,4.0,2.0,1.0,2.190000
8114,124818.0,2.0,,2.0,2.0,2.0,2.0,2.0,234.0,106.0,...,,1.0,2.0,40.0,4.0,1.0,,5.0,1.0,3.820000


In [6]:
# Code -> label dictionaries for the categorical columns inherited from diq_clean.
marital_status_mapping = {1: 'Married/Partner', 2: 'Widow/Divorce', 3: 'Single'}
gender_mapping = {1: 'Male', 2: 'Female'}
race_mapping = {1: 'Mexican American', 2: 'Other Hispanic', 3: 'White', 4: 'Black', 6: 'Asian', 7: 'Other Race'}
birth_mapping = {1: 'USA', 2: 'Others'}
diabetes_mapping = {1: 'Yes', 2: 'No', 3:'Borderline'}
specialist_mapping = {1: 'Yes', 2: 'No'}
retinopathy_mapping  = {1: 'Yes', 2: 'No'}
insulin_mapping = {1: 'Yes', 2: 'No'}
medication_mapping  = {1: 'Yes', 2: 'No'}
prediabetes_mapping  = {1: 'Yes', 2: 'No', 3: 'Diabetic'}
education_mapping  = {1: '<9th grade', 2: '9th-11th grade', 3: 'High School', 4:'College', 5: 'College graduate'}

# Code -> label dictionaries for the hypertension / cholesterol / smoking columns.
hypertension_mapping = {1: 'Yes', 2: 'No'}
cholesterol_mapping = {1: 'Yes', 2: 'No'}
prescription_cholesterol_mapping = {1: 'Yes', 2: 'No'}
prescription_hypertension_mapping = {1: 'Yes', 2: 'No'}
medication_cholesterol_mapping = {1: 'Yes', 2: 'No'}
medication_hypertension_mapping = {1: 'Yes', 2: 'No'}
smoker_mapping = {1: 'Yes', 2: 'No'}

# Build the labelled frame from `df` without touching `df` itself:
# astype() and drop() each return a new frame, and replace() is used without
# inplace so the whole step is a single re-runnable expression.
# Codes that have no entry in a mapping are left as their numeric value.
df_htn_label = (
    df.astype(float)
    .drop('SEQN', axis=1)
    .replace({
        'Marital_Status': marital_status_mapping,
        'Gender': gender_mapping,
        'Race': race_mapping,
        'Birth_Country': birth_mapping,
        'diabetes': diabetes_mapping,
        'seeing_specialist': specialist_mapping,
        'retinopathy': retinopathy_mapping,
        'insulin': insulin_mapping,
        'medication': medication_mapping,
        'pre-diabetes': prediabetes_mapping,
        'Education_20+': education_mapping,

        'hypertension': hypertension_mapping,
        'cholesterol': cholesterol_mapping,
        'prescription_htn': prescription_hypertension_mapping,
        'prescription_cholesterol': prescription_cholesterol_mapping,
        'medication_htn': medication_hypertension_mapping,
        'medication_cholesterol': medication_cholesterol_mapping,
        # smoker_mapping was defined above but never applied, which left
        # `smoker` as the only yes/no column still numeric in the labelled frame.
        'smoker': smoker_mapping,
    })
)


In [7]:
# Preview the first rows of the labelled frame to spot-check the code -> label replacement.
df_htn_label.head()

Unnamed: 0,hypertension,hypertension_age,prescription_htn,medication_htn,cholesterol,prescription_cholesterol,medication_cholesterol,lab_cholesterol,bp_sys1,bp_sys2,...,fasting_glucose,Gender,Interview_Status,Age_Years,Race,Birth_Country,LengthStay_US,Education_20+,Marital_Status,RatioIncome_Poverty
0,No,,No,No,Yes,No,No,195.0,99.0,99.0,...,,Female,2.0,29.0,Asian,Others,2.0,College graduate,Single,5.0
1,No,,No,No,Yes,Yes,Yes,147.0,102.0,108.0,...,5.72,Male,2.0,49.0,White,USA,,9th-11th grade,Single,1.024266
2,No,,No,No,No,No,No,164.0,116.0,110.0,...,,Male,2.0,36.0,White,USA,,College,Single,0.83
3,Yes,54.0,Yes,Yes,Yes,Yes,Yes,105.0,138.0,132.0,...,8.55,Male,2.0,68.0,Other Race,USA,,College,Single,1.2
4,Yes,60.0,Yes,Yes,Yes,No,No,233.0,141.0,137.0,...,5.27,Male,2.0,76.0,White,USA,,College graduate,Married/Partner,3.61


In [8]:
# Row count and per-column missing-value counts (largest first) for the numeric frame.
total_rows = len(df)
print("Total rows in the DataFrame:", total_rows)
print(df.isna().sum().sort_values(ascending=False))

Total rows in the DataFrame: 7100
LengthStay_US               5187
hypertension_age            4433
fasting_glucose             3579
SEQN                           0
insulin_Duration               0
medication                     0
diabetes_specialist_year       0
seeing_specialist              0
retinopathy                    0
lab_A1c                        0
Interview_Status               0
Gender                         0
pre-diabetes                   0
Age_Years                      0
Race                           0
Birth_Country                  0
Education_20+                  0
Marital_Status                 0
insulin                        0
diabetes_age                   0
hypertension                   0
bp_sys1                        0
prescription_htn               0
medication_htn                 0
cholesterol                    0
prescription_cholesterol       0
medication_cholesterol         0
lab_cholesterol                0
bp_sys2                        0
diabetes 

In [9]:
# Same summary for the labelled frame: labelling should not change the row count
# or introduce new missing values.
total_rows = len(df_htn_label)
print("Total rows in the DataFrame:", total_rows)
print(df_htn_label.isna().sum().sort_values(ascending=False))

Total rows in the DataFrame: 7100
LengthStay_US               5187
hypertension_age            4433
fasting_glucose             3579
hypertension                   0
insulin_Duration               0
medication                     0
diabetes_specialist_year       0
seeing_specialist              0
retinopathy                    0
lab_A1c                        0
Gender                         0
Interview_Status               0
Age_Years                      0
Race                           0
Birth_Country                  0
Education_20+                  0
Marital_Status                 0
insulin                        0
pre-diabetes                   0
diabetes_age                   0
bp_sys1                        0
prescription_htn               0
medication_htn                 0
cholesterol                    0
prescription_cholesterol       0
medication_cholesterol         0
lab_cholesterol                0
bp_sys2                        0
diabetes                       0
bp_dis1  

In [10]:
# Persist the labelled frame as CSV in the working directory.
# to_csv is called with its defaults, so the DataFrame index is also written
# as an unnamed first column.
csv_htn = 'htn_label_clean.csv'
df_htn_label.to_csv(csv_htn)