# Setup and Imports

These cells will import necessary libraries and configure the notebook's visual style.

In [1]:
# Efficient math and data management
import numpy as np
import pandas as pd

# You may import useful modules and functions from the Python Standard Library.
import os
from functools import reduce  

# Visualization libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Inline figures. Can swap comments to use interactive figures. Use inline figures for assignment submission.
%matplotlib inline
# %matplotlib notebook

In [3]:
# Set seaborn visual style
sns.set()
sns.set_context('talk')
plt.rcParams["patch.force_edgecolor"] = False  # Turn off histogram borders

# Load Data

Load the combined data file.

In [4]:
def gfr(seq_num, dem_data, bio_pro):
    user_dem_data = dem_data[dem_data['SEQN'] == seq_num]
    race = user_dem_data['RIDRETH1'].item()
    gender = user_dem_data['RIAGENDR'].item()
    age = user_dem_data['RIDAGEYR'].item()
    sc = bio_pro[bio_pro['SEQN'] == seq_num]['LBXSCR'].item()
    k = 0.7 if gender == 1 else 0.9
    a = -0.329 if gender == 1 else -0.411
    t = 1.159 if race == 4 else 1
    s = 1.018 if gender == 1 else 1
    return 141 * (min(sc/k, 1)**a) * (max(sc/k, 1)**-1.209) * (0.993**age) * s * t


In [5]:
def load_gfr(folder):
    bio_pro = pd.read_sas(folder + '/P_BIOPRO.XPT')
    dem_data = pd.read_sas(folder + '/P_DEMO.XPT')
    gfr_col = bio_pro['SEQN'].apply(lambda seqn: gfr(seqn, dem_data, bio_pro))
    gfr_data = pd.concat([bio_pro['SEQN'],gfr_col], axis=1, join='inner')
    gfr_data.columns = ['SEQN', 'GFR']
    print('Number of patients with possible kidney disease: ', np.count_nonzero(gfr_col <= 60))
    return gfr_data

In [9]:
#folder is the data directory, single_cols is a dict which maps file names with a list of specific cols for that file (for example, from P_KIQ_U.XPT we only want KIQ022 so specific_cols = {'P_KIQ_U.XPT' : ['KIQ022']}
def load_and_merge(folder, specific_cols=None):
    gfr_data = load_gfr(folder)
    data_files = [gfr_data]
    for filename in os.listdir(folder):
        filepath = folder + '/' + filename
        if specific_cols and filename in specific_cols:
            #Add SEQN to cols
            col_names = ['SEQN']
            col_names.extend(specific_cols[filename])
            #Rebuild as dataframe with the correct column names (necessary for the inner join after all data files are collected)
            data_files.append(pd.DataFrame(pd.read_sas(filepath)[col_names], columns=col_names))
        else:
            data_files.append(pd.read_sas(filepath))
    #https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
    return pd.concat(
        (iDF.set_index('SEQN') for iDF in data_files),
        axis=1, join='inner'
    ).reset_index()

In [17]:
all_years = []
for year_range in ['2017-2020', '2015-2016', '2013-2014', '2011-2012', '2009-2010']:
    full_data = load_and_merge('data/' + year_range, {'KIQ_U.XPT' : ['KIQ022'], 'ALB_CR.XPT' : ['URDACT']})
    all_years.append(full_data)

Number of patients with possible kidney disease:  1252
Number of patients with possible kidney disease:  718
Number of patients with possible kidney disease:  793
Number of patients with possible kidney disease:  695
Number of patients with possible kidney disease:  875


In [18]:
for data in all_years:
    print('Num that said yes to kidney disease', np.count_nonzero(data['KIQ022'] == 1))
    print('Num with risky gfr', np.count_nonzero(data['GFR'] <= 60))
    print('Num with risky acr', np.count_nonzero(data['URDACT'] >= 30))
    print('Num with risky acr and gfr', np.count_nonzero((data['URDACT'] >= 30) & (data['GFR'] <= 60)))
    print()

Num that said yes to kidney disease 50
Num with risky gfr 215
Num with risky acr 173
Num with risky acr and gfr 55

Num that said yes to kidney disease 48
Num with risky gfr 124
Num with risky acr 124
Num with risky acr and gfr 36

Num that said yes to kidney disease 32
Num with risky gfr 118
Num with risky acr 95
Num with risky acr and gfr 25

Num that said yes to kidney disease 35
Num with risky gfr 94
Num with risky acr 108
Num with risky acr and gfr 30

Num that said yes to kidney disease 27
Num with risky gfr 141
Num with risky acr 111
Num with risky acr and gfr 31

