In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from functions.load_data import *
from functions.clean_data import *

In [3]:
import warnings 
warnings.filterwarnings('ignore')

# Load Data

In [5]:
# Load with the loader's default source (no path given), then apply the
# shared feature-cleaning step.
df = load_diabetes_data()
df_clean = clean_feature_data(df)

# Preview the first five rows, then print the (rows, columns) shape.
display(df_clean.head(n=5))
print(df_clean.shape)

Unnamed: 0,house_family_person_id,diabetes,age,smoker,sex,coronary_heart_disease,weight,had_high_cholesterol,class_of_worker,years_on_job,...,more_than_one_job,kidney_condition,liver_condition,cant_afford_meds,skipped_meds,non_hispanic_race,hispanic,primary_care,bmi,age_group
0,1_2_1,NO,24,NO,FEMALE,NO,125.0,NO,PRIVATE COMPANY,2.0,...,NO,NO,NO,NO,MISSING,Black/African American,Not Hispanic/Spanish origin,MISSING,20.17,Young Adult (18-25)
1,2_1_1,NO,76,NO,MALE,NO,190.0,NO,STATE GOVERNMENT,25.0,...,MISSING,NO,NO,NO,MISSING,White,Not Hispanic/Spanish origin,Doctor office,27.26,Senior (76-85)
2,3_1_2,NO,36,NO,FEMALE,NO,225.0,NO,PRIVATE COMPANY,2.0,...,NO,NO,NO,YES,NO,White,Mexican-American,MISSING,38.62,Early Middle Age (36-45)
3,5_1_1,NO,35,YES,FEMALE,NO,240.0,NO,PRIVATE COMPANY,1.0,...,NO,NO,NO,YES,YES,White,Mexican-American,MISSING,39.93,Emerging Adulthood (26-35)
4,10_1_5,NO,20,NO,FEMALE,NO,103.0,NO,MISSING,0.0,...,MISSING,NO,NO,NO,MISSING,White,Puerto Rico,Doctor office,18.84,Young Adult (18-25)


(32499, 26)


In [6]:
# Same load-and-clean steps, pointed explicitly at the samadult17.csv file.
df17 = load_diabetes_data(path='../data/samadult17.csv')
df_clean17 = clean_feature_data(df17)

# Preview the first five rows, then print the (rows, columns) shape.
display(df_clean17.head(n=5))
print(df_clean17.shape)

Unnamed: 0,house_family_person_id,diabetes,age,smoker,sex,coronary_heart_disease,weight,had_high_cholesterol,class_of_worker,years_on_job,...,more_than_one_job,kidney_condition,liver_condition,cant_afford_meds,skipped_meds,non_hispanic_race,hispanic,primary_care,bmi,age_group
0,3_1_1,NO,65,YES,FEMALE,NO,155.0,NO,PRIVATE COMPANY,28.0,...,MISSING,NO,NO,NO,NO,White,Not Hispanic/Spanish origin,MISSING,29.28,Late Middle Age (56-65)
1,9_1_1,NO,19,NO,MALE,NO,180.0,NO,PRIVATE COMPANY,0.0,...,MISSING,YES,YES,NO,NO,White,Not Hispanic/Spanish origin,MISSING,23.11,Young Adult (18-25)
2,11_1_2,NO,45,YES,MALE,NO,240.0,NO,PRIVATE COMPANY,13.0,...,NO,NO,NO,YES,NO,White,Not Hispanic/Spanish origin,Doctor office,35.44,Early Middle Age (36-45)
3,15_1_1,NO,67,YES,FEMALE,NO,236.0,NO,PRIVATE COMPANY,13.0,...,NO,NO,NO,NO,NO,White,Not Hispanic/Spanish origin,MISSING,43.16,Early Senior (66-75)
4,18_1_1,NO,40,YES,MALE,NO,182.0,NO,PRIVATE COMPANY,16.0,...,NO,NO,NO,NO,MISSING,White,Multiple Hispanic,MISSING,32.24,Early Middle Age (36-45)


(26311, 26)


# Analysis

## 2016

In [9]:
# Columns ranked by race group in the analysis cells below. Each one is
# filtered on the answer 'YES'. Besides medical conditions the list also holds
# behaviour/access indicators (smoker, cant_afford_meds, skipped_meds).
# NOTE(review): this list has 11 names, but the saved 2016 and 2017 result
# tables show 10 rows and none of them is `cancer` -- confirm on a fresh run.
diseases = [
    "diabetes",
    "smoker",
    "coronary_heart_disease",
    "had_high_cholesterol",
    "hypertension",
    "heart_condition",
    "cancer",
    "kidney_condition",
    "liver_condition",
    "cant_afford_meds",
    "skipped_meds",
]

In [10]:
# Build {column: [three race groups with the highest share of 'YES' answers]}
# from `df_clean`, one entry per name in `diseases`.
dict_diseases_race = {}

for condition in diseases:

    # Respondent count for every (race, answer) pair of this column.
    df_counts = df_clean.groupby(['non_hispanic_race', condition]).agg({'house_family_person_id': 'count'}).reset_index()
    # Per-race denominator: counts of *all* answers recorded for that race are
    # summed, so non-'YES' answers (e.g. 'MISSING') are part of the total.
    df_counts['Total'] = df_counts.groupby('non_hispanic_race')['house_family_person_id'].transform('sum')
    df_counts['Percentage'] = df_counts['house_family_person_id'] / df_counts['Total']

    # Leave these two race categories out of the ranking.
    df_counts = df_counts[~df_counts['non_hispanic_race'].isin(['Primary race not releasable', 'Multiple race, no primary race selected'])]
    df_counts_yes = df_counts[df_counts[condition] == 'YES'].sort_values('Percentage', ascending=False)

    # The groupby above yields one row per (race, answer) pair, so the 'YES'
    # rows already hold each race at most once.
    races = df_counts_yes['non_hispanic_race'].head(3).tolist()
    # Pad to exactly three entries. When fewer than three race groups have a
    # 'YES' row the lists would otherwise differ in length, and building the
    # three-label ranking DataFrame from this dict would raise.
    races += [None] * (3 - len(races))

    dict_diseases_race[condition] = races

In [11]:
# Lay the rankings out with one row per column name and one column per rank.
df_diseases = pd.DataFrame(
    dict_diseases_race,
    index=['Most Affected', 'Second Most', 'Third Most'],
).T
# Order rows by the race names in rank order (first place, then ties broken by
# second and third place); after the transpose the columns are the rank labels.
df_diseases = df_diseases.sort_values(by=list(df_diseases.columns))
df_diseases

Unnamed: 0,Most Affected,Second Most,Third Most
hypertension,Black/African American,Filipino,White
kidney_condition,Black/African American,"Indian (American), Alaska Native",White
cant_afford_meds,Black/African American,"Indian (American), Alaska Native",White
skipped_meds,Black/African American,"Indian (American), Alaska Native",White
had_high_cholesterol,Filipino,White,Other Asian
diabetes,"Indian (American), Alaska Native",Black/African American,Filipino
smoker,"Indian (American), Alaska Native",White,Black/African American
liver_condition,Other Asian,"Indian (American), Alaska Native",White
coronary_heart_disease,White,"Indian (American), Alaska Native",Black/African American
heart_condition,White,"Indian (American), Alaska Native",Black/African American


In [12]:
# Disabled exploration: for each race in 'Most Affected', collect the unique row labels (column names) it leads.
# df_diseases.reset_index().groupby(['Most Affected']).agg({'index':"unique"})

## 2017

In [14]:
# Build {column: [three race groups with the highest share of 'YES' answers]}
# from `df_clean17`, one entry per name in `diseases`.
dict_diseases_race17 = {}

for condition in diseases:

    # Respondent count for every (race, answer) pair of this column.
    df_counts = df_clean17.groupby(['non_hispanic_race', condition]).agg({'house_family_person_id': 'count'}).reset_index()
    # Per-race denominator: counts of *all* answers recorded for that race are
    # summed, so non-'YES' answers (e.g. 'MISSING') are part of the total.
    df_counts['Total'] = df_counts.groupby('non_hispanic_race')['house_family_person_id'].transform('sum')
    df_counts['Percentage'] = df_counts['house_family_person_id'] / df_counts['Total']

    # Leave these two race categories out of the ranking.
    df_counts = df_counts[~df_counts['non_hispanic_race'].isin(['Primary race not releasable', 'Multiple race, no primary race selected'])]
    df_counts_yes = df_counts[df_counts[condition] == 'YES'].sort_values('Percentage', ascending=False)

    # The groupby above yields one row per (race, answer) pair, so the 'YES'
    # rows already hold each race at most once.
    races = df_counts_yes['non_hispanic_race'].head(3).tolist()
    # Pad to exactly three entries. When fewer than three race groups have a
    # 'YES' row the lists would otherwise differ in length, and the DataFrame
    # construction below (three index labels) would raise.
    races += [None] * (3 - len(races))

    dict_diseases_race17[condition] = races

# One row per column name, one column per rank; rows ordered by the race names
# in rank order.
df_diseases17 = pd.DataFrame(dict_diseases_race17, index=['Most Affected', 'Second Most', 'Third Most']).T.sort_values(['Most Affected', 'Second Most', 'Third Most'])

In [15]:
# Show the ranking table built from df_clean17 in the previous cell.
df_diseases17

Unnamed: 0,Most Affected,Second Most,Third Most
hypertension,Black/African American,"Indian (American), Alaska Native",Filipino
skipped_meds,Black/African American,"Indian (American), Alaska Native",Filipino
cant_afford_meds,Black/African American,"Indian (American), Alaska Native",White
coronary_heart_disease,Filipino,White,Black/African American
had_high_cholesterol,Filipino,White,Other Asian
diabetes,"Indian (American), Alaska Native",Black/African American,Filipino
kidney_condition,"Indian (American), Alaska Native",Black/African American,White
liver_condition,"Indian (American), Alaska Native",Filipino,Other Asian
smoker,"Indian (American), Alaska Native",White,Black/African American
heart_condition,White,"Indian (American), Alaska Native",Black/African American
