# Predicting the prevalence of Diabetes from online records using 

The main goal of this assignment is to set up a machine learning framework that predicts the prevalence of a specific 
chronic disease or health risk factor from digital records of online behavior.



## Ground truth data

### Downloading the dataset

In [None]:
https://chronicdata.cdc.gov/resource/5hnu-2c66.json

In [1]:
import pandas as pd

df = pd.read_csv("data/U.S._Chronic_Disease_Indicators__CDI_.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
df.head(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2015,2015,AK,Alaska,YRBSS,Alcohol,Alcohol use among youth,,%,Crude Prevalence,...,2,ALC,ALC1_1,CrdPrev,OVERALL,OVR,,,,
1,2015,2015,AL,Alabama,YRBSS,Alcohol,Alcohol use among youth,,%,Crude Prevalence,...,1,ALC,ALC1_1,CrdPrev,OVERALL,OVR,,,,
2,2015,2015,AR,Arkansas,YRBSS,Alcohol,Alcohol use among youth,,%,Crude Prevalence,...,5,ALC,ALC1_1,CrdPrev,OVERALL,OVR,,,,
3,2015,2015,AZ,Arizona,YRBSS,Alcohol,Alcohol use among youth,,%,Crude Prevalence,...,4,ALC,ALC1_1,CrdPrev,OVERALL,OVR,,,,
4,2015,2015,CA,California,YRBSS,Alcohol,Alcohol use among youth,,%,Crude Prevalence,...,6,ALC,ALC1_1,CrdPrev,OVERALL,OVR,,,,


In [3]:
unique_topics = list(set(df["Topic"]))
unique_topics

['Diabetes',
 'Nutrition, Physical Activity, and Weight Status',
 'Tobacco',
 'Arthritis',
 'Reproductive Health',
 'Oral Health',
 'Asthma',
 'Older Adults',
 'Immunization',
 'Chronic Obstructive Pulmonary Disease',
 'Mental Health',
 'Disability',
 'Alcohol',
 'Cardiovascular Disease',
 'Overarching Conditions',
 'Chronic Kidney Disease',
 'Cancer']

In [4]:
df_diabetes = df[df.Topic=="Diabetes"]
df_diabetes.to_csv("data/US_diabetes.csv")

In [5]:
set(df_diabetes.DataSource)

{'BRFSS', 'Birth Certificate, NVSS', 'NVSS', 'PRAMS', 'State Inpatient Data'}

### Type of questions with prevalence in name

In [6]:
questions = dict(enumerate(sorted(list(set(df_diabetes.Question)))))
questions

{0: 'Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course',
 1: 'Amputation of a lower extremity attributable to diabetes',
 2: 'Diabetes prevalence among women aged 18-44 years',
 3: 'Dilated eye examination among adults aged >= 18 years with diagnosed diabetes',
 4: 'Foot examination among adults aged >= 18 years with diagnosed diabetes',
 5: 'Glycosylated hemoglobin measurement among adults aged >= 18 years with diagnosed diabetes',
 6: 'Hospitalization with diabetes as a listed diagnosis',
 7: 'Influenza vaccination among noninstitutionalized adults aged 18-64 years with diagnosed diabetes',
 8: 'Influenza vaccination among noninstitutionalized adults aged >= 65 years with diagnosed diabetes',
 9: 'Mortality due to diabetes reported as any listed cause of death',
 10: 'Mortality with diabetic ketoacidosis reported as any listed cause of death',
 11: 'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with diagnose

## Type of Data Values


In [7]:
data_values = set(df_diabetes.DataValueType)
data_values

{'Age-adjusted Prevalence',
 'Age-adjusted Rate',
 'Crude Prevalence',
 'Crude Rate',
 'Number',
 'Prevalence'}

In [38]:
df_diabetes[(df_diabetes.LocationDesc == "Wyoming")&(df_diabetes.StratificationCategory1 == "Overall")&(df_diabetes.YearStart == 2013)&(df_diabetes.DataValueType == "Age-adjusted Prevalence")&(df_diabetes.Question == questions[14])]

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
329821,2013,2013,WY,Wyoming,BRFSS,Diabetes,Prevalence of diagnosed diabetes among adults ...,,%,Age-adjusted Prevalence,...,56,DIA,DIA2_1,AgeAdjPrev,OVERALL,OVR,,,,


In [31]:
set(df_diabetes[(df_diabetes.YearStart == df_diabetes.YearEnd)].YearStart)
# (df_diabetes.LocationAbbr == 'US')&

{2010, 2011, 2012, 2013, 2014, 2015}

In [15]:
def get_prevalence_indicator(df, state, category, year, data_value, question):
    return float(df[
        (df.LocationAbbr == state) & \
        (df.StratificationCategory1 == category) & \
        (df.YearStart == year) & \
        (df.DataValueType == data_value) & \
        (df.Question == question )
    ]["DataValue"].values[0])

In [28]:
get_prevalence_indicator(df=df_diabetes,
                         state="US",
                         category="Overall",
                         year=2014,
                         data_value= "Age-adjusted Prevalence",#"Crude Prevalence",
                         question = questions[14]                        
                        )

9.1

In [26]:
set(df_diabetes.YearStart)

{2009, 2010, 2011, 2012, 2013, 2014, 2015}

### Google Trends

Command to install the necessary package
###### pip install pytrends 

In [34]:
from pytrends.request import TrendReq

In [35]:
pytrend = TrendReq(hl='en-US', tz=360)

In [36]:
key_list = {"1": ["diabetes", "sugar", "glucose", "insulin", "diagnosis"], 
            "2": ["blood pressure", "fruit", "cholesterol", "diet", "type 1"],
            "3": ["type 2", "symptoms", "diabetes mellitus", "diagnosis", "complications"],
            "4": ["osteoporosis", "hypertension","obesity", "fat", "association"]
           }

In [37]:
d_s = '-01-01 '
d_e = '-12-31'
for i in range(2009,2016):
    period = str(i) + d_s + str(i) + d_e
    for key in key_list:
        pytrend.build_payload(kw_list=key_list[key], geo="US", cat = 630, \
                      timeframe= period)
        if key == "1":
            interest_by_region_df = pytrend.interest_by_region()
        else:
             interest_by_region_df = pd.concat([interest_by_region_df, pytrend.interest_by_region()], axis = 1)
    file = str(i) + ".csv"
    interest_by_region_df.to_csv(file, sep = ',')

## Poverty Data

In [32]:
df_poverty = pd.read_excel("data_poverty/dat_p.xls")

In [33]:
df_poverty.head(5)

Unnamed: 0,STATE,Total.15,Number.15,Standard error.15,Percent.15,Standard error.15.1,Total.14,Number.14,Standard error.14,Percent.14,...,Total.10,Number.10,Standard error.10,Percent.10,Standard error.10.1,Total.9,Number.9,Standard error.9,Percent.9,Standard error.9.1
0,Alabama,4820,784,60,16.3,1.2,4765,848,53,17.8,...,4717,812,83,17.2,1.8,4655,770,60,16.6,1.3
1,Alaska,703,65,6,9.2,0.9,694,82,8,11.9,...,695,87,10,12.5,1.5,688,81,8,11.7,1.2
2,Arizona,6728,1156,84,17.2,1.3,6657,1409,76,21.2,...,6426,1208,108,18.8,1.7,6508,1381,84,21.2,1.3
3,Arkansas,2951,475,27,16.1,0.9,2891,532,44,18.4,...,2879,440,45,15.3,1.6,2846,538,40,18.9,1.4
4,California,39081,5441,211,13.9,0.5,38666,6112,218,15.8,...,37240,6073,212,16.3,0.6,36742,5638,177,15.3,0.5


In [40]:
locations = set(df_diabetes[df_diabetes.LocationAbbr != "US"].LocationAbbr)

In [41]:
locations

{'AK',
 'AL',
 'AR',
 'AZ',
 'CA',
 'CO',
 'CT',
 'DC',
 'DE',
 'FL',
 'GA',
 'GU',
 'HI',
 'IA',
 'ID',
 'IL',
 'IN',
 'KS',
 'KY',
 'LA',
 'MA',
 'MD',
 'ME',
 'MI',
 'MN',
 'MO',
 'MS',
 'MT',
 'NC',
 'ND',
 'NE',
 'NH',
 'NJ',
 'NM',
 'NV',
 'NY',
 'OH',
 'OK',
 'OR',
 'PA',
 'PR',
 'RI',
 'SC',
 'SD',
 'TN',
 'TX',
 'UT',
 'VA',
 'VI',
 'VT',
 'WA',
 'WI',
 'WV',
 'WY'}