## Behavioral Risk Factor Surveillance System

#### The following commands were executed to download the CSV file using the Kaggle API.

#### https://github.com/Kaggle/kaggle-api




In [7]:
# This need not run every time.
!pip install kaggle
!kaggle datasets download -d cdc/behavioral-risk-factor-surveillance-system


Traceback (most recent call last):
  File "/Users/azizkoyuncu/anaconda3/bin/kaggle", line 6, in <module>
    from kaggle.cli import main
  File "/Users/azizkoyuncu/anaconda3/lib/python3.7/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/Users/azizkoyuncu/anaconda3/lib/python3.7/site-packages/kaggle/api/kaggle_api_extended.py", line 149, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /Users/azizkoyuncu/.kaggle. Or use the environment method.


#### Importing Dependencies

In [8]:
# Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


#### Reading the Original CSV file downloaded from Kaggle.




In [9]:
%%time
# The behavioral-risk-factor-surveillance-system.zip had data for 5 years. 
# Using the 2015 data for analysis now.

file_path = "./Resources/2015.csv"
behavioral_data_original = pd.read_csv(file_path, encoding="utf-8")


FileNotFoundError: [Errno 2] File b'./Resources/2015.csv' does not exist: b'./Resources/2015.csv'

In [10]:
# Preview the first rows of the raw 2015 dataset to sanity-check the load.
behavioral_data_original.head()


NameError: name 'behavioral_data_original' is not defined

In [None]:
# Summarise the raw DataFrame's metadata: index dtype, column dtypes,
# non-null counts and memory usage.
behavioral_data_original.info()

#### Selecting the required columns for exploratory analysis and cleaning.
* "_STATE" - State FIPS Code,  "SEX" - Gender,  "MARITAL" - Marital Status,  "_RACEGR3" - Race/demographics, 
* "EDUCA" - Education Level, "_AGEG5YR" - Age Category, "WEIGHT2" - Weight in Lbs,  "HTIN4" - Height in Inches (converted to feet during cleaning), 
* "_BMI5CAT" - BMI Category, "BPHIGH4" - Ever Reported with High Blood Pressure, "TOLDHI2" - Ever Told of High Blood Cholesterol,
* "INCOME2" - Income Category, "SMOKDAY2" - Smoking Frequency, "AVEDRNK2" - Average Drinks/Day, 
* "_VEGESUM" - Total Vegetable Consumption/Day, "_FRUTSUM" - Total Fruit Consumption/Day, "CVDSTRK3" - Ever Reported Stroke,
* "PA1MIN_" - Minutes of Physical Activity/Week, "CVDCRHD4" - Ever Reported Heart Disease, 
* "CHCOCNCR" - Ever Reported Cancer,  "DIABETE3" - Ever Reported Diabetes.

In [None]:
%%time
# New dataframe with selected columns from the original dataframe.
behavioral_data_reduced = behavioral_data_original [["_STATE", "SEX", "MARITAL","_RACEGR3", "EDUCA", "_AGEG5YR", "WEIGHT2",  "HTIN4","_BMI5CAT","BPHIGH4" ,"TOLDHI2", "INCOME2", "SMOKDAY2" , "AVEDRNK2", "_VEGESUM", "_FRUTSUM", "CVDSTRK3", "PA1MIN_", "CVDCRHD4", "CHCOCNCR", "DIABETE3"]]

# Writing this new dataframe to a file
behavioral_data_reduced.to_csv("./Resources/behavioral_revised_data.csv")

# Viewing the metadate info.
behavioral_data_reduced.info()


In [None]:
%%time
# Descriptive statistics for the reduced dataset, computed on the still-coded
# raw values (no cleaning has been applied at this point).
behavioral_data_reduced.describe()

#### Data Exploratory Analysis
#### Reading from the new CSV with selected columns

In [None]:
%%time
# Reading the new revised behavioral revised csv for processing.
file_path = "./Resources/behavioral_revised_data.csv"
behavioral_data =  pd.read_csv(file_path,skip_blank_lines=True, na_values=[9999,"", 7777],index_col=0)



In [None]:
# Descriptive statistics for the reloaded data, now that the 7777/9999
# sentinel values have been parsed as NaN.
behavioral_data.describe()

#### Cleaning up the data.
#### Converting Numeric Data to Text for categorical values

In [None]:
# Lookup tables that translate the numeric survey codes into readable labels.
#
# NOTE(review): several tables carry a "" key for "not asked / missing". Blank
# cells are loaded as NaN (see na_values in the read_csv call), so those entries
# are presumably never matched - confirm before relying on them.

# State FIPS code -> [state name, two-letter postal abbreviation].
states_data = {
    1: ["Alabama", "AL"],
    2: ["Alaska", "AK"],
    4: ["Arizona", "AZ"],
    5: ["Arkansas", "AR"],
    6: ["California", "CA"],
    8: ["Colorado", "CO"],
    9: ["Connecticut", "CT"],
    10: ["Delaware", "DE"],
    11: ["District of Columbia", "DC"],
    12: ["Florida", "FL"],
    13: ["Georgia", "GA"],
    15: ["Hawaii", "HI"],
    16: ["Idaho", "ID"],
    17: ["Illinois", "IL"],
    18: ["Indiana", "IN"],
    19: ["Iowa", "IA"],
    20: ["Kansas", "KS"],
    21: ["Kentucky", "KY"],
    22: ["Louisiana", "LA"],
    23: ["Maine", "ME"],
    24: ["Maryland", "MD"],
    25: ["Massachusetts", "MA"],
    26: ["Michigan", "MI"],
    27: ["Minnesota", "MN"],
    28: ["Mississippi", "MS"],
    29: ["Missouri", "MO"],
    30: ["Montana", "MT"],
    31: ["Nebraska", "NE"],
    32: ["Nevada", "NV"],
    33: ["New Hampshire", "NH"],
    34: ["New Jersey", "NJ"],
    35: ["New Mexico", "NM"],
    36: ["New York", "NY"],
    37: ["North Carolina", "NC"],
    38: ["North Dakota", "ND"],
    39: ["Ohio", "OH"],
    40: ["Oklahoma", "OK"],
    41: ["Oregon", "OR"],
    42: ["Pennsylvania", "PA"],
    44: ["Rhode Island", "RI"],
    45: ["South Carolina", "SC"],
    46: ["South Dakota", "SD"],
    47: ["Tennessee", "TN"],
    48: ["Texas", "TX"],
    49: ["Utah", "UT"],
    50: ["Vermont", "VT"],
    51: ["Virginia", "VA"],
    53: ["Washington", "WA"],
    54: ["West Virginia", "WV"],
    55: ["Wisconsin", "WI"],
    56: ["Wyoming", "WY"],
    66: ["Guam", "GU"],
    72: ["Puerto Rico", "PR"],
}

# Gender code -> label.
gender = {1: "Male", 2: "Female"}

# Marital status code -> label.
marital_status = {
    1: "Married",
    2: "Divorced",
    3: "Widowed",
    4: "Separated",
    5: "Never married",
    6: "A member of an unmarried couple",
    9: "Refused",
}

# Education level code -> label.
education = {
    1: "No School",
    2: "Elementary",
    3: "Junior High",
    4: "High School",
    5: "College 3yrs",
    6: "College 4yrs",
    9: "Refused",
}

# Internet usage code -> label.
internet_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not Sure",
    9: "Refused",
    "": "Not asked or Missing",
}

# Race group code -> label.
race = {
    1: "White only",
    2: "Black only",
    3: "Other race only",
    4: "Multiracial",
    5: "Hispanic",
    9: "Don’t know/Not sure/Refused",
}

# Five-year age bracket code -> label.
age_values = {
    1: "18-24",
    2: "25-29",
    3: "30-34",
    4: "35-39",
    5: "40-44",
    6: "45-49",
    7: "50-54",
    8: "55-59",
    9: "60-64",
    10: "65-69",
    11: "70-74",
    12: "75-79",
    13: ">80",
    14: "Don’t know/Refused/Missing",
}

# Income bracket code -> label. Keys are plain ints; float codes such as 1.0
# still match because equal ints and floats hash identically.
income_values = {
    1: "< 10000",
    2: "10000-15000",
    3: "15000-20000",
    4: "20000-25000",
    5: "25000-35000",
    6: "35000-50000",
    7: "50000-75000",
    8: " >75000",
    77: "Don’t know/Not sure",
    99: "Refused",
}

# Smoking frequency code -> label.
smoke_values = {
    1: "Every day",
    2: "Some days",
    3: "Not at all",
    7: "Don´t Know/Not Sure",
    9: "Refused",
    "": "Not asked or Missing",
}

# Heart disease code -> label.
heart_disease_values = {
    1: 'Yes',
    2: 'No',
    7: 'Don’t know/Not sure',
    9: 'Refused',
}

# BMI category code -> label.
bmi_values = {
    1: "Underweight",
    2: "Normal Weight",
    3: "Overweight",
    4: "Obese",
    "": "Refused",
}

# High blood pressure code -> label. Codes 1 and 2 are both reported as "Yes".
blood_pressure_values = {
    1: "Yes",
    2: "Yes",
    3: "No",
    4: "Borderline",
    7: "Don´t know/Not Sure",
    9: "Refused",
    "": "Not Asked",
}

# High cholesterol code -> label.
# Fix: "Don't know / Not Sure" is keyed on 7, the same code every other yes/no
# lookup in this cell uses. It was previously keyed on 3, so code-7 answers
# were decoded to None and then removed by the later dropna().
cholestrol_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not Sure",
    9: "Refused",
    "": "Not Asked",
}

# Cancer code -> label.
cancer_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not sure",
    9: "Refused",
}

# Kidney disease code -> label.
kidney_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not sure",
    9: "Refused",
}

# Stroke code -> label.
stroke_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not sure",
    9: "Refused",
}

# Diabetes code -> label. Codes 1-2 collapse to "Yes" and codes 3-4 to "No".
diabetes_values = {
    1: "Yes",
    2: "Yes",
    3: "No",
    4: "No",
    7: "Don’t know/Not Sure",
    9: "Refused",
}

# Inspect dtypes and non-null counts of the coded data before decoding it.
behavioral_data.info()

In [None]:
# Add empty placeholder columns that will hold the decoded text labels.
label_columns = [
    "State", "State Code", "Sex", "Marital Status", "Education", "Race",
    "Income", "Smoking", "Heart Disease", "Blood Pressure", "Diabetes",
    "Cancer", "Cholestrol", "BMI", "Stroke", "Age",
]
for label_column in label_columns:
    behavioral_data[label_column] = ""

behavioral_data.head()

In [None]:
%%time
# Converting Numeric data to Text form
behavioral_data['State'] = behavioral_data['_STATE'].map(lambda x: states_data.get(x, None)[0])
behavioral_data['State Code'] = behavioral_data['_STATE'].map(lambda x: states_data.get(x, None)[1])
behavioral_data["Sex"] = behavioral_data['SEX'].map(lambda x: gender.get(x, None))
behavioral_data["Marital Status"] = behavioral_data['MARITAL'].map(lambda x: marital_status.get(x, None))
behavioral_data["Education"] =  behavioral_data["EDUCA"].map(lambda x: education.get(x, None))
behavioral_data["Race"] = behavioral_data["_RACEGR3"].map(lambda x: race.get(x, None))
behavioral_data["Income"] = behavioral_data["INCOME2"].map(lambda x: income_values.get(x, None))
behavioral_data["Smoking"] = behavioral_data["SMOKDAY2"].map(lambda x: smoke_values.get(x, None))
behavioral_data["Heart Disease"] = behavioral_data["CVDCRHD4"].map(lambda x: heart_disease_values.get(x, None))
behavioral_data["Blood Pressure"] = behavioral_data["BPHIGH4"].map(lambda x: blood_pressure_values.get(x, None))
behavioral_data["Cancer"] = behavioral_data["CHCOCNCR"].map(lambda x: cancer_values.get(x, None))
behavioral_data["Diabetes"] = behavioral_data["DIABETE3"].map(lambda x: diabetes_values.get(x, None))
behavioral_data["Cholestrol"] = behavioral_data["TOLDHI2"].map(lambda x: cholestrol_values.get(x, None))
behavioral_data["BMI"] = behavioral_data["_BMI5CAT"].map(lambda x: bmi_values.get(x, None))
behavioral_data["Stroke"] = behavioral_data["CVDSTRK3"].map(lambda x: stroke_values.get(x, None))
behavioral_data["Age"] = behavioral_data["_AGEG5YR"].map(lambda x: age_values.get(x, None))
behavioral_data["HTIN4"] = behavioral_data["HTIN4"].map(lambda x: x * 0.08333)  # Converting to Feet from inches
behavioral_data["PA1MIN_"] = behavioral_data["PA1MIN_"] / 7   # Converting to Per day from Per Week
behavioral_data["_VEGESUM"] = round(behavioral_data["_VEGESUM"] / 100,2)
behavioral_data["_FRUTSUM"] = round(behavioral_data["_FRUTSUM"] / 100,2)



#### Removing outliers and NaN values.

In [None]:
# Upper bounds per column: any value above its bound is treated as an outlier
# and replaced by NaN.
outlier_caps = {
    "AVEDRNK2": 10,
    "_FRUTSUM": 20,
    "_VEGESUM": 20,
    "WEIGHT2": 9000,
}
for column, cap in outlier_caps.items():
    # ``cap=cap`` binds the current bound to the lambda at definition time.
    behavioral_data[column] = behavioral_data[column].map(
        lambda value, cap=cap: np.nan if value > cap else value
    )

# Discard every row that still contains a missing value, then renumber the index.
behavioral_data = behavioral_data.dropna().reset_index(drop=True)

behavioral_data.describe()

In [None]:
# The numeric coded columns are redundant once their text-label counterparts
# exist, so remove them.
coded_columns = [
    "_STATE", "SEX", "MARITAL", "EDUCA", "_RACEGR3", "INCOME2", "SMOKDAY2",
    "CVDCRHD4", "BPHIGH4", "CHCOCNCR", "TOLDHI2", "_BMI5CAT", "CVDSTRK3",
    "_AGEG5YR", "DIABETE3",
]
behavioral_data = behavioral_data.drop(columns=coded_columns)

behavioral_data.head()

In [None]:
# Give the remaining numeric columns readable names. The former "EDUCA" entry
# was removed: that column is dropped in the previous cell, so the entry was a
# no-op.
renamed_columns = {
    "WEIGHT2": "Weight(lbs)",
    "HTIN4": "Height(ft)",
    "AVEDRNK2": "Alcohol/Day",
    "_VEGESUM": "Vegetables/Day",
    "_FRUTSUM": "Fruits/Day",
    "PA1MIN_": "Physical Activity/Day(mints)",
}

# Final column order: demographics first, then lifestyle, then health outcomes.
column_order = [
    "State", "State Code", "Sex", "Marital Status", "Age", "Race", "Education",
    "Weight(lbs)", "Height(ft)", "Income", "Vegetables/Day", "Fruits/Day",
    "Physical Activity/Day(mints)", "Smoking", "Alcohol/Day", "BMI",
    "Blood Pressure", "Cholestrol", "Heart Disease", "Stroke", "Cancer",
    "Diabetes",
]

# Rename and reorder in one expression instead of mutating the frame in place.
behavioral_data = behavioral_data.rename(columns=renamed_columns)[column_order]

behavioral_data.head()

In [None]:
%%time
# Writing the cleaned data to a new csv file.
output_file = "./Resources/behavioral_revised_data_final.csv"
behavioral_data.to_csv(output_file,index=0)


In [12]:
# Confirm the column dtypes and non-null counts of the cleaned dataset.
behavioral_data.info()


NameError: name 'behavioral_data' is not defined

In [None]:
# Pairwise correlation of the numeric columns.
# Most columns of the cleaned frame hold text labels. Newer pandas releases
# raise on non-numeric columns in corr() instead of silently skipping them, so
# the numeric columns are selected explicitly (works on old and new pandas).
behavioral_data.select_dtypes(include="number").corr()

In [None]:
# Exclude respondents whose income label is "Refused".
is_income_refused = behavioral_data["Income"] == 'Refused'
behavioral_data = behavioral_data[~is_income_refused]


In [11]:
# Preview the data after removing rows whose income is "Refused".
behavioral_data.head()

NameError: name 'behavioral_data' is not defined