## 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Default figure size for every plot drawn through seaborn/matplotlib.
sns.set(rc={'figure.figsize': (15, 10)})
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics

## 2. Importing Dataset

In [7]:
# Load the dataset CSV; the relative path is resolved from the working directory.
raw_data = pd.read_csv(filepath_or_buffer='data/dataset.csv')

In [8]:
# Preview the first five rows of the loaded frame.
raw_data.head(n=5)

Unnamed: 0,Year,LocationAbbr,LocationDesc,Datasource,PriorityArea1,PriorityArea2,PriorityArea3,PriorityArea4,Category,Topic,Indicator,Data_Value_Type,Data_Value_Unit,Data_Value,Data_Value_Alt,Data_Value_Footnote_Symbol,Data_Value_Footnote,Confidence_Limit_Low,Confidence_Limit_High,Break_Out_Category,Break_out,CategoryID,TopicID,IndicatorID,Data_Value_TypeID,BreakoutCategoryID,BreakOutID,LocationID,GeoLocation
0,2011,AL,Alabama,BRFSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,Prevalence of major cardiovascular disease amo...,Age-Standardized,Percent (%),9.9,9.9,,,9.2,10.7,Overall,Overall,C1,T1,BR001,AgeStdz,BOC01,OVR01,1,"(32.84057112200048, -86.63186076199969)"
1,2011,AL,Alabama,BRFSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,Prevalence of major cardiovascular disease amo...,Crude,Percent (%),11.0,11.0,,,10.2,11.9,Overall,Overall,C1,T1,BR001,Crude,BOC01,OVR01,1,"(32.84057112200048, -86.63186076199969)"
2,2011,AL,Alabama,BRFSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,Prevalence of major cardiovascular disease amo...,Crude,Percent (%),12.5,12.5,,,11.1,14.0,Gender,Male,C1,T1,BR001,Crude,BOC02,GEN01,1,"(32.84057112200048, -86.63186076199969)"
3,2011,AL,Alabama,BRFSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,Prevalence of major cardiovascular disease amo...,Age-Standardized,Percent (%),11.8,11.8,,,10.6,13.2,Gender,Male,C1,T1,BR001,AgeStdz,BOC02,GEN01,1,"(32.84057112200048, -86.63186076199969)"
4,2011,AL,Alabama,BRFSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,Prevalence of major cardiovascular disease amo...,Age-Standardized,Percent (%),8.3,8.3,,,7.5,9.1,Gender,Female,C1,T1,BR001,AgeStdz,BOC02,GEN02,1,"(32.84057112200048, -86.63186076199969)"


In [12]:
# Dimensions of the frame as a (rows, columns) tuple.
raw_data.shape

(85800, 29)

#### Calculating the Percentage of Null Values in Each Column

In [17]:
# Missing values per column, expressed as a percentage of all rows (2 decimals).
raw_data.isna().sum().div(len(raw_data)).mul(100).round(2)

Year                           0.00
LocationAbbr                   0.00
LocationDesc                   0.00
Datasource                     0.00
PriorityArea1                  0.00
PriorityArea2                  0.00
PriorityArea3                  0.00
PriorityArea4                  0.00
Category                       0.00
Topic                          0.00
Indicator                      0.00
Data_Value_Type                0.00
Data_Value_Unit                0.00
Data_Value                    33.53
Data_Value_Alt                 0.00
Data_Value_Footnote_Symbol    66.47
Data_Value_Footnote           66.47
Confidence_Limit_Low          35.16
Confidence_Limit_High         35.16
Break_Out_Category             0.00
Break_out                      0.00
CategoryID                     0.00
TopicID                        0.00
IndicatorID                    0.00
Data_Value_TypeID              0.00
BreakoutCategoryID             0.00
BreakOutID                     0.00
LocationID                  

## 3. Data Cleaning

### 3.1. Year Column

In [23]:
# Count the non-null entries for each distinct Year value.
raw_data.groupby('Year')['Year'].count()

Year
2011    17160
2012    17160
2013    17160
2014    17160
2015    17160
Name: Year, dtype: int64

### 3.2. Location Abbreviations & Description

##### Location abbreviation and description represent the same data. Hence, we will drop the abbreviation column.

In [26]:
# Drop the abbreviation column; LocationDesc is kept.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column is already gone from raw_data.
raw_data = raw_data.drop(columns=['LocationAbbr'], errors='ignore')

In [28]:
# Count the non-null entries for each distinct LocationDesc value.
raw_data.groupby('LocationDesc')['LocationDesc'].count()

LocationDesc
Alabama                 1650
Alaska                  1650
Arizona                 1650
Arkansas                1650
California              1650
Colorado                1650
Connecticut             1650
Delaware                1650
Florida                 1650
Georgia                 1650
Hawaii                  1650
Idaho                   1650
Illinois                1650
Indiana                 1650
Iowa                    1650
Kansas                  1650
Kentucky                1650
Louisiana               1650
Maine                   1650
Maryland                1650
Massachusetts           1650
Median of all states    1650
Michigan                1650
Minnesota               1650
Mississippi             1650
Missouri                1650
Montana                 1650
Nebraska                1650
Nevada                  1650
New Hampshire           1650
New Jersey              1650
New Mexico              1650
New York                1650
North Carolina          1650
N

### 3.3. Data Source Column

In [29]:
# Count the non-null entries for each distinct Datasource value.
raw_data.groupby('Datasource')['Datasource'].count()

Datasource
BRFSS    85800
Name: Datasource, dtype: int64

#### The Data Source column contains only one value, so it can be dropped: a constant column cannot have any impact on the target variable.

In [30]:
# Drop the Datasource column.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column is already gone from raw_data.
raw_data = raw_data.drop(columns=['Datasource'], errors='ignore')

### 3.4. Priority Areas

In [31]:
# Count the non-null entries for each distinct PriorityArea1 value.
raw_data.groupby('PriorityArea1')['PriorityArea1'].count()

PriorityArea1
Million Hearts    11440
None              74360
Name: PriorityArea1, dtype: int64

In [41]:
# Encode PriorityArea1 as a 0/1 flag: 1 for 'Million Hearts', 0 otherwise.
# Matching on both the label and the already-encoded 1 makes the cell safe to
# re-run; the previous .map({...}) turned every value into NaN on a second run
# because 1 and 0 are not keys of the mapping.
# Note: any value other than the label (including a missing one) becomes 0.
raw_data['PriorityArea1'] = (
    raw_data['PriorityArea1'].isin(['Million Hearts', 1]).astype('int64')
)

In [32]:
# Count the non-null entries for each distinct PriorityArea2 value.
raw_data.groupby('PriorityArea2')['PriorityArea2'].count()

PriorityArea2
ABCS     5720
None    80080
Name: PriorityArea2, dtype: int64

In [43]:
# Encode PriorityArea2 as a 0/1 flag: 1 for 'ABCS', 0 otherwise.
# Matching on both the label and the already-encoded 1 makes the cell safe to
# re-run; the previous .map({...}) turned every value into NaN on a second run
# because 1 and 0 are not keys of the mapping.
# Note: any value other than the label (including a missing one) becomes 0.
raw_data['PriorityArea2'] = (
    raw_data['PriorityArea2'].isin(['ABCS', 1]).astype('int64')
)

In [44]:
# Count the non-null entries for each distinct PriorityArea3 value.
raw_data.groupby('PriorityArea3')['PriorityArea3'].count()

PriorityArea3
Healthy People 2020    45760
None                   40040
Name: PriorityArea3, dtype: int64

In [45]:
# Encode PriorityArea3 as a 0/1 flag: 1 for 'Healthy People 2020', 0 otherwise.
# Matching on both the label and the already-encoded 1 makes the cell safe to
# re-run; the previous .map({...}) turned every value into NaN on a second run
# because 1 and 0 are not keys of the mapping.
# Note: any value other than the label (including a missing one) becomes 0.
raw_data['PriorityArea3'] = (
    raw_data['PriorityArea3'].isin(['Healthy People 2020', 1]).astype('int64')
)

#### The Priority Area 4 column contains only one value, so it can be dropped: a constant column cannot have any impact on the target variable.

In [37]:
# Count the non-null entries for each distinct PriorityArea4 value.
raw_data.groupby('PriorityArea4')['PriorityArea4'].count()

PriorityArea4
None    85800
Name: PriorityArea4, dtype: int64

In [38]:
# Drop the PriorityArea4 column.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column is already gone from raw_data.
raw_data = raw_data.drop(columns=['PriorityArea4'], errors='ignore')

### 3.5. Category

In [49]:
# Count the non-null entries for each distinct Category value.
raw_data.groupby('Category')['Category'].count()

Category
Cardiovascular Diseases    28600
Risk Factors               57200
Name: Category, dtype: int64

### 3.6. Topic & Indicator

In [51]:
# Count the non-null entries for each distinct Topic value.
raw_data.groupby('Topic')['Topic'].count()

Topic
Acute Myocardial Infarction (Heart Attack)    11440
Cholesterol Abnormalities                     11440
Coronary Heart Disease                         5720
Diabetes                                       5720
Hypertension                                  11440
Major Cardiovascular Disease                   5720
Nutrition                                      5720
Obesity                                       11440
Physical Inactivity                            5720
Smoking                                        5720
Stroke                                         5720
Name: Topic, dtype: int64

In [60]:
# Count the non-null entries for each distinct Indicator value.
raw_data.groupby(['Indicator'])['Indicator'].count()

Indicator
Prevalence of acute myocardial infarction (heart attack) among US adults (18+); BRFSS                    5720
Prevalence of cholesterol screening in the past 5 years among US adults (20+); BRFSS                     5720
Prevalence of consuming fruits and vegetables less than 5 times per day among US adults (18+); BRFSS     5720
Prevalence of coronary heart disease among US adults (18+); BRFSS                                        5720
Prevalence of current smoking among US adults (18+); BRFSS                                               5720
Prevalence of diabetes among US adults (18+); BRFSS                                                      5720
Prevalence of healthy weight among US adults (20+); BRFSS                                                5720
Prevalence of high total cholesterol among US adults (20+); BRFSS                                        5720
Prevalence of hypertension among US adults (18+); BRFSS                                                  5720


#### On carefully observing the Topic and Indicator columns, we can see that the Indicator column is an extended form of the Topic column, with more detailed information.
#### So, we will drop the Topic column and retain the Indicator column.

In [78]:
# Drop the Topic column; the Indicator column is kept.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column is already gone from raw_data.
raw_data = raw_data.drop(columns=['Topic'], errors='ignore')

### 3.7. Data Value Type

In [80]:
# Count the non-null entries for each distinct Data_Value_Type value.
raw_data.groupby(['Data_Value_Type'])['Data_Value_Type'].count()

Data_Value_Type
Age-Standardized    31200
Crude               54600
Name: Data_Value_Type, dtype: int64