# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('risk_factor_surveillance_database.csv')
df

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Total,Total,OVR,OVERALL
1,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Male,GEN,MALE
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Female,GEN,FEMALE
3,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,Less than high school,EDU,EDUHS
4,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,High school graduate,EDU,EDUHSGRAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53387,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,Asian,RACE,RACEASN
53388,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,Hawaiian/Pacific Islander,RACE,RACEHPI
53389,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
53390,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,2 or more races,RACE,RACE2PLUS


# Removing rows with no sample sizes.

In [4]:
df.drop(df[df['Data_Value_Footnote_Symbol'] == '~'].index, inplace=True)

# Dropping Unnecessary Columns

In [5]:
df = df.drop(
    ['YearEnd', 'LocationDesc', 'LocationDesc', 'ClassID', 'TopicID', 'DataValueTypeID',
     'StratificationCategoryId1', 'Stratification1', 'StratificationCategory1', 'Data_Value_Footnote', 'Data_Value_Footnote_Symbol', 'Data_Value_Alt', 'Data_Value_Type', 'Data_Value_Unit', 'Question', 'Topic', 'Class', 'LocationAbbr', 'Datasource', 'GeoLocation', 
     'Low_Confidence_Limit', 'High_Confidence_Limit '], 
    axis=1)
df

Unnamed: 0,YearStart,Data_Value,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,QuestionID,LocationID,StratificationID1
0,2011,32.0,7304.0,Total,,,,,,Q036,1,OVERALL
1,2011,32.3,2581.0,,,,Male,,,Q036,1,MALE
2,2011,31.8,4723.0,,,,Female,,,Q036,1,FEMALE
3,2011,33.6,1153.0,,,Less than high school,,,,Q036,1,EDUHS
4,2011,32.8,2402.0,,,High school graduate,,,,Q036,1,EDUHSGRAD
...,...,...,...,...,...,...,...,...,...,...,...,...
53382,2016,13.3,212.0,,,,,"$75,000 or greater",,Q047,78,INC75PLUS
53383,2016,25.3,137.0,,,,,Data not reported,,Q047,78,INCNR
53384,2016,18.3,154.0,,,,,,Non-Hispanic White,Q047,78,RACEWHT
53385,2016,24.1,820.0,,,,,,Non-Hispanic Black,Q047,78,RACEBLK


# Getting obesity data

In [6]:
obesity_df = df.loc[(df.QuestionID == 'Q036') | (df.QuestionID == 'Q037')]
obesity_df = obesity_df.drop(['StratificationID1', 'QuestionID'], axis=1)

# Testing Label and Ordinal Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

obesity_df_1 = obesity_df.copy()
label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()

obesity_df_1['Age(years)'] = ordinal_encoder.fit_transform(obesity_df['Age(years)'].to_numpy().reshape(-1, 1))

obesity_df_1['YearStart'] = ordinal_encoder.fit_transform(obesity_df['YearStart'].to_numpy().reshape(-1, 1))

obesity_df_1['Education'] = ordinal_encoder.fit_transform(obesity_df['Education'].to_numpy().reshape(-1, 1))

obesity_df_1['Gender'] = label_encoder.fit_transform(obesity_df['Gender'])
obesity_df_1['Gender'] = obesity_df_1['Gender'].replace(2, np.nan)

obesity_df_1['Income'] = ordinal_encoder.fit_transform(obesity_df['Income'].to_numpy().reshape(-1, 1))

obesity_df_1['Race/Ethnicity'] = label_encoder.fit_transform(obesity_df['Race/Ethnicity'])
obesity_df_1['Race/Ethnicity'] = obesity_df_1['Race/Ethnicity'].replace(8, np.nan)

obesity_df_1['Total'] = label_encoder.fit_transform(obesity_df['Total'])

obesity_df_1['Total'].value_counts()
obesity_df_1['Total'] = obesity_df_1['Total'].replace(1, np.nan)


In [8]:
x = obesity_df_1.drop('Data_Value', axis=1)
y = obesity_df_1['Data_Value']

x_train, x_valid, y_train, y_valid = train_test_split(x, y)

my_model = XGBRegressor()
my_model.fit(x_train, y_train)
predictions = my_model.predict(x_valid)
print('MSE: ', mean_squared_error(y_true=y_valid, y_pred=predictions))
print('R2: ', r2_score(y_true=y_valid, y_pred=predictions))

MSE:  43.065581220472936
R2:  0.019290847543379286


# Testing One-Hot Encoder

In [9]:
obesity_df_2 = obesity_df.copy()
obesity_df_2 = pd.get_dummies(data=obesity_df_2, columns=['Total','Age(years)','Education','Gender','Income','Race/Ethnicity', 'YearStart'], dtype=int)

In [10]:
x = obesity_df_2.drop('Data_Value', axis=1)
y = obesity_df_2['Data_Value']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, train_size=0.8, test_size=0.2)

my_model_2 = XGBRegressor()
my_model_2.fit(x_train, y_train)
predictions_2 = my_model_2.predict(x_valid)
print('MSE: ', mean_squared_error(y_true=y_valid, y_pred=predictions_2))
print('R2: ', r2_score(y_true=y_valid, y_pred=predictions_2))

MSE:  43.46454446353386
R2:  -0.022539433280337917


# Testing Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score

obesity_df_3 = obesity_df.copy()
obesity_df_3 = pd.get_dummies(data=obesity_df_3, columns=['Total','Age(years)','Education','Gender','Income','Race/Ethnicity', 'YearStart'], dtype=int)

x = obesity_df_3.drop('Data_Value', axis=1)
y = obesity_df_3['Data_Value']

my_model_3 = XGBRegressor()
scores = -1 * cross_val_score(my_model_3, x, y, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)

print("MSE: ", scores.mean())

MSE:  38.430399191457425


In [12]:
# Thanks for Reading