# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('risk_factor_surveillance_database.csv')
df

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Total,Total,OVR,OVERALL
1,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Male,GEN,MALE
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Female,GEN,FEMALE
3,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,Less than high school,EDU,EDUHS
4,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,High school graduate,EDU,EDUHSGRAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53387,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,Asian,RACE,RACEASN
53388,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,Hawaiian/Pacific Islander,RACE,RACEHPI
53389,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
53390,2016,2016,VI,Virgin Islands,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.335765, -64.896335)",PA,PA1,Q047,VALUE,78,Race/Ethnicity,2 or more races,RACE,RACE2PLUS


# Removing rows with no sample sizes.

In [3]:
df.drop(df[df['Data_Value_Footnote_Symbol'] == '~'].index, inplace=True)

# Dropping Unnecessary Columns

In [4]:
df = df.drop(
    ['YearStart', 'YearEnd', 'LocationDesc', 'LocationAbbr', 'LocationDesc', 'ClassID', 'TopicID', 'DataValueTypeID',
     'StratificationCategoryId1', 'Stratification1', 'StratificationCategory1', 'Data_Value_Footnote',
     'Data_Value_Footnote_Symbol', 'Data_Value_Alt', 'Data_Value_Type', 'Data_Value_Unit', 'Question', 'Topic', 'Class',
     'Datasource', 'GeoLocation'], axis=1)

# Getting obesity data

In [5]:
obesity_df = df.loc[(df.QuestionID == 'Q036') | (df.QuestionID == 'Q037')]
obesity_df = obesity_df.drop(['StratificationID1', 'QuestionID'], axis=1)

# Using Different columns as labels

In [6]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

def predict_label(df, label_name):
    label_encoder = LabelEncoder()
    df[label_name] = label_encoder.fit_transform(df[label_name])
    x = df.drop(label_name, axis=1)
    y = df[label_name]
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, train_size=0.8, test_size=0.2)
    my_model = XGBClassifier()
    my_model.fit(x_train, y_train)
    predictions = my_model.predict_proba(x_valid)
    print(f'{label_name} loss: \t\t\t', log_loss(y_true=y_valid, y_pred=predictions))


# Splitting Data With Gender And Preprocessing

In [7]:
gender_df = obesity_df.dropna(subset=['Gender'])
gender_df = gender_df.dropna(axis=1)

age_df = obesity_df.dropna(subset=['Age(years)'])
age_df = age_df.dropna(axis=1)

education_df = obesity_df.dropna(subset=['Education'])
education_df = education_df.dropna(axis=1)

income_df = obesity_df.dropna(subset=['Income'])
income_df = income_df.dropna(axis=1)

ethnicity_df = obesity_df.dropna(subset=['Race/Ethnicity'])
ethnicity_df = ethnicity_df.dropna(axis=1)

predict_label(gender_df, 'Gender')
predict_label(age_df, 'Age(years)')
predict_label(education_df, 'Education')
predict_label(income_df, 'Income')
predict_label(ethnicity_df, 'Race/Ethnicity')

Gender loss: 			 0.42582397105698344
Age(years) loss: 			 0.8077258966412245
Education loss: 			 0.5876218448629409
Income loss: 			 1.369927633019206
Race/Ethnicity loss: 			 0.8813661424995376
