# Introduction

The EDA aims to explore the model's performance and accuracy in specific scenarios and to determine a preliminary degree of fidelity in its estimation.

# Import Libraries

In [105]:
# Importing libraries
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelBinarizer , LabelEncoder

# Data Cleaning and Treating [Part I]

The dataset was loaded and its string features were converted to categorical data type and encoded to numerical values. It was not necessary to drop null values or deal with missing values. 

In [221]:
# Loading [raw] dataset
# The path is relative, so the CSV file has to be located in the directory the
# notebook is executed from.
ObesityDataset_Raw = pd.read_csv('ObesityDataset_Raw.csv')

In [222]:
# Getting categorical and numerical features
#
# A column counts as categorical when it holds text, i.e. when its dtype is the
# generic `object` dtype or pandas' dedicated string dtype. Comparing the dtype
# against the literal 'object' alone would miss the latter and silently send
# every text column to the numerical list.
# This cell has to run before the text columns are cast to `category`, because
# a `category` dtype matches neither check.
# (`NumbericalFeatures` keeps its original spelling because other cells may
# reference it.)
CategoricalFeatures = []
NumbericalFeatures = []

for label in ObesityDataset_Raw.columns:
    ColumnDtype = ObesityDataset_Raw[label].dtype
    IsTextColumn = (
        pd.api.types.is_object_dtype(ColumnDtype)
        or isinstance(ColumnDtype, pd.StringDtype)
    )
    if IsTextColumn:
        CategoricalFeatures.append(label)
    else:
        NumbericalFeatures.append(label)

In [223]:
# Capitalizing values of categorical features
def CapitalizeIfNecessary(value):
    """Return 'No' / 'Yes' for the lowercase inputs 'no' / 'yes'.

    Any other value (differently cased text, other labels, non-strings) is
    returned unchanged.
    """
    is_lowercase_flag = value in ('no', 'yes')
    return value.capitalize() if is_lowercase_flag else value

# Run CapitalizeIfNecessary on every single value of the categorical columns
# and write the result back into those same columns of the raw frame.
# NOTE(review): this relies on the installed pandas providing `DataFrame.map`
# (older releases expose the element-wise version as `applymap`) — confirm the
# minimum pandas version for this notebook.
ObesityDataset_Raw[CategoricalFeatures] = ObesityDataset_Raw[CategoricalFeatures].map(CapitalizeIfNecessary)

In [224]:
# Casting each categorical feature to the pandas `category` dtype
ObesityDataset_Raw[CategoricalFeatures] = (
    ObesityDataset_Raw[CategoricalFeatures]
    .astype({ColumnName: 'category' for ColumnName in CategoricalFeatures})
)

In [231]:
# Encoding categorical features into numerical features
#
# Every `<feature>_Encode` column is stored as a plain numeric column.
# Calling `.map` directly on a `category` column only relabels its categories
# and hands back another `category` column, so each source column is cast to
# `object` before it is mapped. Labels missing from a mapping become NaN.

## Feature NObeyesdad
# The list order defines the ordinal code: position 0 is the lowest weight class.
OrderedCategories_NObeyesdad = ['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']
Encoder_NObeyesdad = {category:value for value , category in enumerate(OrderedCategories_NObeyesdad)}
ObesityDataset_Raw['NObeyesdad_Encode'] = ObesityDataset_Raw['NObeyesdad'].astype(object).map(Encoder_NObeyesdad)

## Feature Gender
Values_Gender = ['Female', 'Male']
Encoder_Gender = {category:value for value , category in enumerate(Values_Gender)}
ObesityDataset_Raw['Gender_Encode'] = ObesityDataset_Raw['Gender'].astype(object).map(Encoder_Gender)

## Other features
Values_Binary = ['Yes','No']
__Encoder_Binary = LabelBinarizer().fit(Values_Binary)

def Encoder_Binary(value):
    """Return the single 0/1 code the fitted LabelBinarizer assigns to `value`."""
    # transform() returns a 2-D array with one row per input; [0,0] unwraps the scalar.
    return __Encoder_Binary.transform([value])[0,0]

for feature in CategoricalFeatures:
    # Gender and NObeyesdad already received their dedicated encodings above.
    if feature in ('Gender', 'NObeyesdad'):
        continue

    FeatureValues = ObesityDataset_Raw[feature]
    if len(FeatureValues.unique()) > 2:
        # The encoder output is assigned as a bare array (position-based), not
        # wrapped in a DataFrame: a DataFrame would be aligned on index labels
        # and leave NaN wherever the frame's index is not the default 0..n-1.
        # NOTE(review): LabelEncoder numbers the labels in sorted order, which
        # need not match any natural ordering of the feature — confirm that is
        # acceptable for the multi-valued features.
        ObesityDataset_Raw[feature+'_Encode'] = LabelEncoder().fit_transform(FeatureValues)
    else:
        # The binarizer is called once per distinct label instead of once per row.
        BinaryCodes = {label: int(Encoder_Binary(label)) for label in FeatureValues.dropna().unique()}
        ObesityDataset_Raw[feature+'_Encode'] = FeatureValues.astype(object).map(BinaryCodes)

In [227]:
# Column names of the encoded counterparts, one per categorical feature
CategoricalFeatures_Encode = [
    ColumnName + '_Encode'
    for ColumnName in CategoricalFeatures
]

# Data Visualization and Analysis

Because physical condition (insufficient weight, normal weight, overweight and obesity) is strongly related to a person's height, weight, gender and age, these features are relevant for the model.

In [None]:
# Weight against Height with one regression line per obesity level,
# drawn in a separate panel for each gender.
ObesityLevelOrder = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]

sns.lmplot(
    data=ObesityDataset_Raw,
    x='Height',
    y='Weight',
    hue='NObeyesdad',
    col='Gender',
    hue_order=ObesityLevelOrder,
    palette='Set2',
    ci=None,
)

The plot above shows this strong relation and also shows that there are not enough data points (samples) for some combinations of obesity level and gender. Therefore, in some scenarios the model will have poor estimation quality.

In [None]:
# Summary statistics of Age for every obesity level / gender pair that occurs in the data
(
    ObesityDataset_Raw
    .groupby(by=['NObeyesdad', 'Gender'], observed=True)['Age']
    .describe()
)