In [6]:
#imports
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
# Set the path to the data.
# The folder can be overridden with the ML_BOOTCAMP_DATA_DIR environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is not set, the original local folder is used.
input_folder = os.environ.get(
    'ML_BOOTCAMP_DATA_DIR',
    '/Users/alexandrabrown/Desktop/data_science/ML_bootcamp',
)
input_file   = 'ObesityDataSet.csv'
input_path   = os.path.join(input_folder, input_file)

# Load the CSV into a DataFrame; `df` is kept unmodified as the raw data.
df = pd.read_csv(input_path)
df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.00,yes,no,2.0,3.0,Sometimes,no,2.00,no,0.00,1.000,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.00,yes,no,3.0,3.0,Sometimes,yes,3.00,yes,3.00,0.000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.80,77.00,yes,no,2.0,3.0,Sometimes,no,2.00,no,2.00,1.000,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.80,87.00,no,no,3.0,3.0,Sometimes,no,2.00,no,2.00,0.000,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.80,no,no,2.0,1.0,Sometimes,no,2.00,no,0.00,0.000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,21,1.71,131.41,yes,yes,3.0,3.0,Sometimes,no,1.73,no,1.68,0.906,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,22,1.75,133.74,yes,yes,3.0,3.0,Sometimes,no,2.01,no,1.34,0.599,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,23,1.75,133.69,yes,yes,3.0,3.0,Sometimes,no,2.05,no,1.41,0.646,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24,1.74,133.35,yes,yes,3.0,3.0,Sometimes,no,2.85,no,1.14,0.586,Sometimes,Public_Transportation,Obesity_Type_III


### <span style='color:lightblue'>Behavioral pattern classification: Can food habits predict level of obesity?</span>

- FAVC – Whether the person frequently consumes high-calorie foods (yes/no)
- FCVC – Frequency of vegetable consumption (scale from 1 to 3)
- NCP – Number of main meals per day
- CH2O – Daily water intake (scale from 1 to 3)
- CALC – Frequency of alcohol consumption (no, Sometimes, Frequently, Always)

In [3]:
# Keep only the food-habit columns plus the obesity-level column.
habit_columns = ['FAVC', 'FCVC', 'NCP', 'CH2O', 'CALC']
target_column = 'NObeyesdad'

data = df.loc[:, habit_columns + [target_column]]
data

Unnamed: 0,FAVC,FCVC,NCP,CH2O,CALC,NObeyesdad
0,no,2.0,3.0,2.00,no,Normal_Weight
1,no,3.0,3.0,3.00,Sometimes,Normal_Weight
2,no,2.0,3.0,2.00,Frequently,Normal_Weight
3,no,3.0,3.0,2.00,Frequently,Overweight_Level_I
4,no,2.0,1.0,2.00,Sometimes,Overweight_Level_II
...,...,...,...,...,...,...
2106,yes,3.0,3.0,1.73,Sometimes,Obesity_Type_III
2107,yes,3.0,3.0,2.01,Sometimes,Obesity_Type_III
2108,yes,3.0,3.0,2.05,Sometimes,Obesity_Type_III
2109,yes,3.0,3.0,2.85,Sometimes,Obesity_Type_III


In [4]:
# Pre-process the data: encode the categorical columns as integers.
# 1) make FAVC binary (0 or 1)
# 2) make CALC ordinal (0,1,2,3)
# 3) make NObeyesdad ordinal (0,1,2,3,4,5,6)

favc_codes = {'no': 0, 'yes': 1}
calc_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}

# Insufficient Weight, Normal Weight, Overweight Level I,
# Overweight Level II, Obesity Type I, Obesity Type II, Obesity Type III
obesity_ratings = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6}

# Re-select the raw columns from `df` instead of overwriting `data` in place.
# The mappings only match the original string labels, so running the cell a
# second time on already-encoded integers would turn every value into NaN.
# Building from the untouched `df` makes this cell safe to re-run.
data = df.loc[:, data.columns].assign(
    FAVC=lambda frame: frame['FAVC'].map(favc_codes),
    CALC=lambda frame: frame['CALC'].map(calc_codes),
    NObeyesdad=lambda frame: frame['NObeyesdad'].map(obesity_ratings),
)
data

Unnamed: 0,FAVC,FCVC,NCP,CH2O,CALC,NObeyesdad
0,0,2.0,3.0,2.00,0,1
1,0,3.0,3.0,3.00,1,1
2,0,2.0,3.0,2.00,2,1
3,0,3.0,3.0,2.00,2,2
4,0,2.0,1.0,2.00,1,3
...,...,...,...,...,...,...
2106,1,3.0,3.0,1.73,1,6
2107,1,3.0,3.0,2.01,1,6
2108,1,3.0,3.0,2.05,1,6
2109,1,3.0,3.0,2.85,1,6


In [5]:
# Count the NaN entries in every column of `data`.
null_counts = data.isna().sum()

# Keep only the columns that contain at least one NaN.
missing_values = null_counts[null_counts.gt(0)]

# Report the outcome: either the per-column counts or an all-clear message.
if len(missing_values) > 0:
    print("Missing values detected:")
    print(missing_values)
else:
    print("No missing values in the dataset.")

No missing values in the dataset.


In [None]:
# TODO: split the data for training / testing
