In [19]:
# Import necessary libraries
import pickle

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [20]:
# Read the obesity survey CSV and preview its first four rows
df = pd.read_csv('obesity_data.csv')
df.head(n=4)

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I


In [21]:
# Separate the label column from the predictor columns
target_col = 'NObeyesdad'
y = df[target_col]                 # Target labels
X = df.drop(columns=[target_col])  # Features

In [22]:
# Encode categorical features using One-Hot Encoding
categorical_cols = ['Gender','CALC','FAVC','SCC','SMOKE','family_history_with_overweight', 'CAEC', 'MTRANS']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    # Older releases only accept the original `sparse` keyword
    encoder = OneHotEncoder(drop='first', sparse=False)

# Keep X's index on the encoded frame so the column-wise concat below lines the
# rows up by label instead of relying on X having a 0..n-1 index.
encoded_categoricals = pd.DataFrame(
    encoder.fit_transform(X[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,
)

# Numeric columns first, followed by the one-hot columns
X_encoded = pd.concat([X.drop(columns=categorical_cols), encoded_categoricals], axis=1)
X_encoded



Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,CALC_Frequently,...,SCC_yes,SMOKE_yes,family_history_with_overweight_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.000000,1.620000,64.000000,2.0,3.0,2.000000,0.000000,1.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,21.000000,1.520000,56.000000,3.0,3.0,3.000000,3.000000,0.000000,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,23.000000,1.800000,77.000000,2.0,3.0,2.000000,2.000000,1.000000,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,27.000000,1.800000,87.000000,3.0,3.0,2.000000,2.000000,0.000000,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,22.000000,1.780000,89.800000,2.0,1.0,2.000000,0.000000,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.0,3.0,1.728139,1.676269,0.906247,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2107,21.982942,1.748584,133.742943,3.0,3.0,2.005130,1.341390,0.599270,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2108,22.524036,1.752206,133.689352,3.0,3.0,2.054193,1.414209,0.646288,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2109,24.361936,1.739450,133.346641,3.0,3.0,2.852339,1.139107,0.586035,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Fit a 100-tree random forest on the training split (seeded for repeatability)
forest_params = dict(n_estimators=100, random_state=42)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [25]:
# Score the classifier on the held-out split and report the accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.95


In [26]:
# Show the encoded feature columns (X_train/X_test are split from this frame)
X_encoded.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Male', 'CALC_Frequently', 'CALC_Sometimes', 'CALC_no',
       'FAVC_yes', 'SCC_yes', 'SMOKE_yes',
       'family_history_with_overweight_yes', 'CAEC_Frequently',
       'CAEC_Sometimes', 'CAEC_no', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking'],
      dtype='object')

In [27]:
# Define a function to decode categorical variables based on user input
def decode_categorical(feature_name, user_input):
    """Translate a categorical answer into its integer code.

    String answers are matched against the known labels ignoring case and
    surrounding whitespace, so "Yes", "yes" and " YES " all decode alike.

    Parameters
    ----------
    feature_name : str
        One of the categorical feature names listed in ``mappings``.
    user_input : object
        The answer to decode, normally the string typed by the user.

    Returns
    -------
    int or None
        The code for the answer, or ``None`` when the answer is not a known
        label for that feature.

    Raises
    ------
    KeyError
        If ``feature_name`` is not one of the known categorical features.
    """
    # Define mappings for categorical variables
    mappings = {
        'Gender': {'Male': 1, 'Female': 0},
        'FAVC': {'yes': 1, 'no': 0},
        'SCC': {'yes': 1, 'no': 0},
        'SMOKE': {'yes': 1, 'no': 0},
        'family_history_with_overweight': {'yes': 1, 'no': 0},
        'CAEC': {'no': 0, 'Always': 1, 'Frequently': 2,'Sometimes': 3},
        'CALC': {'no': 0, 'Always': 1, 'Frequently': 2,'Sometimes': 3},
        # Spelled like the dataset label; the case-insensitive match below still
        # accepts the previous 'public_Transportation' spelling.
        'MTRANS': {'Motorbike': 0, 'Automobile': 1, 'Bike': 2, 'Walking': 3, 'Public_Transportation': 4}
    }
    codes = mappings[feature_name]

    # Non-string values keep the plain dictionary lookup (unknown -> None)
    if not isinstance(user_input, str):
        return codes.get(user_input)

    wanted = user_input.strip().lower()
    for label, code in codes.items():
        if label.lower() == wanted:
            return code
    return None

In [28]:
# Collect user input for each feature
user_input = {}
user_input['Age'] = float(input("Enter Age: "))
# State the units: the training data holds heights like 1.62 and weights like 64.0
user_input['Height'] = float(input("Enter Height in metres (e.g. 1.62): "))
user_input['Weight'] = float(input("Enter Weight in kilograms (e.g. 64): "))
user_input['FCVC'] = float(input("How many vegetables do you usually eat in your meals? "))
user_input['NCP'] = float(input("How many main meals do you have daily?"))
user_input['CH2O'] = float(input("How much water do you drink daily?  "))
user_input['FAF'] = float(input("How often do you have physical activity? "))
user_input['TUE'] = float(input("How much time do you use technological devices such as cell phone, videogames, television, computer and others? (from 0 to 2)"))
# Collect user input for categorical variables.
# Each answer is decoded with the mapping of its own feature, and every prompt
# lists the spellings that decode_categorical recognises.
user_input['Gender'] = decode_categorical('Gender', input("Enter Gender (Male/Female): "))
user_input['family_history_with_overweight'] = decode_categorical('family_history_with_overweight', input("Has a family member suffered or suffers from overweight? (yes/no): "))
user_input['CAEC'] = decode_categorical('CAEC', input("Do you eat any food between meals? (Always/Frequently/Sometimes/no): "))
user_input['MTRANS'] = decode_categorical('MTRANS', input("Which transportation do you usually use? (Automobile/Bike/Motorbike/Walking/public_Transportation): "))
user_input['CALC'] = decode_categorical('CALC', input("How often do you drink alcohol? (Always/Frequently/Sometimes/no): "))
user_input['SCC'] = decode_categorical('SCC', input("Do you monitor the calories you eat daily? (yes/no): "))
user_input['SMOKE'] = decode_categorical('SMOKE', input("Do you smoke? (yes/no): "))
user_input['FAVC'] = decode_categorical('FAVC', input("Do you eat high caloric food frequently? (yes/no): "))

Enter Age:  18
Enter Height:  156
Enter Weight:  57
How many vegetables do you usually eat in your meals?  3
How many main meals do you have daily? 3
How much water do you drink daily?   0.5
How often do you have physical activity?  2
How much time do you use technological devices such as cell phone, videogames, television, computer and others? (from 0 to 2) 2
Enter Gender (Male/Female):  Female
Has a family member suffered or suffers from overweight? (Yes/No):  yes
Do you eat any food between meals? (Frequently/Sometimes/No):  Sometimes
Which transportation do you usually use? (Automobile/Bike/Motorbike/Walking/public_Transportation):  Walking
How often do you drink alcohol? no
Do you monitor the calories you eat daily?  no
Do you smoke?  no
Do you eat high caloric food frequently? no


In [29]:
# Encode the user's answers the same way the training data was encoded.
# `user_input` already holds integer codes for the categorical features (they
# were decoded when the answers were collected), so they must not be decoded a
# second time. Each code is mapped back to the category label the fitted
# encoder knows, and the encoder then produces the one-hot columns.
category_labels = {}
for feature_name, known_labels in zip(categorical_cols, encoder.categories_):
    # Recover code -> label by running every label the encoder was fitted on
    # through decode_categorical; labels it cannot decode are left out.
    label_for_code = {}
    for label in known_labels:
        code = decode_categorical(feature_name, label)
        if code is not None:
            label_for_code[code] = label

    answer_code = user_input.get(feature_name)
    if answer_code in label_for_code:
        category_labels[feature_name] = label_for_code[answer_code]
    else:
        # Unrecognised answer: fall back to the encoder's first category, the
        # one drop='first' represents as all zeros, and tell the user.
        category_labels[feature_name] = known_labels[0]
        print(f"Warning: unrecognised answer for {feature_name}; using '{known_labels[0]}'")

# One-hot columns produced by the fitted encoder, named as during training
categorical_part = pd.DataFrame(
    encoder.transform(pd.DataFrame([category_labels], columns=categorical_cols)),
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Numeric answers are passed through unchanged
numeric_part = pd.DataFrame(
    [{name: value for name, value in user_input.items() if name not in categorical_cols}]
)

encoded_new_data = pd.concat([numeric_part, categorical_part], axis=1)

# Reorder columns to match the order seen during training
encoded_new_data = encoded_new_data[X_train.columns]

# Make predictions for the new data
new_data_pred = clf.predict(encoded_new_data)
predicted_obesity_level = new_data_pred[0]

print(f"Predicted Obesity Level: {predicted_obesity_level}")



Predicted Obesity Level: Normal_Weight


In [30]:
# Persist the trained classifier to disk.
# Pickle files run code when loaded, so only load this file from a trusted source.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [31]:
# Save the encoder so new inputs can be one-hot encoded the same way as the
# training data
with open('encoder.pkl', 'wb') as file:
    joblib.dump(encoder, file)

In [32]:
# Display the held-out test features
X_test

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,CALC_Frequently,...,SCC_yes,SMOKE_yes,family_history_with_overweight_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
544,20.406871,1.755978,53.699561,2.000000,3.891994,1.863930,2.870127,2.000000,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1987,26.000000,1.624950,111.004920,3.000000,3.000000,2.704315,0.000000,0.322666,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
420,18.000000,1.850000,60.000000,3.000000,4.000000,2.000000,2.000000,0.000000,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
527,21.000000,1.520000,42.000000,3.000000,1.000000,1.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
196,22.000000,1.750000,74.000000,2.000000,3.000000,2.000000,1.000000,2.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,19.000000,1.800000,87.000000,2.000000,4.000000,2.000000,2.000000,1.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1793,30.715160,1.650189,101.141277,2.913452,2.269799,1.000000,1.889937,0.378818,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
73,19.000000,1.850000,65.000000,2.000000,3.000000,3.000000,2.000000,1.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1711,28.986237,1.758618,113.501549,2.320201,3.000000,2.164784,0.000000,1.465479,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Display the training features
X_train

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,CALC_Frequently,...,SCC_yes,SMOKE_yes,family_history_with_overweight_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
162,21.000000,1.630000,60.000000,3.000000,3.000000,2.000000,2.000000,0.000000,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2001,20.924956,1.752531,133.618706,3.000000,3.000000,2.887659,1.480919,0.779641,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1435,22.899740,1.661715,82.595793,1.203754,1.355354,2.765593,0.128342,1.659476,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
649,21.837996,1.588046,44.236067,3.000000,1.696080,2.550307,1.098862,0.000000,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1280,25.994746,1.811602,106.042142,3.000000,3.000000,2.858171,1.813318,0.680215,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,32.290160,1.754956,120.098812,2.967300,3.000000,2.530035,0.955317,1.339232,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1095,23.000000,1.718981,81.669950,2.000000,1.729553,1.400247,0.887923,1.011983,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1130,22.989846,1.650000,80.000000,2.000000,3.000000,2.000000,0.146919,2.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1294,23.000000,1.628168,84.497980,2.058687,2.962004,2.010596,0.851059,0.630866,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [34]:
# List the one-hot column names the fitted encoder generates for the categorical columns
encoder.get_feature_names_out(categorical_cols)

array(['Gender_Male', 'CALC_Frequently', 'CALC_Sometimes', 'CALC_no',
       'FAVC_yes', 'SCC_yes', 'SMOKE_yes',
       'family_history_with_overweight_yes', 'CAEC_Frequently',
       'CAEC_Sometimes', 'CAEC_no', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking'], dtype=object)