In [475]:
import pandas as pd

In [476]:
import matplotlib.pyplot as plt

In [477]:
import seaborn as sns

In [478]:
##READ THE GIVEN DATA 

In [479]:
# Load the dataset from Obesity.csv (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='Obesity.csv')

In [480]:
##EXAMINING DATA 

In [481]:
# Preview the first five rows (five is also the default for head()).
data.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [482]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2102 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2101 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2105 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2102 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2104 non-null   object 
 12  FAF                             21

In [483]:
## There are categorical and numerical values

In [484]:
# Count the missing entries in each column.
data.isna().sum()

Gender                             9
Age                                0
Height                            10
Weight                             0
family_history_with_overweight     6
FAVC                               0
FCVC                               0
NCP                                0
CAEC                               9
SMOKE                              0
CH2O                               0
SCC                                7
FAF                                0
TUE                                0
CALC                               0
MTRANS                             7
NObeyesdad                         0
dtype: int64

In [485]:
##There are null values 

In [486]:
##PREPROCESSING

In [487]:
##Imputing null values.

In [488]:
# Impute missing values for numerical columns with median.
# The filled column is assigned back to `data` instead of calling
# fillna(inplace=True) on data[col]: the in-place form operates on an
# intermediate object, is deprecated by pandas, and leaves `data` unchanged
# when copy-on-write is enabled.
numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

In [489]:
# Impute missing values for categorical columns with mode.
# The filled column is assigned back to `data` instead of calling
# fillna(inplace=True) on data[col]: the in-place form operates on an
# intermediate object, is deprecated by pandas, and leaves `data` unchanged
# when copy-on-write is enabled.
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for col in categorical_cols:
    col_modes = data[col].mode()
    # mode() is empty when the whole column is null; there is nothing to fill with then.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

In [490]:
# Re-count missing entries per column; every count should now be zero.
data.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [491]:
##Identifying and Handling Outliers.

In [492]:
# Quartiles and interquartile range of every numerical column.
quartiles = data[numerical_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

In [493]:
# Outlier limits per column: 1.5 * IQR below Q1 and above Q3.
iqr_margin = 1.5 * IQR
lower_bound = Q1 - iqr_margin
upper_bound = Q3 + iqr_margin

In [494]:
# Flag every row that has at least one numerical value outside its column's bounds.
numeric_values = data[numerical_cols]
below_lower = numeric_values.lt(lower_bound)
above_upper = numeric_values.gt(upper_bound)
outliers = (below_lower | above_upper).any(axis=1)

In [495]:
# Keep only the rows that were not flagged as outliers.
data_no_outliers = data.loc[~outliers]

In [496]:
# Report (rows, columns) of the filtered dataset.
shape_after = data_no_outliers.shape
print("Shape after removing outliers:", shape_after)

Shape after removing outliers: (1405, 17)


In [497]:
from sklearn.model_selection import train_test_split

In [498]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [499]:
from sklearn.compose import ColumnTransformer

In [500]:
from sklearn.pipeline import Pipeline

In [501]:
from sklearn.linear_model import LogisticRegression

In [502]:
from sklearn.ensemble import RandomForestClassifier

In [503]:
from sklearn.metrics import classification_report

In [504]:
# Split data into features and target variable

In [505]:
# Feature matrix: every column except the target 'NObeyesdad'.
X = data_no_outliers.drop(columns='NObeyesdad')

In [506]:
# Target vector: the 'NObeyesdad' column.
y = data_no_outliers.loc[:, 'NObeyesdad']

In [507]:
# Split data into train and test sets

In [508]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [509]:
# Define categorical and numerical columns

In [510]:
# Columns holding categorical (string) values; these get one-hot encoded.
categorical_cols = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]

In [511]:
# Columns holding numeric values; these get standard-scaled.
numerical_cols = [
    'Age',
    'Height',
    'Weight',
    'FCVC',
    'NCP',
    'CH2O',
    'FAF',
    'TUE',
]

In [512]:
# Define preprocessing steps: standard scaling for numerical columns and one-hot encoding for categorical columns

In [513]:
# Scale the numerical columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes the encoder emit an all-zero group for a
# category it did not see while fitting, instead of raising when the test
# split or a user-supplied row contains such a value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [514]:
# Logistic Regression pipeline: shared preprocessing followed by the classifier.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
]
logistic_regression_pipeline = Pipeline(steps=lr_steps)

In [515]:
# Random Forest pipeline: preprocessing followed by the classifier.
# A fixed random_state makes the fitted forest, and therefore the reported
# accuracy, identical on every Restart & Run All.
random_forest_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                         ('classifier', RandomForestClassifier(random_state=42))])

In [453]:
# Fit the preprocessing and the Logistic Regression classifier on the training split.
logistic_regression_pipeline.fit(X=X_train, y=y_train)

In [454]:
# Fit the preprocessing and the Random Forest classifier on the training split.
random_forest_pipeline.fit(X=X_train, y=y_train)


In [455]:
##Logistic Regression and Random Forest are well-suited for predicting weight categories based on demographic information, eating habits, and physical condition. Logistic Regression provides interpretable results, making it easy to understand the impact of each feature on the prediction. While assuming linearity, it can still capture non-linear relationships through appropriate feature engineering. On the other hand, Random Forest naturally handles non-linear relationships and complex interactions in the data, making it robust and effective. Despite being sensitive to outliers, Logistic Regression is computationally efficient and performs well with linearly separable data. In contrast, Random Forest's ability to handle complex datasets and high dimensionality makes it a go-to choice for classification tasks. Overall, the balance between interpretability, performance, and robustness makes Logistic Regression and Random Forest suitable options for this prediction task.

In [456]:
##The better model has to be selected after evaluating accuracy scores

In [457]:
from sklearn.metrics import accuracy_score

In [458]:
# Calculate accuracy for Logistic Regression model.
# The test-set predictions are computed in this cell so the score does not
# depend on a y_pred_lr variable left over from an earlier kernel session.
y_pred_lr = logistic_regression_pipeline.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.8576512455516014


In [459]:
# Calculate accuracy for Random Forest model.
# The test-set predictions are computed in this cell so the score does not
# depend on a y_pred_rf variable left over from an earlier kernel session.
y_pred_rf = random_forest_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.9572953736654805


In [460]:
##Random forest is the superior model

In [461]:
##User Input Prediction 

In [468]:
# Function to preprocess user input
def preprocess_user_input(user_input, reference=None, num_cols=None, cat_cols=None):
    """Turn one user record into a single-row DataFrame of raw feature values.

    The row is returned un-encoded: the model pipelines scale and one-hot
    encode inside their own 'preprocessor' step, so encoding here as well
    (and then reindexing to the raw column names) would overwrite every
    categorical value with 0.

    Missing values are filled from ``reference`` (column median for numerical
    columns, column mode for categorical ones). A one-row frame cannot supply
    a meaningful median or mode of its own.

    Parameters
    ----------
    user_input : dict
        Maps feature name to the value entered by the user.
    reference : pandas.DataFrame, optional
        Frame that defines the output column order and supplies the fill
        values. Defaults to the notebook-level ``X``.
    num_cols, cat_cols : list of str, optional
        Numerical / categorical column names. Default to the notebook-level
        ``numerical_cols`` / ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are ``reference.columns``, in that order.
    """
    reference = X if reference is None else reference
    num_cols = list(numerical_cols if num_cols is None else num_cols)
    cat_cols = list(categorical_cols if cat_cols is None else cat_cols)

    # Align to the reference layout; features absent from user_input become NaN.
    user_df = pd.DataFrame([user_input]).reindex(columns=reference.columns)

    # Numerical features: coerce to numbers, then fill gaps with the reference median.
    user_df[num_cols] = user_df[num_cols].apply(pd.to_numeric, errors='coerce')
    user_df[num_cols] = user_df[num_cols].fillna(reference[num_cols].median())

    # Categorical features: keep as plain objects, then fill gaps with the reference mode.
    user_df[cat_cols] = user_df[cat_cols].astype(object)
    user_df[cat_cols] = user_df[cat_cols].fillna(reference[cat_cols].mode().iloc[0])
    return user_df

In [463]:
# Pull the 'classifier' step out of each pipeline (nothing is loaded from disk).
logistic_regression_model, random_forest_model = (
    pipe.named_steps['classifier']
    for pipe in (logistic_regression_pipeline, random_forest_pipeline)
)

In [464]:
# Collect one record from the user: numbers first, then categorical answers.
def _ask_number(field):
    """Keep prompting for `field` until the reply parses as a float."""
    while True:
        reply = input(f"Enter {field}: ")
        try:
            return float(reply)
        except ValueError:
            print("Invalid input! Please enter a valid number.")


user_input = {name: _ask_number(name) for name in numerical_cols}
# Categorical answers are stored exactly as typed.
user_input.update({name: input(f"Enter {name}: ") for name in categorical_cols})
    
    

Enter Age: 25
Enter Height: 1.75
Enter Weight: 70
Enter FCVC: 2.0
Enter NCP: 3.0
Enter CH2O: 1.5
Enter FAF: 0.0
Enter TUE: 1.0
Enter Gender: Male
Enter family_history_with_overweight: yes
Enter FAVC: yes
Enter CAEC: Sometimes
Enter SMOKE: no
Enter SCC: no
Enter CALC: no
Enter MTRANS: Public_Transportation


In [469]:
# Convert the collected answers into a one-row DataFrame.
user_df_encoded = preprocess_user_input(user_input=user_input)

In [None]:
# Predict the weight category of the user's record with each fitted pipeline.
lr_prediction = logistic_regression_pipeline.predict(X=user_df_encoded)
rf_prediction = random_forest_pipeline.predict(X=user_df_encoded)

In [None]:
# Show the single predicted label from each model.
lr_label = lr_prediction[0]
rf_label = rf_prediction[0]
print("Logistic Regression Predicted Weight Category:", lr_label)
print("Random Forest Predicted Weight Category:", rf_label)