In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Obesity Dataset Classification and PCA Analysis
In this project, we will perform classification on an obesity dataset using three different classification algorithms. We will also apply PCA (Principal Component Analysis) to reduce the dimensionality of the data and visualize the results.

---

# Import Libraries
We will start by importing the necessary libraries.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt


# Load and Preprocess the Data
Next, we will load the dataset and perform preprocessing. We will encode the target variable and convert categorical variables into numerical values using one-hot encoding.


In [None]:
# Load the dataset
data = pd.read_csv("/kaggle/input/obesity-prediction/Obesity prediction.csv")

# Encode the target column (Obesity) into integer class labels.
# The fitted encoder is kept so the integers can be mapped back to class names later.
label_encoder = LabelEncoder()
data["Obesity_encoded"] = label_encoder.fit_transform(data["Obesity"])

# One-hot encode the categorical predictors.
# drop_first=True drops the first level of each column to avoid redundant dummy columns.
categorical_columns = ["Gender", "family_history", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Split data into features (X) and target (y); both target representations are removed from X
X = data.drop(columns=["Obesity", "Obesity_encoded"])
y = data["Obesity_encoded"]

# Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions the same in the training and test sets, so the
# per-class metrics reported later are not distorted by an unlucky random split.
# NOTE: stratification requires every class to have at least two rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Display the shapes of training and test sets
X_train.shape, X_test.shape


# Scale the Features
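We will standardize the features using `StandardScaler`. The scaler is fit on the training set only, so the training features end up with a mean of 0 and a standard deviation of 1; the same transformation is then applied to the test set, which keeps test-set statistics from leaking into the model.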


In [None]:
# Learn the per-feature mean and standard deviation from the training set only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Logistic Regression Classifier
We will train a Logistic Regression model and evaluate its performance using the classification report.


In [None]:
# Fit logistic regression on the standardized training features (fit() returns the estimator)
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Score the held-out set and report per-class metrics using the original class names
y_pred_logistic = logistic_model.predict(X_test_scaled)
print("Logistic Regression Classification Report (scaled data):")
logistic_report = classification_report(
    y_test,
    y_pred_logistic,
    target_names=label_encoder.classes_,
)
print(logistic_report)


# Random Forest Classifier
Now, let's train a Random Forest model and evaluate its performance.


In [None]:
# Fit a seeded random forest on the standardized training features (fit() returns the estimator)
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Score the held-out set and report per-class metrics using the original class names
y_pred_rf = rf_model.predict(X_test_scaled)
print("Random Forest Classification Report:")
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=label_encoder.classes_,
)
print(rf_report)


# Support Vector Machine (SVM) Classifier
Next, we will train a Support Vector Machine (SVM) model and evaluate its performance.


In [None]:
# Fit a support vector classifier on the standardized training features (fit() returns the estimator)
svm_model = SVC(random_state=42).fit(X_train_scaled, y_train)

# Score the held-out set and report per-class metrics using the original class names
y_pred_svm = svm_model.predict(X_test_scaled)
print("Support Vector Machine (SVM) Classification Report:")
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=label_encoder.classes_,
)
print(svm_report)


# Apply PCA (Principal Component Analysis)
Now, let's apply PCA to reduce the dimensionality of the data to 2 components. We will then train a Logistic Regression model using the reduced data.


In [None]:
# Apply PCA for dimensionality reduction: project the standardized features onto 2 components.
# random_state pins the result on scikit-learn versions where the default solver falls back to
# randomized SVD, so the projection (and the scores below) are reproducible across runs.
pca = PCA(n_components=2, random_state=42)
# Fit the projection on the training set only, then reuse it for the test set
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Train Logistic Regression model on PCA-transformed data
logistic_model_pca = LogisticRegression(max_iter=10000, random_state=42)
logistic_model_pca.fit(X_train_pca, y_train)

# Predict and evaluate
y_pred_pca = logistic_model_pca.predict(X_test_pca)
print("Logistic Regression (PCA) Classification Report:")
print(classification_report(y_test, y_pred_pca, target_names=label_encoder.classes_))


# Visualize PCA Components
Let's visualize the 2 PCA components to understand the distribution of different classes in the reduced feature space.


In [None]:
# Scatter the training points in the 2-component PCA space, one colour per obesity class
fig, ax = plt.subplots(figsize=(8, 6))
for label in np.unique(y_train):
    # Boolean mask of the training rows that belong to this class
    class_mask = (y_train == label).to_numpy()
    class_points = X_train_pca[class_mask]
    # Map the integer label back to its original class name for the legend
    class_name = label_encoder.inverse_transform([label])[0]
    ax.scatter(class_points[:, 0], class_points[:, 1], label=class_name)
ax.set_title("PCA Components Scatter Plot")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend()
plt.show()
