# **Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# Load the raw dataset.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab — confirm
# the drive is mounted before this cell runs.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ObesityDataset.csv')

In [None]:
# Peek at the first five rows of the loaded frame.
print("5 Data Pertama")
first_rows = df.head(5)
display(first_rows)

In [None]:
# Column names, non-null counts and dtypes.
print("\nInformasi Dataset")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
print("\nStatistik Deskriptif")
summary_stats = df.describe(include='all')
display(summary_stats)

In [None]:
# Number of missing entries in each column.
print("\nJumlah Missing Value")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Number of rows that are exact repeats of an earlier row.
print("\nJumlah Data Duplikat")
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())

In [None]:
# Number of distinct values in each column.
print("\nUnique Value per Kolom")
# The pandas method is spelled `nunique` (all lowercase); `nUnique` raises AttributeError.
print(df.nunique())

# **Visualisasi**

In [None]:
# Bar chart of the row count per target class, ordered by value_counts().
target_col = 'NObeyesdad'
class_order = df[target_col].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(x=target_col, data=df, order=class_order)
plt.title('Distribusi Kelas Target (NObeyesdad)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# One boxplot per numeric column, laid out on a three-column grid.
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

# Derive the number of grid rows from the column count instead of hard-coding a
# 3x3 grid, which raises ValueError as soon as there are more than nine numeric
# columns. A minimum of three rows keeps the layout and figure size unchanged
# for nine or fewer columns.
n_grid_cols = 3
n_grid_rows = max(3, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

# Height scales with the row count; for three rows this is the original (15, 10).
plt.figure(figsize=(15, 10 * n_grid_rows / 3))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

# **Preprocessing Data**

In [None]:
# Re-check missing values immediately before cleaning.
missing_counts = df.isna().sum()
print("Missing values:")
print(missing_counts)

# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]``. Both fences are
    inclusive. Rows where the value is missing are dropped as well, because a
    missing value never satisfies the range check.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the usual
        1.5 * IQR rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If ``column`` is not present in ``data``.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = data[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    # between() is inclusive on both ends by default, matching `>= lower` and `<= upper`.
    return data[values.between(lower, upper)]

# Filter the columns one after another; each pass computes its quartiles on the
# rows that survived the previous pass.
outlier_columns = ['Age', 'Height', 'Weight']
for col in outlier_columns:
    df = remove_outliers_iqr(df, col)

# **Encoding Kategorikal**

In [None]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()

# Fit a separate encoder for every object-typed column and keep each of them in
# `encoders`, so any integer code can be mapped back to its original label with
# encoders[col].inverse_transform(...). Refitting one shared encoder would
# overwrite its fitted classes on every column and lose all but the last mapping.
encoders = {}
label_encoder = LabelEncoder()
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        label_encoder = LabelEncoder()
        df_encoded[col] = label_encoder.fit_transform(df_encoded[col])
        encoders[col] = label_encoder

# Keep `label_encoder` defined for any later use: it refers to the target's
# encoder when the target was encoded, otherwise to the last encoder fitted.
label_encoder = encoders.get('NObeyesdad', label_encoder)

# Separate the features from the target.
X = df_encoded.drop('NObeyesdad', axis=1)
y = df_encoded['NObeyesdad']

print("\nDistribusi kelas setelah encoding:")
print(y.value_counts())

# **Handling Imbalance dan Normalisasi**

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

smote = SMOTE(random_state=42)
scaler = MinMaxScaler()

# Oversample with SMOTE, then rescale the resampled features with MinMaxScaler.
# NOTE(review): both steps run on the full X / y, i.e. before the train/test split
# in the next cell, so rows that end up in the test set take part in the
# resampling and in the scaler's fitted min/max.
X_resampled, y_resampled = smote.fit_resample(X, y)
X_scaled = scaler.fit(X_resampled).transform(X_resampled)


In [None]:
# Split the encoded data BEFORE any resampling or scaling, so the held-out rows
# never influence SMOTE's synthetic samples or the scaler's min/max, and the
# test set contains only real observations. `stratify=y` keeps the class
# proportions the same in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample and fit the scaler on the training part only; the test part is
# only transformed. This refits the `smote` and `scaler` objects created in the
# previous cell.
X_train_balanced, y_train = smote.fit_resample(X_train_raw, y_train_raw)
X_train = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test_raw)

# Initialise the models (the random forest is seeded so reruns give the same scores).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Train each model and report its performance on the held-out test set.
for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

## **Visualisasi Model**

In [None]:
# Metric name -> callable(y_true, y_pred). The per-class metrics are averaged
# with class-support weights.
metric_fns = {
    'Accuracy': accuracy_score,
    'Precision': lambda y_true, y_hat: precision_score(y_true, y_hat, average='weighted'),
    'Recall': lambda y_true, y_hat: recall_score(y_true, y_hat, average='weighted'),
    'F1 Score': lambda y_true, y_hat: f1_score(y_true, y_hat, average='weighted'),
}

# Column-oriented accumulator: one list per output column, 'Model' first.
model_scores = {'Model': [], **{metric: [] for metric in metric_fns}}

# Score every fitted model on the test set.
for name, model in models.items():
    y_pred = model.predict(X_test)
    model_scores['Model'].append(name)
    for metric, score_fn in metric_fns.items():
        model_scores[metric].append(score_fn(y_test, y_pred))

score_df = pd.DataFrame(model_scores)


# Reshape to long form (one row per model/metric pair) for a grouped bar chart.
score_df_melted = score_df.melt(
    id_vars='Model',
    var_name='Metric',
    value_name='Score',
)

plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Score', hue='Metric', data=score_df_melted)
plt.ylim(0, 1)
plt.title('Perbandingan Performa Model')
# Place the legend outside the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

## **Hyperparameter Tuning**

In [None]:
# Search space: 3 x 3 x 2 = 18 parameter combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation, ranked by weighted F1,
# using all available CPU cores.
rf_base = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Predict on the test set with the best estimator found by the search.
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)