In [None]:
# Install necessary libraries
!pip install xgboost
!pip installinstall imbalanced-learn
!pip install scikit-learn

# Data-handling packages used throughout the notebook
import numpy as np
import pandas as pd

# Mount Google Drive so the CSV files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

# Load the three CSV files from Drive: training set, test set and the
# sample submission
train_data = pd.read_csv("/content/drive/MyDrive/ml dataset/train (1).csv")
test_data = pd.read_csv("/content/drive/MyDrive/ml dataset/test.csv")
sample_data = pd.read_csv("/content/drive/MyDrive/ml dataset/sample_submission.csv")


from sklearn.preprocessing import LabelEncoder, StandardScaler


def _to_numeric_pair(train_col, test_col):
    """Return (train, test) versions of one feature column as numbers.

    Columns in which at least one value parses as a number are converted with
    ``pd.to_numeric(errors='coerce')``, so unparseable entries become NaN.

    Columns that hold only text labels would be turned entirely into NaN by
    that conversion and the feature would be lost. For those, every distinct
    label seen in train or test is mapped to an integer code through one
    shared mapping, so the same label gets the same code in both frames.
    Codes follow the sorted label order and carry no ordinal meaning.
    Missing entries stay NaN.
    """
    train_num = pd.to_numeric(train_col, errors='coerce')
    test_num = pd.to_numeric(test_col, errors='coerce')

    parsed_any = train_num.notna().any() or test_num.notna().any()
    has_values = train_col.notna().any() or test_col.notna().any()
    if parsed_any or not has_values:
        return train_num, test_num

    levels = sorted(pd.concat([train_col, test_col]).dropna().unique(), key=str)
    codes = {level: float(position) for position, level in enumerate(levels)}
    return train_col.map(codes), test_col.map(codes)


# check column names in the train and test data
print(train_data.columns)
print(test_data.columns)

# Derive BMI = weight / height^2 for both data sets.
# NOTE(review): the /100 assumes 'Height' is recorded in centimetres -- confirm
# against the data; if it is already in metres the division must be removed.
train_data['BMI'] = train_data['Weight'] / ((train_data['Height'] / 100) ** 2)
test_data['BMI'] = test_data['Weight'] / ((test_data['Height'] / 100) ** 2)

# separate categorical and numerical features
categorical_cols = ['Gender', 'FAVC', 'SMOKE', 'CAEC', 'MTRANS']  # Adjust based on your dataset
numerical_cols = ['Age', 'Height', 'Weight', 'CALC', 'FCVC', 'NCP', 'SCC', 'CH2O', 'family_history_with_overweight', 'FAF', 'TUE', 'BMI']

# These snapshots are taken BEFORE the label-encoding loop below, so they (and
# therefore X_train / X_test) keep the raw category values. Later cells
# one-hot encode them or cast them to the pandas 'category' dtype.
train_categorical = train_data[categorical_cols]
train_numerical = train_data[numerical_cols]

test_categorical = test_data[categorical_cols]
test_numerical = test_data[numerical_cols]


# Label-encode the categorical columns of train_data / test_data in place.
# The encoder is re-fitted for every column on the values seen in train and
# test together, so after the loop it only remembers the last column.
label_encoder = LabelEncoder()

for col in categorical_cols:
    all_values = pd.concat([train_data[col], test_data[col]]).unique()
    label_encoder.fit(all_values)  # Fit on all unique values
    train_data[col] = label_encoder.transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])


# Make every column listed in numerical_cols numeric. Text-only columns are
# integer-coded instead of being coerced to all-NaN (see _to_numeric_pair).
converted = [_to_numeric_pair(train_numerical[col], test_numerical[col]) for col in numerical_cols]
train_numerical = pd.concat([pair[0] for pair in converted], axis=1)
test_numerical = pd.concat([pair[1] for pair in converted], axis=1)

# Fill remaining gaps with the TRAINING mean in both frames, so the test set
# is imputed with the same statistics the scaler and model are fitted on.
train_means = train_numerical.mean()
train_numerical = train_numerical.fillna(train_means)
test_numerical = test_numerical.fillna(train_means)

# Feature scaling for numerical columns (statistics learned on train only)
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_numerical)
X_test_numerical = scaler.transform(test_numerical)

# Combine the scaled numerical features with the categorical snapshots
X_train = pd.concat([pd.DataFrame(X_train_numerical, columns=numerical_cols), train_categorical.reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_numerical, columns=numerical_cols), test_categorical.reset_index(drop=True)], axis=1)


# Define target variable
y_train = train_data['NObeyesdad']

from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pandas as pd  # Make sure pandas is imported

# Report how many values are still missing in the features and the target
print("Missing values in X_train:", X_train.isna().sum().sum())
print("Missing values in y_train:", y_train.isna().sum())

# Split X_train by dtype: numeric columns are imputed, all others are kept as-is
X_train_categorical = X_train.select_dtypes(exclude=np.number)
X_train_numerical = X_train.select_dtypes(include=np.number)

# Mean-impute the numeric part only
imputer = SimpleImputer(strategy='mean')
X_train_numerical_imputed = imputer.fit_transform(X_train_numerical)

# Wrap the imputed array in a DataFrame again. The column labels are taken
# from the imputer rather than from X_train_numerical, so that they match the
# columns present after imputation.
X_train_numerical_imputed_df = pd.DataFrame(
    data=X_train_numerical_imputed,
    index=X_train_numerical.index,
    columns=imputer.get_feature_names_out(),
)

# Put the non-numeric columns back next to the imputed ones and one-hot
# encode them, so that every feature handed to SMOTE is numeric
X_train_imputed = pd.get_dummies(
    pd.concat([X_train_numerical_imputed_df, X_train_categorical], axis=1),
    columns=X_train_categorical.columns,
)

# Oversample with SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)

# Shapes after resampling
print("Resampled X_train shape:", X_train_resampled.shape)
print("Resampled y_train shape:", y_train_resampled.shape)


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Encode the string class labels as integers. The encoder is fitted on the
# resampled target (as before); the original target is encoded with the same
# mapping for use in cross-validation.
label_encoder = LabelEncoder()
y_train_resampled_encoded = label_encoder.fit_transform(y_train_resampled)
y_train_original_encoded = label_encoder.transform(y_train)

# XGBoost Model and Hyperparameter Tuning
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

# SMOTE is placed inside the pipeline so that, during cross-validation, it is
# applied to each training fold only. Searching directly on the already
# resampled data lets synthetic samples derived from validation rows leak
# into the training folds and inflates the cross-validated score.
tuning_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb_model),
])

# Hyperparameter grid for the classifier
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# Same grid, addressed to the 'xgb' step of the pipeline
pipeline_param_grid = {f'xgb__{name}': values for name, values in param_grid.items()}

grid_search = GridSearchCV(estimator=tuning_pipeline, param_grid=pipeline_param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_imputed, y_train_original_encoded)  # Use encoded labels

# Best model: the pipeline refitted on all training data (SMOTE, then XGBoost)
best_xgb_model = grid_search.best_estimator_

# Cross-validated score of the selected configuration, measured on held-out
# original (non-synthetic) rows
print("Best parameters: ", grid_search.best_params_)
print("Cross-validated accuracy: ", grid_search.best_score_)

# Predict on the resampled training set (the sampler step is skipped at
# prediction time, so only the classifier runs)
y_pred_train = best_xgb_model.predict(X_train_resampled)

# Inverse transform predictions to original labels for evaluation
y_pred_train_original = label_encoder.inverse_transform(y_pred_train)

# Training-set metrics: these only show how well the model fits data it has
# already seen and are not an estimate of performance on new data
print("Training Accuracy: ", accuracy_score(y_train_resampled, y_pred_train_original))
print("Training F1-Score: ", f1_score(y_train_resampled, y_pred_train_original, average='weighted'))


from sklearn.preprocessing import OneHotEncoder

# One-hot encode train and test together so both get the same dummy columns.
# NOTE: the resulting X_train_encoded / X_test_encoded frames are not used by
# the model trained further down in this cell.
all_data = pd.concat([X_train, X_test], axis=0)

# Apply one-hot encoding to object columns
object_columns = all_data.select_dtypes(include=['object']).columns
all_data_encoded = pd.get_dummies(all_data, columns=object_columns)

# Split back into train and test
X_train_encoded = all_data_encoded.iloc[:len(X_train), :]
X_test_encoded = all_data_encoded.iloc[len(X_train):, :]

import os
print(os.listdir())  # Check if 'final_submission.csv' is listed

# Import necessary libraries
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Cast the categorical columns to a pandas categorical dtype for XGBoost.
# Train and test share ONE dtype per column, built from the values seen in
# both frames. Casting each frame separately with astype('category') gives
# each its own category list, so the same value can end up with different
# integer codes in train and test when a level is missing from one of them.
categorical_columns = ['Gender', 'FAVC', 'SMOKE', 'CAEC', 'MTRANS']  # List of categorical columns
for col in categorical_columns:
    observed_levels = (
        pd.concat([X_train[col], X_test[col]], ignore_index=True)
        .astype(object)
        .dropna()
        .unique()
    )
    shared_dtype = pd.CategoricalDtype(categories=sorted(observed_levels, key=str))
    X_train[col] = X_train[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

# Encode the target variable 'y_train' from string to numeric labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Reuse the hyperparameters selected by the grid search instead of discarding
# them. Any 'step__' prefix (present when the search ran over a pipeline) is
# stripped so the names match XGBClassifier's arguments.
tuned_params = {name.split('__')[-1]: value for name, value in grid_search.best_params_.items()}

# Train the final model with native categorical support; random_state is
# fixed so that row/column subsampling is reproducible
best_xgb_model = xgb.XGBClassifier(
    tree_method='hist',
    enable_categorical=True,
    random_state=42,
    **tuned_params,
)
best_xgb_model.fit(X_train, y_train_encoded)

# Predict using the model
test_predictions = best_xgb_model.predict(X_test)

# Convert numeric predictions back to the original string labels
test_predictions_str = label_encoder.inverse_transform(test_predictions)

# Create the submission DataFrame using the predicted values
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Using 'ID' from the test dataset
    'NObeyesdad': test_predictions_str  # Using predicted string labels
})

# Save the submission file in the correct path
submission.to_csv('/content/drive/MyDrive/ml dataset/submit.csv', index=False)

ERROR: unknown command "installinstall" - maybe you meant "install"
Mounted at /content/drive
Index(['ID', 'Age', 'Gender', 'Height', 'Weight', 'CALC', 'FAVC', 'FCVC',
       'NCP', 'SCC', 'SMOKE', 'CH2O', 'family_history_with_overweight', 'FAF',
       'TUE', 'CAEC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
Index(['ID', 'Age', 'Gender', 'Height', 'Weight', 'CALC', 'FAVC', 'FCVC',
       'NCP', 'SCC', 'SMOKE', 'CH2O', 'family_history_with_overweight', 'FAF',
       'TUE', 'CAEC', 'MTRANS'],
      dtype='object')


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Missing values in X_train: 5020
Missing values in y_train: 0
Resampled X_train shape: (2030, 24)
Resampled y_train shape: (2030,)




Training Accuracy:  1.0
Training F1-Score:  1.0
['.config', 'drive', 'sample_data']


In [None]:
# Mount Google Drive at /content/drive.
# This repeats the import and mount call already made in the first cell.
from google.colab import drive
drive.mount('/content/drive')