In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv
/kaggle/input/btl-review/reviews_dataset.xlsx


In [2]:
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")

In [3]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Combine train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], ignore_index=True)


# Step 3: Encode categorical variables
cat_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    combined_data[col] = label_encoders[col].fit_transform(combined_data[col])

# Step 4: Encode target variable
label_encoder_target = LabelEncoder()
combined_data['NObeyesdad'] = label_encoder_target.fit_transform(combined_data['NObeyesdad'])

# Step 5: Scale numerical features
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
scaler = StandardScaler()
combined_data[num_cols] = scaler.fit_transform(combined_data[num_cols])

# Step 6: Split combined data back into train and test sets
train_data = combined_data[:len(train_data)]
test_data = combined_data[len(train_data):]

# Step 7: Separate features and target variable for training data
X = train_data.drop(['id', 'NObeyesdad'], axis=1)
y = train_data['NObeyesdad']


# Step 8: Choose models
models = [
#     ('GradientBoost', GradientBoostingClassifier(random_state=42)),
#     ('XGBoost', XGBClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='MultiClass',
                                    eval_metric='Accuracy', random_seed=42))
]

# Step 9: Evaluate models using cross-validation
best_model = None
best_accuracy = 0
for name, model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    print(f"{name} Mean Accuracy: {mean_accuracy:.4f}")
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_model = model

# Step 10: Hyperparameter tuning for the best model (if applicable)
if isinstance(best_model, RandomForestClassifier) or isinstance(best_model, GradientBoostingClassifier):
    param_grid = {'n_estimators': [50, 100, 200],
                  'max_depth': [None, 5, 10],
                  'min_samples_split': [2, 5, 10]}
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    print("Best Model Hyperparameters:", best_model)

# Step 11: Train the best model
best_model.fit(X, y)

# Step 12: Make predictions on the test data
test_predictions = best_model.predict(test_data.drop(['id', 'NObeyesdad'], axis=1))


0:	learn: 0.8082621	total: 274ms	remaining: 2m 16s
1:	learn: 0.8153679	total: 418ms	remaining: 1m 44s
2:	learn: 0.8210888	total: 557ms	remaining: 1m 32s
3:	learn: 0.8263881	total: 692ms	remaining: 1m 25s
4:	learn: 0.8322895	total: 826ms	remaining: 1m 21s
5:	learn: 0.8354812	total: 961ms	remaining: 1m 19s
6:	learn: 0.8446947	total: 1.1s	remaining: 1m 17s
7:	learn: 0.8489703	total: 1.23s	remaining: 1m 15s
8:	learn: 0.8508370	total: 1.37s	remaining: 1m 14s
9:	learn: 0.8534265	total: 1.5s	remaining: 1m 13s
10:	learn: 0.8537878	total: 1.63s	remaining: 1m 12s
11:	learn: 0.8546911	total: 1.76s	remaining: 1m 11s
12:	learn: 0.8573407	total: 1.9s	remaining: 1m 11s
13:	learn: 0.8603517	total: 2.1s	remaining: 1m 12s
14:	learn: 0.8616765	total: 2.23s	remaining: 1m 12s
15:	learn: 0.8638444	total: 2.39s	remaining: 1m 12s
16:	learn: 0.8648681	total: 2.54s	remaining: 1m 12s
17:	learn: 0.8660123	total: 2.68s	remaining: 1m 11s
18:	learn: 0.8664338	total: 2.83s	remaining: 1m 11s
19:	learn: 0.8675780	total

In [4]:
# Step 13: Prepare the submission file
submission_df = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': label_encoder_target.inverse_transform(test_predictions)})
submission_df.to_csv('submission.csv', index=False)

  y = column_or_1d(y, warn=True)
