In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
def prepare_encoded_data():
    # Load training data
    train_df = pd.read_csv('train.csv')

    # Load test data
    test_df = pd.read_csv('test.csv')

    # Identify categorical features
    categorical_cols = train_df.select_dtypes(include='object').columns.tolist()

    # Drop target from that list (we encode it separately)
    categorical_cols.remove('NObeyesdad')

    ## Step 1: Tag the data so we can combine and split cleanly later
    train_df['source'] = 'train'
    test_df['source'] = 'test'

    # Combine both datasets
    combined = pd.concat([train_df, test_df], axis=0)
    # One-hot encode the 8 categorical features
    combined_encoded = pd.get_dummies(combined, columns=categorical_cols)

    # Separate back the encoded train and test data
    rl_train_encoded = combined_encoded[combined_encoded['source'] == 'train'].drop(['source'], axis=1)
    rl_test_encoded = combined_encoded[combined_encoded['source'] == 'test'].drop(['source'], axis=1)

    # Create and apply the encoder
    target_encoder = LabelEncoder()
    rl_train_encoded['target'] = target_encoder.fit_transform(rl_train_encoded['NObeyesdad'])
    rl_test_encoded['target'] = target_encoder.fit_transform(rl_test_encoded['NObeyesdad'])

    # Optional: drop the original text column now
    rl_train_encoded = rl_train_encoded.drop(columns=['NObeyesdad'])
    rl_test_encoded = rl_test_encoded.drop(columns=['NObeyesdad'])

    return rl_train_encoded, rl_test_encoded




In [None]:
rl_train_encoded, rl_test_encoded = prepare_encoded_data()
# Separate features and target
X = rl_train_encoded

y = rl_train_encoded['target']

# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the decision tree model
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on validation set
y_pred = clf.predict(X_val)

# Print accuracy and classification report
print('Accuracy:', accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       524
           1       1.00      1.00      1.00       626
           2       1.00      1.00      1.00       543
           3       1.00      1.00      1.00       657
           4       1.00      1.00      1.00       804
           5       1.00      1.00      1.00       484
           6       1.00      1.00      1.00       514

    accuracy                           1.00      4152
   macro avg       1.00      1.00      1.00      4152
weighted avg       1.00      1.00      1.00      4152



In [7]:
rl_train_encoded, rl_test_encoded = prepare_encoded_data()
# Separate features and target
X = rl_train_encoded

y = rl_train_encoded['target']

# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Build and train the bagged decision tree model,
bagged_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42)
bagged_clf.fit(X_train, y_train)

# Predict on validation set,
y_pred_bagged = bagged_clf.predict(X_val)

# Print accuracy and classification report,
print('Bagged Model Accuracy:', accuracy_score(y_val, y_pred_bagged))
print(classification_report(y_val, y_pred_bagged))


Bagged Model Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       524
           1       1.00      1.00      1.00       626
           2       1.00      1.00      1.00       543
           3       1.00      1.00      1.00       657
           4       1.00      1.00      1.00       804
           5       1.00      1.00      1.00       484
           6       1.00      1.00      1.00       514

    accuracy                           1.00      4152
   macro avg       1.00      1.00      1.00      4152
weighted avg       1.00      1.00      1.00      4152



In [8]:
rl_train_encoded


Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,...,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,target
0,0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473,False,...,False,False,True,False,False,False,False,True,False,6
1,1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000,True,...,False,False,False,True,True,False,False,False,False,1
2,2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,True,...,False,False,False,True,False,False,False,True,False,0
3,3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199,True,...,False,False,True,False,False,False,False,True,False,4
4,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,False,...,False,False,True,False,False,False,False,True,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,25.137087,1.766626,114.187096,2.919584,3.000000,2.151809,1.330519,0.196680,False,...,False,False,True,False,False,False,False,True,False,3
20754,20754,18.000000,1.710000,50.000000,3.000000,4.000000,1.000000,2.000000,1.000000,False,...,False,False,True,False,False,False,False,True,False,0
20755,20755,20.101026,1.819557,105.580491,2.407817,3.000000,2.000000,1.158040,1.198439,False,...,False,False,False,True,False,False,False,True,False,3
20756,20756,33.852953,1.700000,83.520113,2.671238,1.971472,2.144838,0.000000,0.973834,False,...,False,False,False,True,True,False,False,False,False,6


In [15]:

# Step 1: Load the data
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Step 2: Handle missing values if any (if necessary)
# Assuming no missing values for simplicity

# Step 3: Encode categorical variables
cat_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    combined_data[col] = label_encoders[col].fit_transform(combined_data[col])

# Step 4: Encode target variable
label_encoder_target = LabelEncoder()
combined_data['NObeyesdad'] = label_encoder_target.fit_transform(combined_data['NObeyesdad'])

# Step 5: Scale numerical features
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
scaler = StandardScaler()
combined_data[num_cols] = scaler.fit_transform(combined_data[num_cols])
train_data = combined_data[:len(train_data)]
test_data = combined_data[len(train_data):]

# Step 7: Separate features and target variable for training data
X = train_data.drop(['id', 'NObeyesdad'], axis=1)
y = train_data['NObeyesdad']

# Step 8: Choose models
models = [
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('BaggingClassifier', BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('GradientBoost', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]

# Step 9: Evaluate models using cross-validation
best_model = None
best_accuracy = 0
for name, model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    print(f"{name} Mean Accuracy: {mean_accuracy:.4f}")
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_model = model

# Step 10: Hyperparameter tuning for the best model (if applicable)
if isinstance(best_model, RandomForestClassifier) or isinstance(best_model, GradientBoostingClassifier):
    param_grid = {'n_estimators': [50, 100, 200],
                  'max_depth': [None, 5, 10],
                  'min_samples_split': [2, 5, 10]}
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    print("Best Model Hyperparameters:", best_model)

best_model.fit(X, y)

# Step 12: Make predictions on the test data
test_predictions = best_model.predict(test_data.drop(['id', 'NObeyesdad'], axis=1))

# Step 13: Prepare the submission file
submission_df = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': label_encoder_target.inverse_transform(test_predictions)})
submission_df.to_csv('submission.csv', index=False)

DecisionTree Mean Accuracy: 0.8464
BaggingClassifier Mean Accuracy: 0.8920
BaggingClassifier Mean Accuracy: 0.8920
RandomForest Mean Accuracy: 0.8976
RandomForest Mean Accuracy: 0.8976
GradientBoost Mean Accuracy: 0.9048
GradientBoost Mean Accuracy: 0.9048
XGBoost Mean Accuracy: 0.9063
XGBoost Mean Accuracy: 0.9063
