In [43]:
import zipfile
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [44]:
zf = zipfile.ZipFile("playground-series-s4e2.zip")
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [45]:
df_train['BMI'] = df_train.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_train['BMI'] = df_train['BMI'].round(2)
df_test['BMI'] = df_test.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_test['BMI'] = df_test['BMI'].round(2)
df_train = df_train.drop(['MTRANS'], axis=1)
df_test = df_test.drop(['MTRANS'], axis=1)

In [46]:
df_dummy_train = df_train.drop(['NObeyesdad'], axis = 1)
df_dummy_train = pd.get_dummies(df_dummy_train, drop_first = True, dtype = float)

y = df_train['NObeyesdad']
dummy_y = pd.get_dummies(y, dtype=float)
x = df_dummy_train.drop(['id'], axis = 1)

x_test = pd.get_dummies(df_test, drop_first = True, dtype = float)
x_test = x_test.drop(['id'], axis=1)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(x, dummy_y, test_size = 0.2, random_state= 25 )

In [48]:
model = xgb.XGBClassifier(booster='gbtree',
                    tree_method="hist",
                    objective='multi:softmax',  # 'multi:softmax' for multiclass classification
                    num_class=7)


In [49]:
model.fit(X_train, 
          y_train, 
          verbose=True,
          eval_metric='auc',
          eval_set=[(x_test, y_test)])



XGBoostError: [09:12:55] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\data.cc:501: Check failed: this->labels.Size() % this->num_row_ == 0 (1384 vs. 0) : Incorrect size for labels.

In [None]:
y_pred = model.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     32529
           1       0.75      0.55      0.63      8730

    accuracy                           0.87     41259
   macro avg       0.82      0.75      0.78     41259
weighted avg       0.86      0.87      0.86     41259



In [None]:
y_test_pred = model.predict(x_test)

y_test_pred




In [None]:
result_labels = dummy_y.columns[np.argmax(y_test_pred, axis=1)]
result_column = pd.Series(result_labels, name='Result')
result_df = result_column
result_df

In [None]:
df_submission = pd.DataFrame({
    'id': dummy_y['id'],
    'NObeyesdad': result_df
})
df_submission

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.8
2,165036,0.0
3,165037,0.2
4,165038,0.4
...,...,...
110018,275052,0.0
110019,275053,0.1
110020,275054,0.0
110021,275055,0.1


In [None]:
df_submission.to_csv('XGBoost_Submission.csv', index=False)