In [49]:
import zipfile
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [50]:
zf = zipfile.ZipFile("playground-series-s4e2.zip")
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [51]:
df_train['BMI'] = df_train.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_train['BMI'] = df_train['BMI'].round(2)
df_test['BMI'] = df_test.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_test['BMI'] = df_test['BMI'].round(2)
#df_train = df_train.drop(['MTRANS'], axis=1)
#df_test = df_test.drop(['MTRANS'], axis=1)

In [None]:
df_dummy_train = df_train.drop(['NObeyesdad'], axis = 1)
df_dummy_train = pd.get_dummies(df_dummy_train, drop_first = True, dtype = float)

y = df_train['NObeyesdad']
dummy_y = pd.get_dummies(y, dtype=float)
x = df_dummy_train.drop(['id'], axis = 1)



x_test = pd.get_dummies(df_test, drop_first = True, dtype = float)
x_test = x_test.drop(['id'], axis=1)

print(dummy_y)
print(x)

In [54]:

dummy_y_one_column = pd.DataFrame({'NObeyesdad': np.argmax(dummy_y.values, axis=1)})

X_train, X_test, y_train, y_test = train_test_split(x, dummy_y_one_column, stratify=y, random_state= 25 )
print(dummy_y_one_column)


[6 1 0 ... 3 6 3]


In [55]:
# Print the shape of training data and number of unique labels
print(X_train.shape)
print(len(np.unique(y_train)))

# Print the shape of testing data and number of unique labels
print(x_test.shape)
print(len(np.unique(y_test)))


(15568, 23)
7
(13840, 24)
7


In [56]:
model = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    colsample_bytree=0.8,
    subsample=0.8,
    objective= 'multi:softprob',
    nthread=-1,
    random_state=42
)

In [None]:
model.fit(X_train, 
          y_train, 
          eval_set=[(x_test, y_test)])

In [None]:
y_pred = model.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     32529
           1       0.75      0.55      0.63      8730

    accuracy                           0.87     41259
   macro avg       0.82      0.75      0.78     41259
weighted avg       0.86      0.87      0.86     41259



In [None]:
y_test_pred = model.predict(x_test)

y_test_pred




In [None]:
result_labels = dummy_y.columns[np.argmax(y_test_pred, axis=1)]
result_column = pd.Series(result_labels, name='Result')
result_df = result_column
result_df

In [None]:
df_submission = pd.DataFrame({
    'id': dummy_y['id'],
    'NObeyesdad': result_df
})
df_submission

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.8
2,165036,0.0
3,165037,0.2
4,165038,0.4
...,...,...
110018,275052,0.0
110019,275053,0.1
110020,275054,0.0
110021,275055,0.1


In [None]:
df_submission.to_csv('XGBoost_Submission.csv', index=False)