In [1]:
# Import libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
# Kaggle competition data: a training set and a test set, both indexed by `id`
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('datasets/train.csv', 'datasets/test.csv')
)

In [3]:
# One-hot encode the `color` column and separate out the non-object columns.
# Train and test are encoded independently, so the test dummies are re-aligned
# to the training columns afterwards: a colour missing from the test set becomes
# an all-zero column and a colour never seen in training is dropped. This keeps
# both feature matrices identical in column set and order.
object_train = train['color']
object_test = test['color']
num_train = train.select_dtypes(exclude='object')
num_test = test.select_dtypes(exclude='object')
one_hot_train = pd.get_dummies(object_train, dtype='int')
one_hot_test = (
    pd.get_dummies(object_test, dtype='int')
    .reindex(columns=one_hot_train.columns, fill_value=0)
)

In [4]:
# Attach the one-hot colour columns to the non-object features (joined on the index)
X_train, X_test = (
    numeric_part.join(dummy_part)
    for numeric_part, dummy_part in (
        (num_train, one_hot_train),
        (num_test, one_hot_test),
    )
)

In [5]:
# Encode the target column `type` as integer class codes for xgboost
label_encoder_train = LabelEncoder()
y_train_categorical = train.select_dtypes(include='object')['type']
label_encoder_train.fit(y_train_categorical)
label_encoded_y = label_encoder_train.transform(y_train_categorical)
# Keep the training index so the encoded target stays aligned with X_train
y_train = pd.Series(data=label_encoded_y, name='type', index=train.index)

In [6]:
# Train gradient boosting, tuning hyper-parameters with a 5-fold grid search
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [2, 4, 6, 8, 10],
    'learning_rate': [0.3, 0.5, 1.0],
}
# The encoded target has three classes (0, 1, 2), so a multi-class objective is
# declared instead of 'binary:logistic'; the seed is pinned for reproducibility.
bst = XGBClassifier(objective='multi:softprob', random_state=42)
bst_grd = GridSearchCV(bst, param_grid=params, cv=5)
bst_grd.fit(X_train, y_train)
# Predicted class codes for the test set, keyed by the test ids
preds = pd.DataFrame({'num_types': bst_grd.predict(X_test)}, index=test.index)

In [7]:
# Inspect the label-encoded target: integer class codes indexed by `id`
y_train

id
0      1
1      2
2      1
4      1
5      0
      ..
886    2
889    0
890    1
896    0
897    1
Name: type, Length: 371, dtype: int32

In [8]:
# Convert the predicted class codes back to the categorical monster type.
# The code -> name table is read from the fitted encoder rather than typed by
# hand, so it cannot drift from the encoding used for training.
monsters = dict(enumerate(label_encoder_train.classes_))
# Guarded so that re-running this cell is a no-op once `num_types` has already
# been replaced by `type` (an unguarded second run would raise KeyError).
if 'num_types' in preds.columns:
    preds = (
        preds
        .assign(type=preds['num_types'].map(monsters))
        .drop(columns='num_types')
    )

In [9]:
# Write the predictions to a CSV file, with `id` turned back into a regular column
submission = preds.reset_index()
submission.to_csv(r'predicted/predict_monsters_xgb.csv', index=False)

In [10]:
# Best hyper-parameter combination found by the grid search
bst_grd.best_params_

{'learning_rate': 0.5, 'max_depth': 6, 'n_estimators': 10}