In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [5]:
FEATS = [
   'KnowledgeTag', 'month', 'hour', 'week', 'elapsed', 'elapsed_cate',
   'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2',
   'testId0', 'testId1', 'test0_mean', 'test0_std', 'test1_mean',
   'test1_std', 'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std',
   'ass1_mean', 'ass1_std', 'ass2_mean', 'ass2_std',
'as0_as1', 'as0_as2', 'as1_as2', 'assessmentItemID', 'week_hour']

cate = ['KnowledgeTag', 'month', 'hour', 'week', 'elapsed_cate', 'testId0', 'testId1',
       'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2',
        'as0_as1', 'as0_as2', 'as1_as2', 'assessmentItemID', 'week_hour']

df = pd.read_csv('../../data/elo.csv')

for i in cate:
    df[i] = df[i].astype('category')

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[FEATS], df['answerCode'], test_size=0.2, shuffle=True, random_state=42)

dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest  = xgb.DMatrix(X_test,  y_test, enable_categorical=True)

In [27]:
watchlist = [(dtrain, 'train'), (dtest, 'test')]
params = {
    'booster': 'gbtree', 
    'objective': 'binary:logistic',
    # 'subsample': 0.8,          # 80% of data to grow trees and prevent overfitting
    # 'colsample_bytree': 0.85,  # 85% of features used
    # 'eta': 0.1, 
    # 'max_depth': 6, 
    'seed': 42,
    'eval_metric': 'auc'}
# xgb_cv = xgb.cv(dtrain=dtrain, params=params, nfold=3,
#                     num_boost_round=500, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)

# xgb_cv.head()

xgb_model = xgb.train(params, dtrain, 100, evals = watchlist,
                      early_stopping_rounds = 50, verbose_eval = 100)

preds = xgb_model.predict(dtest)
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

VALID AUC : 0.6981963973604752 ACC : 0.7625991702164722



In [53]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
params = {
    'booster': 'gbtree', 
    'objective': 'binary:logistic',
    'seed': 42,
    'eval_metric': 'auc'}

str_kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

X, y = df[FEATS], df['answerCode']
p =[]
for i, (train_index, test_index) in tqdm(enumerate(str_kf.split(X, y))):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
    dtest  = xgb.DMatrix(X_test,  y_test, enable_categorical=True)
    
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    
    xgb_model = xgb.train(params, dtrain, 250, evals = watchlist,
                          early_stopping_rounds = 500, verbose_eval = 100)

    
    preds = xgb_model.predict(dtest)
    acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_test, preds)
    p.append(xgb_model.predict(dsub))
    print(f'VALID AUC : {auc} ACC : {acc}\n')
    
    xgb_model.save_model(f'xgboost/model/catboost_{i}.model')
    
m = (p[0] + p[1] + p[2] + p[3] + p[4])/5
s['prediction'] = m

s.to_csv('output/submission_xgb_elo.csv', index=False)

0it [00:00, ?it/s]

[0]	train-auc:0.73949	test-auc:0.73841
[100]	train-auc:0.78316	test-auc:0.77811
[200]	train-auc:0.79717	test-auc:0.78948
[249]	train-auc:0.80153	test-auc:0.79248


1it [04:03, 243.27s/it]

VALID AUC : 0.792481712024945 ACC : 0.7587550871747771

[0]	train-auc:0.73923	test-auc:0.73822
[100]	train-auc:0.78359	test-auc:0.77785
[200]	train-auc:0.79683	test-auc:0.78796
[249]	train-auc:0.80153	test-auc:0.79137


2it [07:53, 239.38s/it]

VALID AUC : 0.7913732770288868 ACC : 0.7582854801451332

[0]	train-auc:0.73895	test-auc:0.73945
[100]	train-auc:0.78282	test-auc:0.77936
[200]	train-auc:0.79637	test-auc:0.78977
[249]	train-auc:0.80141	test-auc:0.79350


3it [11:44, 236.95s/it]

VALID AUC : 0.7935000067899785 ACC : 0.7594315813227076

[0]	train-auc:0.73890	test-auc:0.73940
[100]	train-auc:0.78234	test-auc:0.77829
[200]	train-auc:0.79678	test-auc:0.78972
[249]	train-auc:0.80139	test-auc:0.79313


4it [15:32, 234.23s/it]

VALID AUC : 0.7931303472921546 ACC : 0.7591702940076129

[0]	train-auc:0.73924	test-auc:0.73905
[100]	train-auc:0.78320	test-auc:0.77918
[200]	train-auc:0.79667	test-auc:0.78982
[249]	train-auc:0.80097	test-auc:0.79291


5it [19:26, 233.23s/it]

VALID AUC : 0.7929105050250722 ACC : 0.7588892121989504






In [52]:
sub = pd.read_csv('../../data/infer.csv')

for i in cate:
    sub[i] = sub[i].astype('category')
p = []
dsub  = xgb.DMatrix(sub[FEATS], sub['answerCode'], enable_categorical=True)

for i in tqdm(range(5)):
    new_xgb_model = xgb.Booster(params)
    new_xgb_model.load_model(f'xgboost/model/catboost_{i}.model') 
    preds = new_xgb_model.predict(dsub)
    p.append(preds)
    
s = pd.read_csv('output/submission.csv')
m = (p[0] + p[1] + p[2] + p[3] + p[4])/5
s['prediction'] = m

s.to_csv('output/submission_xgb_elo.csv', index=False)

100%|██████████| 5/5 [00:00<00:00, 17.32it/s]


In [51]:
new_xgb_model = xgb.Booster()
new_xgb_model.load_model(f'xgboost/model/catboost_1.model') 
    
preds = new_xgb_model.predict(dtest)
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

ValueError: Found input variables with inconsistent numbers of samples: [505192, 744]