In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
sys.path.append(r'../')
from feature_engineering import load_xgb_data
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import roc_auc_score, average_precision_score
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier


In [4]:
# Load the modelling table through the project helper imported from
# `feature_engineering`. Later cells rely on it containing an `answerCode`
# column (negative values mark the rows to be predicted).
xgb_data = load_xgb_data()

In [5]:
def split_train_test(data):
    """Split `data` into train/validation features and targets plus the test rows.

    Rows with ``answerCode >= 0`` are treated as labelled and divided 80/20
    (shuffled, ``random_state=42``) into training and validation parts, with
    ``answerCode`` as the target. Rows with ``answerCode < 0`` are returned
    unchanged as the test frame (the ``answerCode`` column is NOT removed).

    Returns:
        (x_train, x_valid, y_train, y_valid, test)
    """
    labelled_mask = data["answerCode"] >= 0
    unlabelled_mask = data["answerCode"] < 0

    labelled = data[labelled_mask]
    held_out = data[unlabelled_mask]

    features = labelled.drop(columns="answerCode")
    target = labelled["answerCode"]

    parts = train_test_split(
        features,
        target,
        test_size=0.2,
        shuffle=True,
        random_state=42,
    )
    return (*parts, held_out)


x_train, x_valid, y_train, y_valid, test = split_train_test(xgb_data)

In [6]:
# Preview the training features before preprocessing (identifier and
# timestamp columns are still present at this point).
x_train.head()

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,userAnswerRate,userSolvedLen,testAnswerRate,testSolvedLen,tagAnswerRate,...,second,first3,timeConcentrationRate,timeConcentrationCount,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate,monthSolvedCount
904324,1222,A070052010,A070000052,2020-07-07 00:21:05,6785,0.699301,429,0.558844,2940,0.455713,...,5,70,0.650389,110005,2,138.0,200,0.69,0.665516,297273
1928204,3251,A020181001,A020000181,2020-08-12 08:40:55,8135,0.792812,473,0.637486,1782,0.654036,...,55,20,0.668766,172419,2,273.0,341,0.800587,0.656892,347104
1824231,2970,A020091005,A020000091,2020-07-16 11:04:52,7941,0.541667,480,0.791275,1490,0.800199,...,52,20,0.709353,164041,2,101.0,148,0.682432,0.665516,297273
1946186,3304,A090014002,A090000014,2020-05-22 04:33:27,9728,0.313131,297,0.441015,2916,0.427994,...,27,90,0.633326,163186,1,34.0,102,0.333333,0.642794,199003
1783086,2866,A070055009,A070000055,2020-07-01 03:56:08,1262,0.358251,709,0.471774,2232,0.542545,...,8,70,0.633099,155993,1,35.0,107,0.327103,0.665516,297273


In [7]:
# Identifier / timestamp columns that `xgb_preprocessing` removes before the
# frames are cast to float and passed to the model.
DROPS = ["userID","assessmentItemID","testId","Timestamp"]

# XGB preprocessing

In [8]:
def xgb_preprocessing(data, drops=None):
    """Return a float-typed copy of `data` without the non-feature columns.

    Args:
        data: DataFrame of raw features.
        drops: Column names to remove. Defaults to the notebook-level `DROPS`
            list when omitted.

    Returns:
        A new DataFrame with the `drops` columns removed and every remaining
        column cast to ``float``. The input frame is not modified.

    Columns listed in `drops` that are already absent are skipped, so the cell
    that reassigns ``x_train = xgb_preprocessing(x_train)`` can be re-executed
    without raising ``KeyError``.
    """
    if drops is None:
        drops = DROPS
    features = data.drop(columns=list(drops), errors="ignore")
    # One frame-wide cast instead of assigning column by column.
    return features.astype(float)
# Run the same preprocessing over the train, validation and test frames.
x_train, x_valid, test = (
    xgb_preprocessing(frame) for frame in (x_train, x_valid, test)
)

In [9]:
# Preview after preprocessing: the DROPS columns are gone and the remaining
# columns have been cast to float.
x_train.head()

Unnamed: 0,KnowledgeTag,userAnswerRate,userSolvedLen,testAnswerRate,testSolvedLen,tagAnswerRate,tagSolvedLen,year,month,day,...,second,first3,timeConcentrationRate,timeConcentrationCount,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate,monthSolvedCount
904324,6785.0,0.699301,429.0,0.558844,2940.0,0.455713,2958.0,2020.0,7.0,7.0,...,5.0,70.0,0.650389,110005.0,2.0,138.0,200.0,0.69,0.665516,297273.0
1928204,8135.0,0.792812,473.0,0.637486,1782.0,0.654036,3642.0,2020.0,8.0,12.0,...,55.0,20.0,0.668766,172419.0,2.0,273.0,341.0,0.800587,0.656892,347104.0
1824231,7941.0,0.541667,480.0,0.791275,1490.0,0.800199,5035.0,2020.0,7.0,16.0,...,52.0,20.0,0.709353,164041.0,2.0,101.0,148.0,0.682432,0.665516,297273.0
1946186,9728.0,0.313131,297.0,0.441015,2916.0,0.427994,3465.0,2020.0,5.0,22.0,...,27.0,90.0,0.633326,163186.0,1.0,34.0,102.0,0.333333,0.642794,199003.0
1783086,1262.0,0.358251,709.0,0.471774,2232.0,0.542545,2468.0,2020.0,7.0,1.0,...,8.0,70.0,0.633099,155993.0,1.0,35.0,107.0,0.327103,0.665516,297273.0


In [10]:
# The objective is `binary:logistic` and the search is scored with ROC AUC, so
# the estimator must be a classifier: the `roc_auc` scorer ranks with
# `predict_proba` / `decision_function`, which XGBRegressor does not provide.
model = XGBClassifier()

# Single-valued entries are fixed settings; only max_depth, min_child_weight,
# gamma, colsample_bytree and n_estimators are actually searched
# (3 * 3 * 3 * 2 * 3 = 162 combinations, i.e. 810 fits with 5 folds).
param_grid = {
    'booster': ['gbtree'],
    'verbosity': [0],
    'max_depth': [5, 6, 8],
    'min_child_weight': [1, 3, 5],
    'gamma': [1, 2, 3],
    'nthread': [4],
    'colsample_bytree': [0.5, 0.8],
    'colsample_bylevel': [0.9],
    'n_estimators': [100, 200, 300],
    'objective': ['binary:logistic'],
    'random_state': [42],
}

# Shuffled 5-fold CV with a fixed seed so the search is reproducible.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

gcv = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
)

In [11]:

# Run the full grid search on the training split, then report the winning
# parameter set and its mean cross-validated score.
gcv.fit(x_train, y_train)
for label, value in (
    ('final params', gcv.best_params_),
    ('best score', gcv.best_score_),
):
    print(label, value)

KeyboardInterrupt: 

In [None]:
# Best estimator found by the grid search (refit on the full training split).
model = gcv.best_estimator_

# `test` still carries the `answerCode` column (split_train_test only removes
# it from the training features), so drop it here to match the columns the
# model was fitted on. `errors="ignore"` keeps the cell re-runnable.
test_features = test.drop(columns=["answerCode"], errors="ignore")

# Prefer the positive-class probability when the estimator is a classifier;
# fall back to `predict` for estimators without `predict_proba`.
if hasattr(model, "predict_proba"):
    test_pred = model.predict_proba(test_features)[:, 1]
else:
    test_pred = model.predict(test_features)
test_pred