In [103]:
import pandas as pd
import numpy as np

import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import Ridge, LogisticRegression, Lasso, ElasticNet, SGDClassifier, RidgeClassifier, BayesianRidge, LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import NearestCentroid
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, StackingClassifier, StackingRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import math
import itertools
import time

In [76]:
train = pd.read_csv('pre_train_data.csv')
test = pd.read_csv('pre_test_data.csv')
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,problem_n,test_pre,test_post,year,...,interval,test_interval,count,correct,tag_count,tag_correct,rate,tag_rate,log_interval,pre_interval
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,6,1,2020,...,0,0,0,0,0,0,0.0,0.0,0.0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,2,6,1,2020,...,3,3,1,1,0,0,1.0,0.0,1.098612,3
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,1,2020,...,8,11,2,2,1,1,1.0,1.0,2.079442,8
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,4,6,1,2020,...,7,18,3,3,2,2,1.0,1.0,1.94591,7
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,5,6,1,2020,...,7,25,4,4,3,3,1.0,1.0,1.94591,7


In [77]:
data = pd.concat([train, test])
data = data[data.answerCode >= 0]
data.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,problem_n,test_pre,test_post,year,...,interval,test_interval,count,correct,tag_count,tag_correct,rate,tag_rate,log_interval,pre_interval
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,6,1,2020,...,0,0,0,0,0,0,0.0,0.0,0.0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,2,6,1,2020,...,3,3,1,1,0,0,1.0,0.0,1.098612,3
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,1,2020,...,8,11,2,2,1,1,1.0,1.0,2.079442,8
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,4,6,1,2020,...,7,18,3,3,2,2,1.0,1.0,1.94591,7
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,5,6,1,2020,...,7,25,4,4,3,3,1.0,1.0,1.94591,7


In [78]:
Item2Vec = {v:k for k, v in enumerate(sorted(data.assessmentItemID.unique()))}
test2Vec = {v:k for k, v in enumerate(sorted(data.testId.unique()))}
tag2Vec = {v:k for k, v in enumerate(sorted(data.KnowledgeTag.unique()))}

train['assessmentItemID'] = train['assessmentItemID'].apply(lambda x : Item2Vec[x])
test['assessmentItemID'] = test['assessmentItemID'].apply(lambda x : Item2Vec[x])
data['assessmentItemID'] = data['assessmentItemID'].apply(lambda x : Item2Vec[x])
train['testId'] = train['testId'].apply(lambda x : test2Vec[x])
test['testId'] = test['testId'].apply(lambda x : test2Vec[x])
data['testId'] = data['testId'].apply(lambda x : test2Vec[x])
train['KnowledgeTag'] = train['KnowledgeTag'].apply(lambda x : tag2Vec[x])
test['KnowledgeTag'] = test['KnowledgeTag'].apply(lambda x : tag2Vec[x])
data['KnowledgeTag'] = data['KnowledgeTag'].apply(lambda x : tag2Vec[x])

In [79]:
c_columns = ['assessmentItemID', 'testId', 'KnowledgeTag', 'problem_n', 'test_pre', 'test_post']
n_columns = ['pre_interval', 'correct', 'rate', 'tag_correct', 'tag_rate']
X, y = data[data.answerCode >= 0][c_columns + n_columns], data[data.answerCode >= 0]['answerCode']

In [94]:
X_train, X_test, y_train, y_test = train[c_columns+n_columns], test[test.answerCode >= 0][c_columns+n_columns], train['answerCode'], test[test.answerCode >= 0]['answerCode']

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,problem_n,test_pre,test_post,pre_interval,correct,rate,tag_correct,tag_rate
481549,6710,1177,592,1,7,17,0,86,0.860000,0,0.000000
249125,5034,924,57,5,5,143,40,23,0.450980,0,0.000000
848939,3552,679,261,1,4,96,0,41,0.546667,0,0.000000
906888,1043,206,638,1,2,15,0,11,0.733333,0,0.000000
1863010,2584,495,87,4,3,110,121,439,0.624467,2,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
2249467,290,58,583,1,1,59,0,0,0.000000,0,0.000000
963395,5661,1022,100,3,6,48,24,63,0.750000,1,0.500000
2215104,779,152,610,2,1,153,44,1,1.000000,1,1.000000
1484405,1318,257,600,6,2,66,18,263,0.659148,5,0.333333


In [95]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test), roc_auc_score(y_test, lr.predict(X_test))

(0.14952556106856285, 0.7280537543590179)

In [96]:
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
lgbm.score(X_test, y_test), roc_auc_score(y_test, lgbm.predict(X_test))

(0.24682020155172935, 0.7824081596962055)

In [97]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
lgbm.score(X_test, y_test), roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1])

(0.7591780082507614, 0.7828756706664989)

In [98]:
cat = CatBoostRegressor()
cat.fit(X_train, y_train, verbose=False)
cat.score(X_test, y_test), roc_auc_score(y_test, cat.predict(X_test))

(0.2709289391520935, 0.8005585742109068)

In [99]:
cat = CatBoostClassifier()
cat.fit(X_train, y_train, verbose=False)
cat.score(X_test, y_test), roc_auc_score(y_test, cat.predict_proba(X_test)[:,1])

(0.7709796815360296, 0.8105729856430635)

In [100]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test), roc_auc_score(y_test, xgb.predict(X_test))

(0.2708477740738572, 0.8006647721129712)

In [101]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test), roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1])

(0.7655357211705286, 0.7999765308466192)

In [10]:
gnb = GaussianNB(var_smoothing=0)
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test), roc_auc_score(y_test, gnb.predict(X_test))

(0.6799184150476097, 0.6429304780037544)

In [88]:
rg = Ridge()
rg.fit(X_train, y_train)
rg.score(X_test, y_test), roc_auc_score(y_test, rg.predict(X_test))

(0.15252618513769778, 0.729079399670045)

In [12]:
br = BayesianRidge()
br.fit(X_train, y_train)
br.score(X_test, y_test), roc_auc_score(y_test, rg.predict(X_test))

(0.1526237848455324, 0.7291395171144252)

In [13]:
ls = Lasso()
ls.fit(X_train, y_train)
ls.score(X_test, y_test), roc_auc_score(y_test, ls.predict(X_test))

(0.05087771953141085, 0.6338985135597898)

In [14]:
en = ElasticNet()
en.fit(X_train, y_train)
en.score(X_test, y_test), roc_auc_score(y_test, en.predict(X_test))

(0.05307988700906707, 0.6356061882843402)

In [15]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
lr.score(X_test, y_test), roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.6811979306052837, 0.6368405472404607)

In [16]:
bnb = BernoulliNB(alpha=0.001)
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test), roc_auc_score(y_test, bnb.predict(X_test))

(0.6818519404138472, 0.5912863225873386)

In [17]:
nc = NearestCentroid(shrink_threshold=0.1)
nc.fit(X_train, y_train)
nc.score(X_test, y_test), roc_auc_score(y_test, nc.predict(X_test))

(0.5806276910603352, 0.5772755375077397)

In [18]:
estimators = [('ridge', Ridge()), ('lgbm', LGBMRegressor()), ('cat', CatBoostRegressor(verbose=False)), ('xgb', XGBRegressor())]
final_estimator = LGBMRegressor()
reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
reg.fit(X_train, y_train)
reg.score(X_test, y_test), roc_auc_score(y_test, reg.predict(X_test))

(0.2865808387812879, 0.8080716913489948)

In [90]:
outputs = {}
outputs['rg'] = rg.predict(X_train)
outputs['lgbm'] = lgbm.predict_proba(X_train)[:, 1]
outputs['cat'] = cat.predict_proba(X_train)[:, 1]
outputs['xgb'] = xgb.predict_proba(X_train)[:, 1]
outputs_train = pd.DataFrame(outputs)

outputs = {}
outputs['rg'] = rg.predict(X_test)
outputs['lgbm'] = lgbm.predict_proba(X_test)[:, 1]
outputs['cat'] = cat.predict_proba(X_test)[:, 1]
outputs['xgb'] = xgb.predict_proba(X_test)[:, 1]

outputs_valid = pd.DataFrame(outputs)
outputs_train

Unnamed: 0,rg,lgbm,cat,xgb
0,0.714943,0.873430,0.914681,0.880984
1,0.379388,0.506843,0.546059,0.534871
2,0.589900,0.735820,0.789487,0.728360
3,0.687495,0.880355,0.912651,0.898320
4,0.756477,0.697648,0.725514,0.700752
...,...,...,...,...
1894462,0.335425,0.742430,0.655992,0.642633
1894463,0.700822,0.708474,0.773225,0.731172
1894464,1.039358,0.834533,0.903799,0.896257
1894465,0.526837,0.407854,0.249690,0.360953


In [93]:
#stack = Ridge()
stack = LGBMClassifier()
stack.fit(outputs_train, y_train)
stack.score(outputs_valid, y_test), roc_auc_score(y_test, stack.predict_proba(outputs_valid)[:, 1])

(0.7744917171953906, 0.8155966580129845)

In [23]:
def objectiveLGBM(trial: Trial, X, y, test):
    param = {
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }
    lgbm_regr = LGBMRegressor(**param)
    lgbm_regr = lgbm_regr.fit(X, y, verbose=False)
    
    score = roc_auc_score(y_test, lgbm_regr.predict(test))
    return score

In [29]:
study = optuna.create_study(direction='maximize',sampler=TPESampler())
study.optimize(lambda trial : objectiveLGBM(trial, outputs_train, y_train, outputs_valid), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

best_param2 = study.best_trial.params
lgbm = LGBMRegressor(**best_param2, objective='regression', metric='root_mean_squared_error',  learning_rate= 0.01, boosting_type='gbdt')

[32m[I 2022-11-29 05:52:44,183][0m A new study created in memory with name: no-name-003b0ac5-1c71-45ee-9caf-849a10521d84[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 05:54:14,171][0m Trial 0 finished with value: 0.8064238585009919 and parameters: {'num_leaves': 429, 'n_estimators': 1545, 'feature_fraction': 0.4371730102478605, 'min_child_samples': 36}. Best is trial 0 with value: 0.8064238585009919.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 05:55:31,298][0m Trial 1 finished with value: 0.8071461191916937 and parameters: {'num_leaves': 181, 'n_estimators': 1619, 'feature_fraction': 0.5292672517004122, 'min_child_samples': 61}. Best is trial 1 with value: 0.8071461191916937.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 05:56:09,520][0m Trial 2 finished with value: 0.8079052667373618 and parameters: {'num_leaves': 312, 'n_estimators': 731, 'feature_fraction': 0.9948232580427259, 'min_child_samples': 43}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 05:57:35,227][0m Trial 3 finished with value: 0.8075081339342849 and parameters: {'num_leaves': 311, 'n_estimators': 2028, 'feature_fraction': 0.8275050682230314, 'min_child_samples': 37}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 05:58:42,681][0m Trial 4 finished with value: 0.8070859613766564 and parameters: {'num_leaves': 243, 'n_estimators': 1340, 'feature_fraction': 0.4046094073103571, 'min_child_samples': 39}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:02:14,577][0m Trial 5 finished with value: 0.8068770365378316 and parameters: {'num_leaves': 397, 'n_estimators': 2714, 'feature_fraction': 0.6687085863118138, 'min_child_samples': 88}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:07:34,392][0m Trial 6 finished with value: 0.8059112867046684 and parameters: {'num_leaves': 512, 'n_estimators': 1751, 'feature_fraction': 0.4822971219549416, 'min_child_samples': 82}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:09:45,431][0m Trial 7 finished with value: 0.8076742348380854 and parameters: {'num_leaves': 188, 'n_estimators': 2226, 'feature_fraction': 0.6669660926888865, 'min_child_samples': 54}. Best is trial 2 with value: 0.8079052667373618.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:12:27,416][0m Trial 8 finished with value: 0.8079577875989126 and parameters: {'num_leaves': 53, 'n_estimators': 2405, 'feature_fraction': 0.7395086273379834, 'min_child_samples': 73}. Best is trial 8 with value: 0.8079577875989126.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:14:24,039][0m Trial 9 finished with value: 0.8079876412370567 and parameters: {'num_leaves': 49, 'n_estimators': 1947, 'feature_fraction': 0.8246513005323333, 'min_child_samples': 34}. Best is trial 9 with value: 0.8079876412370567.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:16:44,362][0m Trial 10 finished with value: 0.8079899793658628 and parameters: {'num_leaves': 8, 'n_estimators': 2919, 'feature_fraction': 0.9696201623070262, 'min_child_samples': 9}. Best is trial 10 with value: 0.8079899793658628.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:18:01,283][0m Trial 11 finished with value: 0.8080097628165371 and parameters: {'num_leaves': 26, 'n_estimators': 2965, 'feature_fraction': 0.9782418584630306, 'min_child_samples': 13}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:19:13,313][0m Trial 12 finished with value: 0.8079170421570014 and parameters: {'num_leaves': 5, 'n_estimators': 2975, 'feature_fraction': 0.9974637100104918, 'min_child_samples': 5}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:22:30,167][0m Trial 13 finished with value: 0.8078249919828696 and parameters: {'num_leaves': 110, 'n_estimators': 2922, 'feature_fraction': 0.8903413129062339, 'min_child_samples': 9}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:25:16,828][0m Trial 14 finished with value: 0.8078380558554413 and parameters: {'num_leaves': 110, 'n_estimators': 2568, 'feature_fraction': 0.9174999636933917, 'min_child_samples': 16}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:27:10,389][0m Trial 15 finished with value: 0.8077824669181644 and parameters: {'num_leaves': 121, 'n_estimators': 2633, 'feature_fraction': 0.9153532907751704, 'min_child_samples': 21}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:29:01,683][0m Trial 16 finished with value: 0.8079028311980674 and parameters: {'num_leaves': 7, 'n_estimators': 2989, 'feature_fraction': 0.7827436370431766, 'min_child_samples': 20}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:32:51,109][0m Trial 17 finished with value: 0.8069441279480989 and parameters: {'num_leaves': 161, 'n_estimators': 2288, 'feature_fraction': 0.5906285360911665, 'min_child_samples': 24}. Best is trial 11 with value: 0.8080097628165371.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:34:27,702][0m Trial 18 finished with value: 0.8080816466311013 and parameters: {'num_leaves': 50, 'n_estimators': 983, 'feature_fraction': 0.9488387762454793, 'min_child_samples': 27}. Best is trial 18 with value: 0.8080816466311013.[0m
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),




[32m[I 2022-11-29 06:35:34,924][0m Trial 19 finished with value: 0.8080438640296694 and parameters: {'num_leaves': 73, 'n_estimators': 1007, 'feature_fraction': 0.8537020970167365, 'min_child_samples': 26}. Best is trial 18 with value: 0.8080816466311013.[0m


Best trial: score 0.8080816466311013,
params {'num_leaves': 50, 'n_estimators': 983, 'feature_fraction': 0.9488387762454793, 'min_child_samples': 27}


In [58]:
def objectiveRidge(trial: Trial, X, y, test):
    param = {
        'alpha' : trial.suggest_categorical('alpha', [0.1 * i for i in range(51)]),
        'max_iter' : trial.suggest_int('max_iter', 1, 500),
        'tol' : trial.suggest_uniform('tol', 1e-5, 1),
        
    }
    ridge_regr = Ridge(**param)
    ridge_regr = ridge_regr.fit(X, y)
    
    score = roc_auc_score(y_test, ridge_regr.predict(test))
    return score

In [59]:
study = optuna.create_study(direction='maximize',sampler=TPESampler())
study.optimize(lambda trial : objectiveRidge(trial, X_train, y_train, X_test), n_trials=200)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

best_param2 = study.best_trial.params
rg = Ridge(**best_param2)

[32m[I 2022-11-29 07:38:48,373][0m A new study created in memory with name: no-name-8996f6d4-3cf4-4b6b-ad76-7c937a8d322b[0m
  'tol' : trial.suggest_uniform('tol', 1e-5, 1),
[32m[I 2022-11-29 07:38:49,000][0m Trial 0 finished with value: 0.7291394473233401 and parameters: {'alpha': 2.6, 'max_iter': 47, 'tol': 0.9441304104270728}. Best is trial 0 with value: 0.7291394473233401.[0m
  'tol' : trial.suggest_uniform('tol', 1e-5, 1),
[32m[I 2022-11-29 07:38:49,634][0m Trial 1 finished with value: 0.729139489993767 and parameters: {'alpha': 1.6, 'max_iter': 254, 'tol': 0.27170535398857765}. Best is trial 1 with value: 0.729139489993767.[0m
  'tol' : trial.suggest_uniform('tol', 1e-5, 1),
[32m[I 2022-11-29 07:38:50,282][0m Trial 2 finished with value: 0.7291395543984032 and parameters: {'alpha': 0.2, 'max_iter': 400, 'tol': 0.04607056732553644}. Best is trial 2 with value: 0.7291395543984032.[0m
  'tol' : trial.suggest_uniform('tol', 1e-5, 1),
[32m[I 2022-11-29 07:38:50,928][0m Tr

KeyboardInterrupt: 

In [31]:
answer = test[test.answerCode == -1][c_columns + n_columns]
answer

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,problem_n,test_pre,test_post,hour,pre_interval,correct,rate,tag_correct,tag_rate,test_interval
1035,4965,914,469,8,5,133,13,46,717,0.692754,9,0.818182,316
1706,7748,1306,781,8,7,146,2,23,465,0.694030,2,0.666667,172
3023,7484,1271,820,8,7,111,4,8,915,0.695289,2,0.333333,104
4283,9381,1526,309,6,9,64,5,75,1031,0.818904,5,1.000000,380
4670,6231,1109,183,7,6,135,11,17,293,0.759067,4,0.666667,275
...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,3692,705,883,5,4,122,2,2,7,0.304348,0,0.000000,12
260067,2590,496,589,5,3,111,9,107,7,0.500000,2,0.500000,219
260082,5353,974,865,4,5,193,2,24,7,0.500000,2,0.666667,47
260097,5353,974,865,4,5,193,13,21,2,0.142857,2,0.666667,63


In [32]:
sub = pd.read_csv('sample_submission.csv')
sub

Unnamed: 0,id,prediction
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
739,739,0.5
740,740,0.5
741,741,0.5
742,742,0.5


In [35]:
outputs = {}
outputs['rg'] = rg.predict(answer)
outputs['lgbm'] = lgbm.predict(answer)
outputs['cat'] = cat.predict(answer)
outputs['xgb'] = xgb.predict(answer)
outputs_answer = pd.DataFrame(outputs)
outputs_answer

Unnamed: 0,rg,lgbm,cat,xgb
0,0.574071,0.641527,0.677954,0.679700
1,0.490252,0.614373,0.618966,0.568996
2,0.404787,0.295778,0.298877,0.252923
3,0.640263,0.857448,0.922047,0.968080
4,0.585621,0.499192,0.470188,0.434486
...,...,...,...,...
739,0.296518,-0.009824,0.014658,-0.025801
740,0.550769,0.508691,0.513813,0.540107
741,0.604267,0.743842,0.796857,0.660498
742,0.434749,0.667337,0.746709,0.671633


In [36]:
sub['prediction'] = stack.predict(outputs_answer)
sub

Unnamed: 0,id,prediction
0,0,0.742766
1,1,0.596325
2,2,0.258098
3,3,0.957751
4,4,0.352142
...,...,...
739,739,0.024747
740,740,0.522488
741,741,0.702807
742,742,0.807155


In [37]:
sub.to_csv('stack.csv', index=False)

In [139]:
train = pd.read_csv('train_feature_engineering.csv')
test = pd.read_csv('test_feature_engineering.csv')
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,assessmentItemID_last,testId_first,testId_last,elapsed,...,prior_ac_count,prior_quest_count,prior_ac_accuracy,prior_relative_ac_sum,prior_relative_accuracy,prior_assessment_frequency,prior_test_frequency,prior_tags_frequency,diff_time_btw_tags,prev_tag_answer
0,0,0,9454,1.0,2020-03-24 00:17:11,10991,11903,11916,11925,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.086643,0.0
1,0,1,9454,1.0,2020-03-24 00:17:14,10992,11904,11916,11925,1.386294,...,0.693147,0.693147,1.0,0.016,0.016,0.0,0.693147,0.0,17.086643,0.0
2,0,2,9454,1.0,2020-03-24 00:17:22,10992,11905,11916,11925,2.197225,...,1.098612,1.098612,1.0,0.048,0.024,0.0,1.098612,0.693147,2.197225,1.0
3,0,3,9454,1.0,2020-03-24 00:17:29,10992,11906,11916,11925,2.079442,...,1.386294,1.386294,1.0,0.132,0.044,0.0,1.386294,1.098612,2.079442,1.0
4,0,4,9454,1.0,2020-03-24 00:17:36,10992,11907,11916,11925,2.079442,...,1.609438,1.609438,1.0,0.16,0.04,0.0,1.609438,1.386294,2.079442,1.0


In [150]:
cs = train.columns
cs = cs.drop('answerCode')
cs = cs.drop('Timestamp')
cs = cs.drop('relative_answered_correctly')
data_train, data_test, data_y_train, data_y_test = train[cs], test[test.answerCode >= 0][cs], train['answerCode'], test[test.answerCode >= 0]['answerCode']

In [149]:
cs

Index(['userID', 'assessmentItemID', 'testId', 'KnowledgeTag',
       'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed',
       'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag',
       'accuracy_by_assessment_last', 'accuracy_by_test_first',
       'accuracy_by_test_last', 'relative_answered_correctly',
       'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy',
       'prior_relative_ac_sum', 'prior_relative_accuracy',
       'prior_assessment_frequency', 'prior_test_frequency',
       'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer'],
      dtype='object')

In [141]:
data = pd.concat([train, test])
data, answer = data[data.answerCode >= 0], data[data.answerCode == -1]
data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,assessmentItemID_last,testId_first,testId_last,elapsed,...,prior_ac_count,prior_quest_count,prior_ac_accuracy,prior_relative_ac_sum,prior_relative_accuracy,prior_assessment_frequency,prior_test_frequency,prior_tags_frequency,diff_time_btw_tags,prev_tag_answer
0,0,0,9454,1.0,2020-03-24 00:17:11,10991,11903,11916,11925,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,17.086643,0.0
1,0,1,9454,1.0,2020-03-24 00:17:14,10992,11904,11916,11925,1.386294,...,0.693147,0.693147,1.000000,0.016000,0.016000,0.0,0.693147,0.000000,17.086643,0.0
2,0,2,9454,1.0,2020-03-24 00:17:22,10992,11905,11916,11925,2.197225,...,1.098612,1.098612,1.000000,0.048000,0.024000,0.0,1.098612,0.693147,2.197225,1.0
3,0,3,9454,1.0,2020-03-24 00:17:29,10992,11906,11916,11925,2.079442,...,1.386294,1.386294,1.000000,0.132000,0.044000,0.0,1.386294,1.098612,2.079442,1.0
4,0,4,9454,1.0,2020-03-24 00:17:36,10992,11907,11916,11925,2.079442,...,1.609438,1.609438,1.000000,0.160000,0.040000,0.0,1.609438,1.386294,2.079442,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260108,7439,1396,9671,1.0,2020-08-21 07:39:45,11289,11909,11919,12064,1.791759,...,2.079442,2.397895,0.700000,-0.833333,-0.083333,0.0,1.791759,1.791759,1.791759,0.0
260109,7439,5952,10408,0.0,2020-10-14 23:07:23,11281,11903,11919,12098,0.000000,...,2.197225,2.484907,0.727273,-0.786353,-0.071487,0.0,0.000000,0.000000,17.086643,0.0
260110,7439,5953,10408,1.0,2020-10-14 23:07:41,11281,11904,11919,12098,2.944439,...,2.197225,2.564949,0.666667,-1.223020,-0.101918,0.0,0.693147,0.693147,2.944439,0.0
260111,7439,5954,10408,1.0,2020-10-14 23:08:02,11283,11905,11919,12098,3.091042,...,2.302585,2.639057,0.692308,-0.706353,-0.054335,0.0,1.098612,0.000000,17.086643,0.0


In [151]:
cs = data.columns
cs = cs.drop('answerCode')
cs = cs.drop('Timestamp')
cs = cs.drop('relative_answered_correctly')
data_train, data_test, data_y_train, data_y_test = train_test_split(data[cs], data['answerCode'], random_state=0)

In [160]:
lr_best_auc = 0
lr_best_col = []
temp_col = cs
tolerance = 0
for i in range(len(cs), 1, -1) :
    cols = temp_col
    best_temp_auc = 0
    for item in itertools.combinations(cols, i) :
        item = list(item)
        lr = LinearRegression()
        lr.fit(data_train[item], data_y_train)
        auc = roc_auc_score(data_y_test, lr.predict(data_test[item]))
        if best_temp_auc < auc :
            best_temp_auc = auc
            temp_col = item
            
    if best_temp_auc >= lr_best_auc :
        lr_best_auc = best_temp_auc
        lr_best_col = temp_col[:]
        tolerance = 0
    else :
        tolerance += 1
        if tolerance == 10 :
            break

    print(f"len:{i}, best_temp_auc:{best_temp_auc}, cols:{temp_col}")
            
lr_best_auc, lr_best_col

len:24, best_temp_auc:0.8173218272356843, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_assessment_last', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_assessment_frequency', 'prior_test_frequency', 'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer']
len:23, best_temp_auc:0.8173298429363124, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_assessment_last', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_assessment_fre

KeyboardInterrupt: 

In [None]:
rg_best_auc = 0
rg_best_col = []
temp_col = cs
tolerance = 0
for i in range(len(cs), 1, -1) :
    cols = temp_col
    best_temp_auc = 0
    for item in itertools.combinations(cols, i) :
        item = list(item)
        rg = Ridge()
        rg.fit(data_train[item], data_y_train)
        auc = roc_auc_score(data_y_test, rg.predict(data_test[item]))
        if best_temp_auc < auc :
            best_temp_auc = auc
            temp_col = item
            
    if best_temp_auc >= rg_best_auc :
        rg_best_auc = best_temp_auc
        rg_best_col = temp_col[:]
        tolerance = 0
    else :
        tolerance += 1
        if tolerance == 10 :
            break

    print(f"len:{i}, best_temp_auc:{best_temp_auc}, cols:{temp_col}")
            
rg_best_auc, rg_best_col

In [161]:
lgbm_best_auc = 0
lgbm_best_col = []
item = list(cs)
tolerance = 0
for i in range(len(cs)-1) :
    lgbm = LGBMClassifier(boosting_type='gbdt', metric='auc')
    lgbm.fit(data_train[item], data_y_train)
    auc = roc_auc_score(data_y_test, lgbm.predict_proba(data_test[item])[:, 1])
    if auc >= lgbm_best_auc :
        lgbm_best_auc = auc
        lgbm_best_col = item.copy()
        tolerance = 0
    else :
        tolerance += 1
        if tolerance == 10 :
            break
            
    print(f"count:{i}, auc:{auc}, cols:{item}")
    
    item.pop(np.argmin(lgbm.feature_importances_))
            
lgbm_best_auc, lgbm_best_col

count:0, auc:0.8374380101662094, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_assessment_last', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_assessment_frequency', 'prior_test_frequency', 'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer']
count:1, auc:0.8374330902216689, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_assessment_frequency', 'prior_test_frequency', 'prior_tags_freq

(0, [])

In [163]:
cat_best_auc = 0
cat_best_col = []
item = list(cs)
tolerance = 0
for i in range(len(cs)-1) :
    cat = CatBoostClassifier(task_type="GPU", eval_metric='AUC')
    cat.fit(data_train[item], data_y_train, verbose=False)
    auc = roc_auc_score(data_y_test, cat.predict_proba(data_test[item])[:, 1])
    if auc >= cat_best_auc :
        cat_best_auc = auc
        cat_best_col = item[:]
        tolerance = 0
    else :
        tolerance += 1
        if tolerance == 10 :
            break
            
    print(f"count:{i}, auc:{auc}, cols:{item}")
    
    item.pop(np.argmin(cat.feature_importances_))
            
cat_best_auc, cat_best_col

Default metric period is 5 because AUC is/are not implemented for GPU


count:0, auc:0.8374846455678355, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_assessment_last', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_assessment_frequency', 'prior_test_frequency', 'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer']


Default metric period is 5 because AUC is/are not implemented for GPU


count:1, auc:0.8375205280108301, cols:['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test', 'accuracy_by_tag', 'accuracy_by_assessment_last', 'accuracy_by_test_first', 'accuracy_by_test_last', 'prior_ac_count', 'prior_quest_count', 'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy', 'prior_test_frequency', 'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer']


Default metric period is 5 because AUC is/are not implemented for GPU


KeyboardInterrupt: 

In [None]:
xgb_best_auc = 0
xgb_best_col = []
item = list(cs)
tolerance = 0
for i in range(len(cs)-1) :
    xgb = XGBClassifier(tree_method='gpu_hist', gpu_id=0, 'eval_metric'='auc')
    xgb.fit(data_train[item], data_y_train, verbose=False)
    auc = roc_auc_score(data_y_test, xgb.predict_proba(data_test[item])[:, 1])
    if auc >= xgb_best_auc :
        xgb_best_auc = auc
        xgb_best_col = item[:]
        tolerance = 0
    else :
        tolerance += 1
        if tolerance == 10 :
            break
            
    print(f"count:{i}, auc:{auc}, cols:{item}")
    
    item.pop(np.argmin(xgb.feature_importances_))
            
xgb_best_auc, xgb_best_col