In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [2]:
pocket_1221 = pd.read_csv("./pred_1221.csv")
pocket_1223 = pd.read_csv("./pred_1223.csv")
owruby = pd.read_csv("./owruby_v022.csv")
sakami = pd.read_csv("./sakami-1221.csv")

In [3]:
print(pocket_1221.head(2))
print(pocket_1223.head(2))
print(owruby.head(2))
print(sakami.head(2))

          0
0  0.899222
1  0.875296
          0
0  0.882067
1  0.861982
     row_id  answered_correctly  label
0  70629059            0.851218      1
1  70629060            0.832965      1
   Unnamed: 0  row_id  answered_correctly  prediction
0          96      96                   1    0.297885
1          97      97                   0    0.318944


In [4]:
merged = owruby.copy()
merged["pocket_1221"] = pocket_1221
merged["pocket_1223"] = pocket_1223
merged["owruby"] = merged["answered_correctly"]
merged.drop(columns="answered_correctly", inplace=True)

In [5]:
merged2 = pd.merge(merged, sakami, on="row_id", how="inner")

In [6]:
print(merged.shape, merged2.shape)

(5000000, 5) (5000000, 8)


In [7]:
merged2["sakami"] = merged2["prediction"]
merged2.drop(columns="prediction", inplace=True)

In [8]:
merged2.head(2)

Unnamed: 0.1,row_id,label,pocket_1221,pocket_1223,owruby,Unnamed: 0,answered_correctly,sakami
0,70629059,1,0.899222,0.882067,0.851218,70629059,1,0.900553
1,70629060,1,0.875296,0.861982,0.832965,70629060,1,0.865924


In [9]:
pred_col = ["pocket_1221", "pocket_1223", "owruby", "sakami"]

In [10]:
valid_feats = pd.read_feather("../valid_1223.feather")

In [11]:
valid_feats.shape

(5000000, 76)

In [12]:
merged = pd.concat([merged2, valid_feats], axis=1)

In [13]:
merged.head()

Unnamed: 0.1,row_id,label,pocket_1221,pocket_1223,owruby,Unnamed: 0,answered_correctly,sakami,index,ac,...,u_td_p2p3,u_td_p3p4,u_td_p4p5,u_td_p5p6,u_td_p6p7,u_td_p7p8,u_td_p8p9,u_td_p9p10,ut_ac_mean,ut_ac_mean2
0,70629059,1,0.899222,0.882067,0.851218,70629059,1,0.900553,96130348,1.0,...,325317.0,454170.0,2732487.0,143467.0,190046.0,182226.0,190944.0,110019856.0,0.705882,0.688819
1,70629060,1,0.875296,0.861982,0.832965,70629060,1,0.865924,96130349,1.0,...,820188.0,325317.0,454170.0,2732487.0,143467.0,190046.0,182226.0,190944.0,0.737113,0.711192
2,70629061,0,0.765987,0.749301,0.835464,70629061,0,0.779606,96130350,0.0,...,820188.0,325317.0,454170.0,2732487.0,143467.0,190046.0,182226.0,190944.0,0.705882,0.688819
3,59577091,1,0.831968,0.776635,0.77514,59577091,1,0.575242,96130351,1.0,...,2243661.0,657769.0,720899.0,597984.0,415187.0,527912.0,407554.0,291635.0,0.647059,0.645833
4,59577092,1,0.924458,0.946935,0.970708,59577092,1,0.845466,96130352,1.0,...,56203.0,2243661.0,657769.0,720899.0,597984.0,415187.0,527912.0,407554.0,0.75,0.8125


In [14]:
class SingleLgb:
    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param()
        if dry_run:
            self.num_rounds = 100
        else:
            self.num_rounds = 500

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # print('Start training...')
        model = lgb.train(self.train_param,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          verbose_eval=100,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=100,
                          #categorical_feature=[]
                         )
        # print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        #print(fi)
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            print(fi)

    @staticmethod
    def get_param():
        return {
            'num_leaves': 127,
            'min_data_in_leaf': 50,
            'objective': 'binary',
            #'metric': 'auc',
            'metric': 'binary_logloss',
            'max_depth': -1,
            'learning_rate': 0.02,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": 81,
        }
    
class SingleTrainer:
    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        self.val_size = 2*1000*1000

    def train_model(self, df):
        X = df[self.pred_col]
        y = df[self.target_col]
        
        models, scores = list(), list()
        for fold in range(4):
            print("---------")
            print("fold=", fold)
            f, c = fold, self.val_size
            val_s, val_e = -c-f*c, len(df)-f*c
            train_idx = -c-f*c
            X_train, X_val = X.iloc[:train_idx], X.iloc[val_s:val_e]
            y_train, y_val = y.iloc[:train_idx], y.iloc[val_s:val_e]
            print(X_train.shape, X_val.shape)
            
            lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["binary_logloss"]
            pred = model.predict(X_val)
            score = metrics.roc_auc_score(y_val, pred)
            print("AUC=", score)
            if fold == 0:
                lgbm.show_feature_importance(model)
            models.append(model)
            scores.append(score)
            break
        return models, np.mean(scores)

In [15]:
pred_col = [
    "et",  "q_ac_cnt",  'b_ac_mean', 'b_ac_cnt', "q_ac_mean",
    "u_ac_mean", "u_cnt", "u_qm_mean", #"u_td", #"uc_td",
    "u_ac_cnt", #"u_et_cnt"
    'u_ok_qm_mean', 'u_ng_qm_mean',
    'q_et_mean', 'q_et_cnt', 'q_et_std',
   "up_ac_cnt", "up_ac_mean",
    "u_td", 
    "q_ng_uac_mean","q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_std",
    "correct_answer",
    "u_ac_mean20",
    "ulr_ac_mean", #"ulr_ac_cnt",  "ub_cnt",
    "ut_ac_mean","ut_ac_mean2",
    "uca_ac_mean", 
    "ul_td", "u_td_wl", 
    #"q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
    #"pqhe",
    "uc_td", "uc_ac_mean", "uc_ac_cnt", 
    #"u_td_final", #"ub_td_final"
    'u_td_tp1','u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5',
    'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10',
    "uac_prev1", "uac_prev2",
    "elo_beta", "u_elo_theta", 
    "u_rate",
]
nn_col = [f"nn_svd{i}" for i in range(20)]
pred_col += nn_col
print(pred_col)

['et', 'q_ac_cnt', 'b_ac_mean', 'b_ac_cnt', 'q_ac_mean', 'u_ac_mean', 'u_cnt', 'u_qm_mean', 'u_ac_cnt', 'u_ok_qm_mean', 'u_ng_qm_mean', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'up_ac_cnt', 'up_ac_mean', 'u_td', 'q_ng_uac_mean', 'q_ok_uac_mean', 'q_ok_uac_std', 'q_ng_uac_std', 'correct_answer', 'u_ac_mean20', 'ulr_ac_mean', 'ut_ac_mean', 'ut_ac_mean2', 'uca_ac_mean', 'ul_td', 'u_td_wl', 'uc_td', 'uc_ac_mean', 'uc_ac_cnt', 'u_td_tp1', 'u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5', 'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10', 'uac_prev1', 'uac_prev2', 'elo_beta', 'u_elo_theta', 'u_rate', 'nn_svd0', 'nn_svd1', 'nn_svd2', 'nn_svd3', 'nn_svd4', 'nn_svd5', 'nn_svd6', 'nn_svd7', 'nn_svd8', 'nn_svd9', 'nn_svd10', 'nn_svd11', 'nn_svd12', 'nn_svd13', 'nn_svd14', 'nn_svd15', 'nn_svd16', 'nn_svd17', 'nn_svd18', 'nn_svd19']


In [17]:
pred_col = [
    "owruby", "sakami", "pocket_1223",
    "uc_td", "uc_ac_mean",
    "ul_td", "u_td", "u_td_p1p2"
]

In [18]:
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(merged)

---------
fold= 0
(3000000, 8) (2000000, 8)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.505861
[200]	valid_0's binary_logloss: 0.499969
[300]	valid_0's binary_logloss: 0.49968
[400]	valid_0's binary_logloss: 0.499666
[500]	valid_0's binary_logloss: 0.499663
Did not meet early stopping. Best iteration is:
[481]	valid_0's binary_logloss: 0.499662
AUC= 0.8128734162744986
          name  importance_split  importance_gain
1       sakami              8671         18601614
0       owruby              9124          4145668
2  pocket_1223             10721           499615
5        ul_td              9117           100166
3        uc_td              6305            88450
6         u_td              7155            67837
4   uc_ac_mean              2671            60282
7    u_td_p1p2              6842            59413


In [55]:
# base feats
#Early stopping, best iteration is:
#[295]	valid_0's binary_logloss: 0.517508
#AUC= 0.7967711921542647

# with owruby, sakami
# [73]	valid_0's binary_logloss: 0.499981
#AUC= 0.812631130956799

# add pocket1223
#AUC= 0.8130389687940041

#just 3
#AUC= 0.8119216727001559

# just3 + top feats
#0.81279

In [22]:
metrics.roc_auc_score(merged2["label"], merged2["ranked_3avg"])

0.8115313781174149

In [17]:
for power in [1.5, 1.75, 2, 2.25, 2.5, 3, 3.5, 4]:
    merged2["power_mean"] = (merged2["sakami"]**power + merged2["owruby"]**power + merged2["pocket_1223"]**power)
    score = metrics.roc_auc_score(merged2["label"], merged2["power_mean"])
    print(power, score)

1.5 0.8115757793963951
1.75 0.8115617807361065
2 0.8115436387419797
2.25 0.8115220983411143
2.5 0.8114978412488558
3 0.811442840131525
3.5 0.8113818106061845
4 0.8113171371978005
