In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [2]:
# lyaka1 = np.load("./ssakt15_valid_preds.npy")
# lyaka2 = np.load("./ssakt31_valid_preds.npy")
# sakami["lyaka1"] = lyaka1
# sakami["lyaka2"] = lyaka2

In [3]:
# Load the owruby model's predictions; the preview below shows row_id,
# answered_correctly (the predicted probability) and label columns.
owruby =  pd.read_csv("./owruby/owruby_usplit.csv")

In [4]:
# Preview the first two rows of the owruby predictions.
owruby.head(2)

Unnamed: 0,row_id,answered_correctly,label
0,1874,0.822659,0
1,1875,0.408418,0


In [5]:
# Start the sakami frame from owruby's row_id column, ordered by row_id, so the
# prediction arrays loaded next can be attached to it by position.
sakami_rows = owruby.loc[:, ["row_id"]].sort_values("row_id")

In [6]:
# Attach one prediction column per sakami run. Each .npy array is assigned by
# position, so it is presumed to be ordered like `sakami_rows` — TODO confirm.
for run_id in (192, 193, 197, 198, 200, 201):
    sakami_rows[f"sakami{run_id}"] = np.load(f"./sakami/sakami{run_id}_usplit.npy")

In [7]:
# Preview the first two rows with the six sakami prediction columns attached.
sakami_rows.head(2)

Unnamed: 0,row_id,sakami192,sakami193,sakami197,sakami198,sakami200,sakami201
0,1874,0.845537,0.824574,0.82625,0.838496,0.833348,0.835412
1,1875,0.268288,0.25253,0.252409,0.259301,0.22417,0.265376


In [11]:
# Join the sakami prediction columns onto the owruby frame, keyed on row_id,
# then preview the result.
merged = owruby.merge(sakami_rows, on="row_id", how="inner")
merged.head(2)

Unnamed: 0,row_id,answered_correctly,label,sakami192,sakami193,sakami197,sakami198,sakami200,sakami201
0,1874,0.822659,0,0.845537,0.824574,0.82625,0.838496,0.833348,0.835412
1,1875,0.408418,0,0.268288,0.25253,0.252409,0.259301,0.22417,0.265376


In [8]:
# Load the pocket model's predictions; the preview below shows a single
# unnamed-looking column ("0") and no row_id.
pocket = pd.read_csv("../pred_0106_2.csv")

In [9]:
# Preview the first two pocket predictions.
pocket.head(2)

Unnamed: 0,0
0,0.317725
1,0.176799


In [12]:
# `pocket` carries no row_id, so it can only be attached by position. Assigning
# the whole DataFrame relied on index alignment, which silently fills NaN (or
# drops rows) when the lengths differ; fail loudly instead.
# NOTE(review): this presumes pred_0106_2.csv is in the same row order as
# `merged` — confirm against whatever produced that file.
assert len(pocket) == len(merged), "pocket predictions and merged frame differ in length"
merged["pocket"] = pocket.iloc[:, 0].to_numpy()

In [13]:
# Move the owruby prediction out of the generically named `answered_correctly`
# column into a trailing `owruby` column (pop removes the old column in place).
merged["owruby"] = merged.pop("answered_correctly")

In [14]:
# Preview `merged` now that the pocket and owruby columns are in place.
merged.head(2)

Unnamed: 0,row_id,label,sakami192,sakami193,sakami197,sakami198,sakami200,sakami201,pocket,owruby
0,1874,0,0.845537,0.824574,0.82625,0.838496,0.833348,0.835412,0.317725,0.822659
1,1875,0,0.268288,0.25253,0.252409,0.259301,0.22417,0.265376,0.176799,0.408418


In [15]:
# List the column labels currently held by `merged`.
merged.columns

Index(['row_id', 'label', 'sakami192', 'sakami193', 'sakami197', 'sakami198',
       'sakami200', 'sakami201', 'pocket', 'owruby'],
      dtype='object')

In [16]:
# Load the hand-crafted feature table (it includes row_id and the `ac` target
# column, as the preview below shows).
valid_feats = pd.read_feather("../valid_0106_2.feather")

In [17]:
# Preview the first two rows of the feature table.
valid_feats.head(2)

Unnamed: 0,index,row_id,user_id,ac,et,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,...,u_td_p4p5,u_td_p5p6,u_td_p6p7,u_td_p7p8,u_td_p8p9,u_td_p9p10,ut_ac_mean,ut_ac_mean2,u_ok_ts,u_ng_ts
0,30755,74094689,1576045200,0,,0.554006,9897,25198.609375,9842,21517.152005,...,,,,,,,,,,
1,30769,74094690,1576045200,1,27000.0,0.364658,14419,26085.080078,14356,20998.380102,...,,,,,,,,,,-61704.0


In [18]:
# (rows, columns) of the feature table.
valid_feats.shape

(5296660, 83)

In [19]:
# Add the hand-crafted features to the stacked predictions, keyed on row_id.
# Rebinding `merged` means re-running this cell merges the features a second time.
merged = merged.merge(valid_feats, on="row_id", how="inner")

In [20]:
#merged["owruby2"] = (merged["owruby2_1"] + merged["owruby2_2"] + merged["owruby2_3"] + merged["owruby2_4"]) / 4

In [21]:
# Preview `merged` after the feature columns have been joined in.
merged.head(2)

Unnamed: 0,row_id,label,sakami192,sakami193,sakami197,sakami198,sakami200,sakami201,pocket,owruby,...,u_td_p4p5,u_td_p5p6,u_td_p6p7,u_td_p7p8,u_td_p8p9,u_td_p9p10,ut_ac_mean,ut_ac_mean2,u_ok_ts,u_ng_ts
0,1874,0,0.845537,0.824574,0.82625,0.838496,0.833348,0.835412,0.317725,0.822659,...,,,,,,,,,,
1,1875,0,0.268288,0.25253,0.252409,0.259301,0.22417,0.265376,0.176799,0.408418,...,,,,,,,,,,-24224.0


In [22]:
# Load the list of row_ids that is used further down to carve out the
# validation part of `merged`.
usplit_rowid = pd.read_csv("./usplit_rowid.csv")

In [24]:
# Preview the first two validation row_ids.
usplit_rowid.head(2)

Unnamed: 0,row_id
0,24785
1,24786


In [25]:
# (rows, columns) of the validation row_id list.
usplit_rowid.shape

(1999704, 1)

In [26]:
class SingleLgb:
    """Train a single LightGBM binary classifier against a validation set.

    Parameters
    ----------
    seed : int
        Stored in the training parameters as ``random_state``.
    dry_run : bool
        When true, boosting is capped at 100 rounds instead of 1100.
    """

    # Stop once the validation metric has not improved for this many rounds.
    EARLY_STOPPING_ROUNDS = 100
    # Print the validation metric every this many boosting rounds.
    LOG_PERIOD = 100

    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1100

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Fit a booster on the training data, early-stopping on the test data.

        Parameters
        ----------
        x_train, y_train
            Features and target handed to ``lgb.Dataset`` for training.
        x_test, y_test
            Features and target handed to ``lgb.Dataset`` for validation.

        Returns
        -------
        The booster returned by ``lgb.train``.
        """
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # lgb.train() dropped its `early_stopping_rounds=` / `verbose_eval=`
        # keywords in LightGBM 4.0. The callback forms are accepted by old and
        # new releases alike; `print_evaluation` is the older name of
        # `log_evaluation`, used only when the newer one is missing.
        log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
        return lgb.train(
            self.train_param,
            lgb_train,
            num_boost_round=self.num_rounds,
            valid_sets=[lgb_eval],
            callbacks=[
                lgb.early_stopping(self.EARLY_STOPPING_ROUNDS),
                log_evaluation(self.LOG_PERIOD),
            ],
        )

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print every feature's split and gain importance, highest gain first.

        Parameters
        ----------
        model
            Object exposing ``feature_name()`` and
            ``feature_importance(importance_type=...)``.
        filename
            Accepted for interface compatibility; currently ignored.
        """
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        # Lift pandas' display limits so the full table is printed, not a summary.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(fi)

    @staticmethod
    def get_param(seed):
        """Return the LightGBM parameter dict, with ``seed`` as ``random_state``."""
        return {
            'num_leaves': 127,
            'min_data_in_leaf': 100,
            'objective': 'binary',
            #'metric': 'auc',
            'metric': 'binary_logloss',
            'max_depth': -1,
            'learning_rate': 0.02,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
    
class SingleTrainer:
    """Fit one ``SingleLgb`` model on a fixed train/validation split.

    Parameters
    ----------
    pred_col : list of str
        Feature columns selected from both frames.
    dry_run : bool
        Forwarded to ``SingleLgb`` (fewer boosting rounds when true).
    """

    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        # Not read anywhere in this class; kept so existing attribute access works.
        self.val_size = 2*1000*1000

    def train_model(self, df_train, df_valid):
        """Train a single booster and report its AUC on ``df_valid``.

        The original loop over four folds always stopped after the first
        iteration, so exactly one model was ever trained; that is now explicit.

        Parameters
        ----------
        df_train, df_valid : DataFrame
            Frames containing ``self.pred_col`` and ``self.target_col``.

        Returns
        -------
        (models, mean_score)
            A one-element list holding the booster, and the mean of the
            one-element AUC list.
        """
        X_train, X_val = df_train[self.pred_col], df_valid[self.pred_col]
        y_train, y_val = df_train[self.target_col], df_valid[self.target_col]

        fold = 0
        print("---------")
        print("fold=", fold)
        print(X_train.shape, X_val.shape)

        lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
        model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)

        # AUC is the reported score; the log-loss that used to be read from
        # model.best_score was overwritten before anyone could use it.
        pred = model.predict(X_val)
        score = metrics.roc_auc_score(y_val, pred)
        print("AUC=", score)
        lgbm.show_feature_importance(model)

        models, scores = [model], [score]
        return models, np.mean(scores)

In [27]:
# pred_col = [
#     "et",  "q_ac_cnt",  'b_ac_mean', 'b_ac_cnt', "q_ac_mean",
#     "u_ac_mean", "u_cnt", "u_qm_mean", #"u_td", #"uc_td",
#     "u_ac_cnt", #"u_et_cnt"
#     'u_ok_qm_mean', 'u_ng_qm_mean',
#     'q_et_mean', 'q_et_cnt', 'q_et_std',
#    "up_ac_cnt", "up_ac_mean",
#     "u_td", 
#     "q_ng_uac_mean","q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_std",
#     "correct_answer",
#     "u_ac_mean20",
#     "ulr_ac_mean", #"ulr_ac_cnt",  "ub_cnt",
#     "ut_ac_mean","ut_ac_mean2",
#     "uca_ac_mean", 
#     "ul_td", "u_td_wl", 
#     #"q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
#     #"pqhe",
#     "uc_td", "uc_ac_mean", "uc_ac_cnt", 
#     #"u_td_final", #"ub_td_final"
#     'u_td_tp1','u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5',
#     'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10',
#     "uac_prev1", "uac_prev2",
#     "elo_beta", "u_elo_theta", 
#     "u_rate",
# ]
# nn_col = [f"nn_svd{i}" for i in range(20)]
# pred_col += nn_col
# print(pred_col)

['et', 'q_ac_cnt', 'b_ac_mean', 'b_ac_cnt', 'q_ac_mean', 'u_ac_mean', 'u_cnt', 'u_qm_mean', 'u_ac_cnt', 'u_ok_qm_mean', 'u_ng_qm_mean', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'up_ac_cnt', 'up_ac_mean', 'u_td', 'q_ng_uac_mean', 'q_ok_uac_mean', 'q_ok_uac_std', 'q_ng_uac_std', 'correct_answer', 'u_ac_mean20', 'ulr_ac_mean', 'ut_ac_mean', 'ut_ac_mean2', 'uca_ac_mean', 'ul_td', 'u_td_wl', 'uc_td', 'uc_ac_mean', 'uc_ac_cnt', 'u_td_tp1', 'u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5', 'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10', 'uac_prev1', 'uac_prev2', 'elo_beta', 'u_elo_theta', 'u_rate', 'nn_svd0', 'nn_svd1', 'nn_svd2', 'nn_svd3', 'nn_svd4', 'nn_svd5', 'nn_svd6', 'nn_svd7', 'nn_svd8', 'nn_svd9', 'nn_svd10', 'nn_svd11', 'nn_svd12', 'nn_svd13', 'nn_svd14', 'nn_svd15', 'nn_svd16', 'nn_svd17', 'nn_svd18', 'nn_svd19']


In [28]:
# Feature columns fed to the stacking model: the base-model predictions first,
# then a selection of the hand-crafted features.
pred_col = [
    # base-model predictions
    "sakami192",
    "sakami193",
    "sakami197",
    "sakami198",
    "sakami200",
    "sakami201",
    "pocket",
    "owruby",
    # hand-crafted features
    "uc_td",
    "uc_ac_mean",
    "uc_ac_cnt",
    "ul_td",
    "u_td",
    "u_td_p1p2",
    "ut_ac_mean",
    "ut_ac_mean2",
    "up_ac_mean",
    "et",
    "q_et_mean",
    "q_et_std",
    "u_elo_theta",
    "q_ac_mean",
    # left out: "ulr_elo_mean", "up_elo_mean"
    "up_ac_cnt",
    "u_cnt",
    "q_ac_cnt",
]


In [29]:
#sorted_df = merged.sort_values(by="row_id")

In [31]:
# Rows whose row_id appears in usplit_rowid form the validation set; every
# other row is used for training.
cond = merged["row_id"].isin(usplit_rowid["row_id"])
df_valid = merged.loc[cond]
df_train = merged.loc[~cond]

In [32]:
# Full-length run (dry_run off): train on df_train, early-stop and score on df_valid.
trainer = SingleTrainer(pred_col=pred_col, dry_run=False)
models, score = trainer.train_model(df_train=df_train, df_valid=df_valid)

---------
fold= 0
(3296956, 25) (1999704, 25)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.495334
[200]	valid_0's binary_logloss: 0.489187
[300]	valid_0's binary_logloss: 0.488709
[400]	valid_0's binary_logloss: 0.48862
[500]	valid_0's binary_logloss: 0.488604
[600]	valid_0's binary_logloss: 0.488597
[700]	valid_0's binary_logloss: 0.488592
[800]	valid_0's binary_logloss: 0.488591
[900]	valid_0's binary_logloss: 0.488593
Early stopping, best iteration is:
[843]	valid_0's binary_logloss: 0.488587
AUC= 0.8130197113324136
           name  importance_split  importance_gain
7        owruby              6735          9412458
0     sakami192              4471          8549454
1     sakami193              4858          6844404
2     sakami197              3647           148438
11        ul_td              7421           106067
8         uc_td              5301           104403
3     sakami198              3738           103581
9    uc_ac_mean 

In [33]:
# trainer = SingleTrainer(pred_col, dry_run=False)
# models, score = trainer.seed_avg_model(merged)

In [34]:
# final_valid = merged.iloc[-2*1000*1000:]
# preds = []
# for model in models:
#     pred_y = models[0].predict(final_valid[pred_col])
#     preds.append(pred_y)
# final_pred = (preds[0] +  preds[1] +  preds[2] +  preds[3])/4
# final_pred.shape
# score = metrics.roc_auc_score(final_valid["ac"], final_pred)
# print(score)

In [35]:
# new valid 3m-2m, 20feats: 

In [36]:
# Persist the first booster in `models` to a LightGBM model file.
models[0].save_model("./ensemble_model_0106_usplit.lgb")

<lightgbm.basic.Booster at 0x7fbd0a7b4fd0>

In [92]:
# row_id of the last two million rows of `merged`.
merged.tail(2_000_000)["row_id"]

3000000    25708400
3000001    25708401
3000002    25708402
3000003    25708403
3000004    25708404
             ...   
4999995    26498141
4999996    26498142
4999997    26498143
4999998    26498144
4999999    32938743
Name: row_id, Length: 2000000, dtype: int64

In [99]:
# Score the last two million rows of `merged` with the first booster.
final_pred = models[0].predict(merged.tail(2_000_000)[pred_col])

In [100]:
# Shape of the prediction array (one value per scored row).
final_pred.shape

(2000000,)

In [101]:
# Pair each prediction with the row_id of the row it was computed for.
final_pred_df = pd.DataFrame(
    {
        "row_id": merged.tail(2_000_000)["row_id"],
        "pocket_pred": final_pred,
    }
)

In [103]:
# Preview the first two row_id / prediction pairs.
final_pred_df.head(2)

Unnamed: 0,row_id,pocket_pred
3000000,25708400,0.726013
3000001,25708401,0.943335


In [104]:
# Write the predictions to CSV; `index` is left at its default, so the frame's
# index is written as the first column.
final_pred_df.to_csv("old_valid_pocket_stack_pred.csv")

In [None]:
# before lyaka 8140
# after sakami1 8141
# after lyaka12 8142
# only lyaka12, no sakami1 0.81419: model0101
# only lyaka2 0.81415

# valid 

In [55]:
# base feats
#Early stopping, best iteration is:
#[295]	valid_0's binary_logloss: 0.517508
#AUC= 0.7967711921542647

# with owruby, sakami
# [73]	valid_0's binary_logloss: 0.499981
#AUC= 0.812631130956799

# add pocket1223
#AUC= 0.8130389687940041

#just 3
#AUC= 0.8119216727001559

# just3 + top feats
#0.81279

In [58]:
class SeedLgb:
    """Train a LightGBM binary classifier on all given data, with no validation set.

    Parameters
    ----------
    seed : int
        Stored in the training parameters as ``random_state``.
    dry_run : bool
        When true, boosting runs for 100 rounds instead of 1100.
    """

    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1100

    def do_train_direct(self, x_train, y_train):
        """Fit a booster for exactly ``self.num_rounds`` rounds.

        Returns the booster returned by ``lgb.train``.
        """
        lgb_train = lgb.Dataset(x_train, y_train)

        # No validation set is supplied, so there is nothing to log or to
        # early-stop on. The former `verbose_eval=100` keyword therefore had no
        # effect, and LightGBM 4.0 rejects it outright.
        return lgb.train(
            self.train_param,
            lgb_train,
            num_boost_round=self.num_rounds,
        )

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print every feature's split and gain importance, highest gain first.

        ``filename`` is accepted for interface compatibility and currently ignored.
        """
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        # Lift pandas' display limits so the full table is printed, not a summary.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(fi)

    @staticmethod
    def get_param(seed):
        """Return the LightGBM parameter dict, with ``seed`` as ``random_state``."""
        return {
            'num_leaves': 127,
            'min_data_in_leaf': 100,
            'objective': 'binary',
            #'metric': 'auc',
            'metric': 'binary_logloss',
            'max_depth': -1,
            'learning_rate': 0.02,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
    
class SeedTrainer:
    """Train four ``SeedLgb`` models on the same data, one per seed 0..3."""

    def __init__(self, pred_col, dry_run=False):
        self.dry_run = dry_run
        self.target_col = "ac"
        self.pred_col = pred_col

    def train_model(self, df):
        """Fit one model per seed on every row of ``df``.

        Returns ``(models, scores)``. No validation is performed here, so
        ``scores`` is always an empty list.
        """
        features = df[self.pred_col]
        target = df[self.target_col]

        scores = []
        models = []
        for seed in range(4):
            print("---------")
            print("fold=", seed)

            learner = SeedLgb(seed=seed, dry_run=self.dry_run)
            booster = learner.do_train_direct(features, target)
            # The importance table is only shown for the first seed.
            if seed == 0:
                learner.show_feature_importance(booster)
            models.append(booster)
        return models, scores

In [59]:
# Retrain on every row of `merged`, once per seed. SeedTrainer returns an empty
# score list, so `score` ends up as [] here.
trainer = SeedTrainer(pred_col=pred_col, dry_run=False)
models, score = trainer.train_model(df=merged)

---------
fold= 0
           name  importance_split  importance_gain
1       sakami2              9218         21663621
2       owruby2             10160         16689718
0   pocket_1226             11127           640644
3        lyaka2              7528           252855
4         uc_td              6046           140199
6         ul_td              9285           131655
7          u_td              7975            96863
5    uc_ac_mean              2511            93665
19        u_cnt              7030            93439
16    q_ac_mean              7188            89085
15  u_elo_theta              7249            81063
8     u_td_p1p2              6702            73352
20     q_ac_cnt              6421            68977
11   up_ac_mean              6620            68750
13    q_et_mean              5564            64248
17    up_ac_cnt              5850            62281
9    ut_ac_mean              5259            61034
12           et              5675            60832
10  ut_ac_mea

In [60]:
# Persist the four boosters, one file per index.
for i in range(4):
    booster = models[i]
    booster.save_model(f"./ensemble_model_0101_all{i}.lgb")

In [22]:
# AUC of the `ranked_3avg` blend against the labels.
# NOTE(review): `merged2` and its `ranked_3avg` column are not created in any
# cell visible here — the cell that built them needs to be restored.
metrics.roc_auc_score(y_true=merged2["label"], y_score=merged2["ranked_3avg"])

0.8115313781174149

In [17]:
# Sweep the exponent of a power blend of the three base predictions and print
# the AUC for each. The powered columns are summed without dividing by three;
# AUC depends only on the ordering of the scores, so the scale is irrelevant.
blend_inputs = ("sakami", "owruby", "pocket_1223")
for power in [1.5, 1.75, 2, 2.25, 2.5, 3, 3.5, 4]:
    sakami_pow, owruby_pow, pocket_pow = (merged2[name] ** power for name in blend_inputs)
    merged2["power_mean"] = sakami_pow + owruby_pow + pocket_pow
    score = metrics.roc_auc_score(merged2["label"], merged2["power_mean"])
    print(power, score)

1.5 0.8115757793963951
1.75 0.8115617807361065
2 0.8115436387419797
2.25 0.8115220983411143
2.5 0.8114978412488558
3 0.811442840131525
3.5 0.8113818106061845
4 0.8113171371978005
