In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [2]:
# lyaka1 = np.load("./ssakt15_valid_preds.npy")
# lyaka2 = np.load("./ssakt31_valid_preds.npy")
# sakami["lyaka1"] = lyaka1
# sakami["lyaka2"] = lyaka2

In [3]:
# Load the "owruby" predictions; the preview below shows row_id,
# answered_correctly (the prediction) and label columns.
owruby =  pd.read_csv("./owruby/owruby_old_large.csv")

In [4]:
owruby.head(2)

Unnamed: 0,row_id,answered_correctly,label
0,70629059,0.881129,1
1,70629060,0.846699,1


In [5]:
# Keep only row_id, sorted ascending; the original index labels are retained.
sakami_rows = owruby[["row_id"]].sort_values(by="row_id")

In [6]:
# Attach each saved sakami prediction array as its own column.
# NOTE(review): ndarray assignment is positional, so each array presumably
# follows the ascending row_id order of `sakami_rows` — confirm with the producer.
for column, npy_path in (
    ("sakami181", "./sakami/sakami181_old.npy"),
    ("sakami188", "./sakami/sakami188_old.npy"),
    ("sakami189", "./sakami/sakami189_old.npy"),
):
    sakami_rows[column] = np.load(npy_path)

In [7]:
# Load the lyaka predictions and attach them as a column.
# NOTE(review): the ndarray is assigned positionally, so it presumably follows
# the ascending row_id order of `sakami_rows` — confirm.
lyaka = np.load("./lyaka/lyaka_old.npy")
sakami_rows["lyaka"] = lyaka

In [8]:
sakami_rows.head(2)

Unnamed: 0,row_id,sakami181,sakami188,sakami189,lyaka
608902,96,0.312084,0.34943,0.275476,0.319501
608949,97,0.290176,0.320404,0.289015,0.335971


In [9]:
# Inner-join the owruby frame with the per-row model predictions on row_id,
# then preview the first two rows.
merged = pd.merge(owruby, sakami_rows, on="row_id", how="inner")
merged.head(2)

Unnamed: 0,row_id,answered_correctly,label,sakami181,sakami188,sakami189,lyaka
0,70629059,0.881129,1,0.907971,0.921032,0.877498,0.939251
1,70629060,0.846699,1,0.829032,0.874914,0.818954,0.859678


In [10]:
# Load the "pocket" predictions; the preview below shows a single column named 0.
pocket = pd.read_csv("../pred_0106_long.csv")

In [11]:
pocket.head(2)

Unnamed: 0,0
0,0.870664
1,0.864616


In [12]:
# Add the pocket predictions as a column. `pocket` is a DataFrame, so pandas
# matches it to `merged` by index label, not by row_id.
# NOTE(review): assumes the CSV rows are in the same order as `merged` — confirm.
merged["pocket"] = pocket

In [13]:
# Rename the owruby prediction column in place: popping removes
# "answered_correctly" and the assignment re-adds it as the last column.
merged["owruby"] = merged.pop("answered_correctly")

In [14]:
merged.head(2)

Unnamed: 0,row_id,label,sakami181,sakami188,sakami189,lyaka,pocket,owruby
0,70629059,1,0.907971,0.921032,0.877498,0.939251,0.870664,0.881129
1,70629060,1,0.829032,0.874914,0.818954,0.859678,0.864616,0.846699


In [15]:
merged.columns

Index(['row_id', 'label', 'sakami181', 'sakami188', 'sakami189', 'lyaka',
       'pocket', 'owruby'],
      dtype='object')

In [16]:
# Load the validation feature table (5,000,000 rows x 83 columns per the shape below).
valid_feats = pd.read_feather("../valid_0106.feather")

In [17]:
valid_feats.head(2)

Unnamed: 0,index,row_id,user_id,ac,et,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,...,u_td_p4p5,u_td_p5p6,u_td_p6p7,u_td_p7p8,u_td_p8p9,u_td_p9p10,ut_ac_mean,ut_ac_mean2,u_ok_ts,u_ng_ts
0,96130348,70629059,1503172082,1,93333.0,0.890898,15875,24237.519531,15875,13892.560771,...,2732487.0,143467.0,190046.0,182226.0,190944.0,110019856.0,0.705882,0.688819,-9272200.0,-9597517.0
1,96130349,70629060,1503172082,1,93333.0,0.872811,15874,24237.470703,15874,13892.997062,...,454170.0,2732487.0,143467.0,190046.0,182226.0,190944.0,0.737113,0.711192,-9272200.0,-9597517.0


In [18]:
valid_feats.shape

(5000000, 83)

In [19]:
# Join the predictions with the feature table on row_id; the inner join keeps
# only row_ids present on both sides.
merged = pd.merge(merged, valid_feats, on="row_id", how="inner")

In [20]:
merged.head(2)

Unnamed: 0,row_id,label,sakami181,sakami188,sakami189,lyaka,pocket,owruby,index,user_id,...,u_td_p4p5,u_td_p5p6,u_td_p6p7,u_td_p7p8,u_td_p8p9,u_td_p9p10,ut_ac_mean,ut_ac_mean2,u_ok_ts,u_ng_ts
0,70629059,1,0.907971,0.921032,0.877498,0.939251,0.870664,0.881129,96130348,1503172082,...,2732487.0,143467.0,190046.0,182226.0,190944.0,110019856.0,0.705882,0.688819,-9272200.0,-9597517.0
1,70629060,1,0.829032,0.874914,0.818954,0.859678,0.864616,0.846699,96130349,1503172082,...,454170.0,2732487.0,143467.0,190046.0,182226.0,190944.0,0.737113,0.711192,-9272200.0,-9597517.0


In [21]:
# Input columns for the stacking model (passed to the trainers as `pred_col`):
# base-model prediction columns first, then feature columns of `merged`.
pred_col = [
    'sakami181', 'sakami188', 'sakami189', 
    #'sakami194', #"sakami199", 'sakami195',  
    'pocket', "owruby", "lyaka",
    "uc_td", "uc_ac_mean", "uc_ac_cnt", 
    "ul_td", "u_td", "u_td_p1p2", 
    "ut_ac_mean", "ut_ac_mean2",
    "up_ac_mean",
    "et", "q_et_mean", "q_et_std",
    "u_elo_theta","q_ac_mean",  #"ulr_elo_mean", "up_elo_mean",
    "up_ac_cnt", "u_cnt", "q_ac_cnt"
]

In [27]:
class SingleLgb:
    """Small wrapper around ``lgb.train`` using one fixed binary-objective parameter set."""

    def __init__(self, seed=99, dry_run=False):
        """Build the parameter dict and choose the boosting-round budget.

        seed: passed to ``get_param`` and used as ``random_state``.
        dry_run: when truthy, train for at most 100 rounds instead of 1100.
        """
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1100

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Train on ``(x_train, y_train)`` while evaluating on ``(x_test, y_test)``.

        The evaluation set is the only validation set handed to ``lgb.train``;
        logging happens every 100 rounds and the early-stopping patience is 100.
        Returns whatever ``lgb.train`` returns.
        """
        train_set = lgb.Dataset(x_train, y_train)
        eval_set = lgb.Dataset(x_test, y_test)
        fit_options = dict(
            valid_sets=[eval_set],
            verbose_eval=100,
            num_boost_round=self.num_rounds,
            early_stopping_rounds=100,
        )
        return lgb.train(self.train_param, train_set, **fit_options)

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split and gain importance for every feature, largest gain first.

        ``filename`` is accepted but not used by this method.
        """
        names = model.feature_name()
        by_split = model.feature_importance(importance_type="split").astype(int)
        by_gain = model.feature_importance(importance_type="gain").astype(int)
        table = pd.DataFrame({
            "name": names,
            "importance_split": by_split,
            "importance_gain": by_gain,
        }).sort_values(by="importance_gain", ascending=False)
        # Lift the display limits so no row or column is elided in the printout.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(table)

    @staticmethod
    def get_param(seed):
        """Return the LightGBM parameter dict with ``random_state`` set to ``seed``."""
        params = dict(
            num_leaves=127,
            min_data_in_leaf=100,
            objective='binary',
            # alternative metric left by the author: 'auc'
            metric='binary_logloss',
            max_depth=-1,
            learning_rate=0.02,
            boosting="gbdt",
            feature_fraction=0.9,
            verbosity=-1,
            random_state=seed,
        )
        return params
    
class SingleTrainer:
    """Fits stacking LightGBM models on ``pred_col`` inputs against the ``ac`` target.

    Two validation schemes are offered: contiguous K-fold (``do_kfold``) and a
    shrinking tail hold-out with a growing training prefix (``train_model``).
    """

    def __init__(self, pred_col, dry_run=False, val_size=2*1000*1000):
        """
        Args:
            pred_col: list of column names used as model inputs.
            dry_run: forwarded to ``SingleLgb`` (shorter training when true).
            val_size: number of trailing rows held out by the first fold of
                ``train_model``; defaults to the previously hard-coded 2,000,000.
        """
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        self.val_size = val_size

    def _fit_fold(self, fold, X_train, X_val, y_train, y_val):
        """Train one model on a split and return ``(model, validation AUC)``.

        Prints the fold banner, the split shapes and the AUC; feature
        importance is printed for fold 0 only.
        """
        print("---------")
        print("fold=", fold)
        print(X_train.shape, X_val.shape)

        lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
        model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
        # AUC on the validation split is the score that is reported and averaged.
        score = metrics.roc_auc_score(y_val, model.predict(X_val))
        print("AUC=", score)
        if fold == 0:
            lgbm.show_feature_importance(model)
        return model, score

    def do_kfold(self, df, n_splits=5):
        """Train one model per unshuffled K-fold split of ``df``.

        Args:
            df: frame containing ``pred_col`` and the ``ac`` target column.
            n_splits: number of contiguous folds (default 5).

        Returns:
            ``(models, mean AUC over the folds)``.
        """
        X = df[self.pred_col]
        y = df[self.target_col]

        models, scores = [], []
        kf = model_selection.KFold(n_splits=n_splits, random_state=None, shuffle=False)
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            model, score = self._fit_fold(
                fold,
                X.iloc[train_index], X.iloc[test_index],
                y.iloc[train_index], y.iloc[test_index],
            )
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)

    def train_model(self, df, n_folds=3, step=500*1000):
        """Train ``n_folds`` models with a growing prefix and a shrinking tail.

        Fold ``f`` validates on the last ``val_size - f*step`` rows of ``df``
        and trains on every row before them.

        Args:
            df: frame containing ``pred_col`` and the ``ac`` target column.
            n_folds: number of models to train (default 3).
            step: rows moved from the validation tail to the training prefix
                per fold (default 500,000).

        Returns:
            ``(models, mean AUC over the folds)``.

        Raises:
            ValueError: if any fold would have an empty training or validation
                split for the given ``val_size``/``step``/``len(df)``.
        """
        X = df[self.pred_col]
        y = df[self.target_col]

        # Negative iloc offsets marking where each fold's validation tail begins.
        starts = [-self.val_size + fold * step for fold in range(n_folds)]
        # Check every fold before training anything: an offset of 0 or above
        # makes iloc[:start] empty or count from the front, and a tail as long
        # as the frame leaves nothing to train on.
        for fold, start in enumerate(starts):
            if not 0 < -start < len(df):
                raise ValueError(
                    f"fold {fold}: validation tail of {-start} rows does not leave "
                    f"non-empty train and validation splits for {len(df)} rows"
                )

        models, scores = [], []
        for fold, start in enumerate(starts):
            model, score = self._fit_fold(
                fold,
                X.iloc[:start], X.iloc[start:],
                y.iloc[:start], y.iloc[start:],
            )
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)

In [28]:
# Fit the stacking models on the full merged frame; train_model returns the
# fitted models and the mean validation AUC.
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(merged)

---------
fold= 0
(3000000, 23) (2000000, 23)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.504202
[200]	valid_0's binary_logloss: 0.498013
[300]	valid_0's binary_logloss: 0.497589
[400]	valid_0's binary_logloss: 0.497511
[500]	valid_0's binary_logloss: 0.497484
[600]	valid_0's binary_logloss: 0.497475
[700]	valid_0's binary_logloss: 0.497471
Early stopping, best iteration is:
[635]	valid_0's binary_logloss: 0.49747
AUC= 0.8148371931373621
           name  importance_split  importance_gain
4        owruby              6191         11981923
0     sakami181              3579          6966366
1     sakami188              4305          3955505
3        pocket              7373           426322
2     sakami189              3766           143414
5         lyaka              3535           107558
9         ul_td              4949            72491
6         uc_td              3362            69572
21        u_cnt              3821            55

In [29]:
# Persist the first three fitted models, one file each.
models[0].save_model("./ensemble_model_0107_old_mv0.lgb")
models[1].save_model("./ensemble_model_0107_old_mv1.lgb")
models[2].save_model("./ensemble_model_0107_old_mv2.lgb")

<lightgbm.basic.Booster at 0x7f6fecd21fd0>

In [24]:
class SingleLgb:
    """Small wrapper around ``lgb.train`` using one fixed binary-objective parameter set."""

    def __init__(self, seed=99, dry_run=False):
        """Build the parameter dict and choose the boosting-round budget.

        seed: passed to ``get_param`` and used as ``random_state``.
        dry_run: when truthy, train for at most 100 rounds instead of 1100.
        """
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1100

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Train on ``(x_train, y_train)`` while evaluating on ``(x_test, y_test)``.

        The evaluation set is the only validation set handed to ``lgb.train``;
        logging happens every 100 rounds and the early-stopping patience is 100.
        Returns whatever ``lgb.train`` returns.
        """
        train_set = lgb.Dataset(x_train, y_train)
        eval_set = lgb.Dataset(x_test, y_test)
        fit_options = dict(
            valid_sets=[eval_set],
            verbose_eval=100,
            num_boost_round=self.num_rounds,
            early_stopping_rounds=100,
        )
        return lgb.train(self.train_param, train_set, **fit_options)

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split and gain importance for every feature, largest gain first.

        ``filename`` is accepted but not used by this method.
        """
        names = model.feature_name()
        by_split = model.feature_importance(importance_type="split").astype(int)
        by_gain = model.feature_importance(importance_type="gain").astype(int)
        table = pd.DataFrame({
            "name": names,
            "importance_split": by_split,
            "importance_gain": by_gain,
        }).sort_values(by="importance_gain", ascending=False)
        # Lift the display limits so no row or column is elided in the printout.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(table)

    @staticmethod
    def get_param(seed):
        """Return the LightGBM parameter dict with ``random_state`` set to ``seed``."""
        params = dict(
            num_leaves=127,
            min_data_in_leaf=100,
            objective='binary',
            # alternative metric left by the author: 'auc'
            metric='binary_logloss',
            max_depth=-1,
            learning_rate=0.02,
            boosting="gbdt",
            feature_fraction=0.9,
            verbosity=-1,
            random_state=seed,
        )
        return params
    
class SingleTrainer:
    """Fits stacking LightGBM models on ``pred_col`` inputs against the ``ac`` target.

    Two validation schemes are offered: contiguous K-fold (``do_kfold``) and a
    fixed-size validation window that slides backwards from the end of the
    frame (``train_model``).
    """

    def __init__(self, pred_col, dry_run=False, val_size=1*1000*1000):
        """
        Args:
            pred_col: list of column names used as model inputs.
            dry_run: forwarded to ``SingleLgb`` (shorter training when true).
            val_size: number of rows in each ``train_model`` validation window;
                defaults to the previously hard-coded 1,000,000.
        """
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        self.val_size = val_size

    def _fit_fold(self, fold, X_train, X_val, y_train, y_val):
        """Train one model on a split and return ``(model, validation AUC)``.

        Prints the fold banner, the split shapes and the AUC; feature
        importance is printed for fold 0 only.
        """
        print("---------")
        print("fold=", fold)
        print(X_train.shape, X_val.shape)

        lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
        model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
        # AUC on the validation split is the score that is reported and averaged.
        score = metrics.roc_auc_score(y_val, model.predict(X_val))
        print("AUC=", score)
        if fold == 0:
            lgbm.show_feature_importance(model)
        return model, score

    def do_kfold(self, df, n_splits=5):
        """Train one model per unshuffled K-fold split of ``df``.

        Args:
            df: frame containing ``pred_col`` and the ``ac`` target column.
            n_splits: number of contiguous folds (default 5).

        Returns:
            ``(models, mean AUC over the folds)``.
        """
        X = df[self.pred_col]
        y = df[self.target_col]

        models, scores = [], []
        kf = model_selection.KFold(n_splits=n_splits, random_state=None, shuffle=False)
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            model, score = self._fit_fold(
                fold,
                X.iloc[train_index], X.iloc[test_index],
                y.iloc[train_index], y.iloc[test_index],
            )
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)

    def train_model(self, df, n_folds=4):
        """Train ``n_folds`` models, each validated on an earlier window of ``df``.

        Fold ``f`` validates on the ``val_size`` rows ending ``f*val_size`` rows
        before the end of ``df`` and trains on every row before that window;
        rows after the window are not used by that fold.

        Args:
            df: frame containing ``pred_col`` and the ``ac`` target column.
            n_folds: number of windows/models (default 4).

        Returns:
            ``(models, mean AUC over the folds)``.

        Raises:
            ValueError: if ``val_size`` is not positive or the windows would
                leave no training rows for the last fold.
        """
        X = df[self.pred_col]
        y = df[self.target_col]

        window = self.val_size
        # Fail before any training: the last fold trains on rows before
        # n_folds * window, so that prefix must be non-empty.
        if window <= 0 or n_folds * window >= len(df):
            raise ValueError(
                f"{n_folds} validation windows of {window} rows do not leave "
                f"training rows in a frame of {len(df)} rows"
            )

        models, scores = [], []
        for fold in range(n_folds):
            val_s = -window - fold * window
            val_e = len(df) - fold * window
            model, score = self._fit_fold(
                fold,
                X.iloc[:val_s], X.iloc[val_s:val_e],
                y.iloc[:val_s], y.iloc[val_s:val_e],
            )
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)

In [25]:
# pred_col = [
#     "et",  "q_ac_cnt",  'b_ac_mean', 'b_ac_cnt', "q_ac_mean",
#     "u_ac_mean", "u_cnt", "u_qm_mean", #"u_td", #"uc_td",
#     "u_ac_cnt", #"u_et_cnt"
#     'u_ok_qm_mean', 'u_ng_qm_mean',
#     'q_et_mean', 'q_et_cnt', 'q_et_std',
#    "up_ac_cnt", "up_ac_mean",
#     "u_td", 
#     "q_ng_uac_mean","q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_std",
#     "correct_answer",
#     "u_ac_mean20",
#     "ulr_ac_mean", #"ulr_ac_cnt",  "ub_cnt",
#     "ut_ac_mean","ut_ac_mean2",
#     "uca_ac_mean", 
#     "ul_td", "u_td_wl", 
#     #"q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
#     #"pqhe",
#     "uc_td", "uc_ac_mean", "uc_ac_cnt", 
#     #"u_td_final", #"ub_td_final"
#     'u_td_tp1','u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5',
#     'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10',
#     "uac_prev1", "uac_prev2",
#     "elo_beta", "u_elo_theta", 
#     "u_rate",
# ]
# nn_col = [f"nn_svd{i}" for i in range(20)]
# pred_col += nn_col
# print(pred_col)

In [27]:
#sorted_df = merged.sort_values(by="row_id")

In [28]:
# Reserve the last 1,000,000 rows of `merged` as the holdout; the rest is training data.
train_df = merged.iloc[:-1000*1000]
holdout = merged.iloc[-1000*1000:]

In [29]:
train_df.shape, holdout.shape

((4000000, 89), (1000000, 89))

In [30]:
# Fit one model per K-fold split of the training portion; do_kfold returns the
# fitted models and the mean validation AUC.
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.do_kfold(train_df)

---------
fold= 0
(3200000, 22) (800000, 22)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.499546
[200]	valid_0's binary_logloss: 0.493288
[300]	valid_0's binary_logloss: 0.49285
[400]	valid_0's binary_logloss: 0.492773
[500]	valid_0's binary_logloss: 0.492738
[600]	valid_0's binary_logloss: 0.492715
[700]	valid_0's binary_logloss: 0.492708
[800]	valid_0's binary_logloss: 0.492704
Early stopping, best iteration is:
[783]	valid_0's binary_logloss: 0.492702
AUC= 0.8140606222283642
           name  importance_split  importance_gain
4        owruby              6873         16142563
0     sakami181              4752          5236242
1     sakami188              5751          3174670
3        pocket              8485           488095
2     sakami189              5162           174695
8         ul_td              6730            92801
5         uc_td              4080            77737
20        u_cnt              4875            65096
9      

In [36]:
# One prediction array per fitted model, all computed on the holdout inputs.
preds = [model.predict(holdout[pred_col]) for model in models]



In [37]:
# Element-wise average across the per-model prediction arrays (axis 0 indexes the models).
avg_preds = np.mean(preds, axis=0)

In [38]:
avg_preds

array([0.4300971 , 0.8993032 , 0.42278034, ..., 0.05937455, 0.03265606,
       0.93313205])

In [39]:
avg_preds.shape

(1000000,)

In [40]:
# Holdout AUC of the averaged predictions.
score = metrics.roc_auc_score(holdout["ac"], avg_preds)
score

0.8151531033551489

In [41]:
# Holdout predictions from the first model alone, for comparison with the average.
single_pred = models[0].predict(holdout[pred_col])

In [42]:
# Holdout AUC of the first model alone.
score = metrics.roc_auc_score(holdout["ac"], single_pred)
score

0.8150540116748162

In [86]:
# trainer = SingleTrainer(pred_col, dry_run=False)
# models, score = trainer.seed_avg_model(merged)

In [87]:
# final_valid = merged.iloc[-2*1000*1000:]
# preds = []
# for model in models:
#     pred_y = models[0].predict(final_valid[pred_col])
#     preds.append(pred_y)
# final_pred = (preds[0] +  preds[1] +  preds[2] +  preds[3])/4
# final_pred.shape
# score = metrics.roc_auc_score(final_valid["ac"], final_pred)
# print(score)

In [88]:
# new valid 3m-2m, 20feats: 

In [89]:
# Persist only the first model of the current `models` list.
models[0].save_model("./ensemble_model_0106.lgb")

<lightgbm.basic.Booster at 0x7f288a9dd390>

In [92]:
merged.iloc[-2*1000*1000:]["row_id"]

3000000    25708400
3000001    25708401
3000002    25708402
3000003    25708403
3000004    25708404
             ...   
4999995    26498141
4999996    26498142
4999997    26498143
4999998    26498144
4999999    32938743
Name: row_id, Length: 2000000, dtype: int64

In [99]:
# Predict the last 2,000,000 rows of `merged` with the first model.
final_pred = models[0].predict(merged.iloc[-2*1000*1000:][pred_col])

In [100]:
final_pred.shape

(2000000,)

In [101]:
# Pair each of the last 2,000,000 row_ids with its prediction.
# NOTE(review): the column is called "pocket_pred" although the values come from
# models[0] (see final_pred) — confirm the name is what downstream readers expect.
final_pred_df = pd.DataFrame({
    "row_id": merged.iloc[-2*1000*1000:]["row_id"],
    "pocket_pred": final_pred
})

In [103]:
final_pred_df.head(2)

Unnamed: 0,row_id,pocket_pred
3000000,25708400,0.726013
3000001,25708401,0.943335


In [104]:
# Write the predictions to CSV; with to_csv defaults the frame's index is written too.
final_pred_df.to_csv("old_valid_pocket_stack_pred.csv")

In [None]:
# before lyaka 8140
# after sakami1 8141
# after lyaka12 8142
# only lyaka12, no sakami1 0.81419: model0101
# only lyaka2 0.81415

# valid 

In [55]:
# base feats
#Early stopping, best iteration is:
#[295]	valid_0's binary_logloss: 0.517508
#AUC= 0.7967711921542647

# with owruby, sakami
# [73]	valid_0's binary_logloss: 0.499981
#AUC= 0.812631130956799

# add pocket1223
#AUC= 0.8130389687940041

#just 3
#AUC= 0.8119216727001559

# just3 + top feats
#0.81279

In [58]:
class SeedLgb:
    """Small wrapper around ``lgb.train`` that fits without any validation set."""

    def __init__(self, seed=99, dry_run=False):
        """Build the parameter dict and choose the boosting-round budget.

        seed: passed to ``get_param`` and used as ``random_state``.
        dry_run: when truthy, train for 100 rounds instead of 1100.
        """
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1100

    def do_train_direct(self, x_train, y_train):
        """Train on ``(x_train, y_train)`` for ``self.num_rounds`` rounds.

        No validation set is passed to ``lgb.train``. Returns whatever
        ``lgb.train`` returns.
        """
        train_set = lgb.Dataset(x_train, y_train)
        fit_options = dict(
            valid_sets=None,
            verbose_eval=100,
            num_boost_round=self.num_rounds,
        )
        return lgb.train(self.train_param, train_set, **fit_options)

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split and gain importance for every feature, largest gain first.

        ``filename`` is accepted but not used by this method.
        """
        names = model.feature_name()
        by_split = model.feature_importance(importance_type="split").astype(int)
        by_gain = model.feature_importance(importance_type="gain").astype(int)
        table = pd.DataFrame({
            "name": names,
            "importance_split": by_split,
            "importance_gain": by_gain,
        }).sort_values(by="importance_gain", ascending=False)
        # Lift the display limits so no row or column is elided in the printout.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(table)

    @staticmethod
    def get_param(seed):
        """Return the LightGBM parameter dict with ``random_state`` set to ``seed``."""
        params = dict(
            num_leaves=127,
            min_data_in_leaf=100,
            objective='binary',
            # alternative metric left by the author: 'auc'
            metric='binary_logloss',
            max_depth=-1,
            learning_rate=0.02,
            boosting="gbdt",
            feature_fraction=0.9,
            verbosity=-1,
            random_state=seed,
        )
        return params
    
class SeedTrainer:
    """Fits several ``SeedLgb`` models on the same data, differing only by seed."""

    def __init__(self, pred_col, dry_run=False):
        """Remember the input columns and the dry-run flag; the target column is ``ac``."""
        self.dry_run = dry_run
        self.target_col = "ac"
        self.pred_col = pred_col

    def train_model(self, df):
        """Train four models on all rows of ``df`` using seeds 0 through 3.

        Feature importance is printed for the first model only. No validation
        is performed, so the second element of the returned tuple is always
        an empty list.

        Returns:
            ``(models, scores)`` where ``scores`` is ``[]``.
        """
        features = df[self.pred_col]
        target = df[self.target_col]

        fitted = []
        for seed in (0, 1, 2, 3):
            print("---------")
            print("fold=", seed)

            learner = SeedLgb(seed=seed, dry_run=self.dry_run)
            booster = learner.do_train_direct(features, target)
            if seed == 0:
                learner.show_feature_importance(booster)
            fitted.append(booster)
        return fitted, []

In [59]:
# Fit the seed-varied models on every row of `merged`; no validation is run, so
# the second return value is an empty list.
trainer = SeedTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(merged)

---------
fold= 0
           name  importance_split  importance_gain
1       sakami2              9218         21663621
2       owruby2             10160         16689718
0   pocket_1226             11127           640644
3        lyaka2              7528           252855
4         uc_td              6046           140199
6         ul_td              9285           131655
7          u_td              7975            96863
5    uc_ac_mean              2511            93665
19        u_cnt              7030            93439
16    q_ac_mean              7188            89085
15  u_elo_theta              7249            81063
8     u_td_p1p2              6702            73352
20     q_ac_cnt              6421            68977
11   up_ac_mean              6620            68750
13    q_et_mean              5564            64248
17    up_ac_cnt              5850            62281
9    ut_ac_mean              5259            61034
12           et              5675            60832
10  ut_ac_mea

In [60]:
# Persist the first four models, one file per position in `models`.
for i in range(4):
    models[i].save_model(f"./ensemble_model_0101_all{i}.lgb")

In [22]:
# AUC of the `ranked_3avg` column against `label`.
# NOTE(review): `merged2` and its `ranked_3avg` column are not created in any
# visible cell — confirm where they come from before re-running.
metrics.roc_auc_score(merged2["label"], merged2["ranked_3avg"])

0.8115313781174149

In [17]:
# Scan exponents for a power-style blend of three prediction columns and print
# the AUC for each.
# NOTE(review): "power_mean" holds a plain sum of powers (no division or root);
# ROC AUC depends only on ranking, so the printed scores are presumably unaffected.
# NOTE(review): `merged2` is not defined in any visible cell.
for power in [1.5, 1.75, 2, 2.25, 2.5, 3, 3.5, 4]:
    merged2["power_mean"] = (merged2["sakami"]**power + merged2["owruby"]**power + merged2["pocket_1223"]**power)
    score = metrics.roc_auc_score(merged2["label"], merged2["power_mean"])
    print(power, score)

1.5 0.8115757793963951
1.75 0.8115617807361065
2 0.8115436387419797
2.25 0.8115220983411143
2.5 0.8114978412488558
3 0.811442840131525
3.5 0.8113818106061845
4 0.8113171371978005
