In [3]:
from pathlib import Path
from kedro.framework.context import load_context

current_dir = Path.cwd()
proj_path = current_dir.parent
context = load_context(proj_path)

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
df_train = catalog.load("train")
df_test = catalog.load("test")
df_stage = catalog.load("stagedata")
df_weapon = catalog.load("weapon")

target = df_train['y']

players = ["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4"]

2020-09-29 07:34:36,619 - kedro.io.data_catalog - INFO - Loading data from `train` (CSVDataSet)...
2020-09-29 07:34:36,808 - kedro.io.data_catalog - INFO - Loading data from `test` (CSVDataSet)...


  and should_run_async(code)


2020-09-29 07:34:36,897 - kedro.io.data_catalog - INFO - Loading data from `stagedata` (CSVDataSet)...
2020-09-29 07:34:36,899 - kedro.io.data_catalog - INFO - Loading data from `weapon` (CSVDataSet)...


In [6]:
cat_cols = ['mode', 'stage', 'A1-weapon', 'A2-weapon','A3-weapon','A4-weapon',
          'B1-weapon', 'B2-weapon','B3-weapon','B4-weapon']
te_cat_cols = ['A1-mainweapon', 'A1-subweapon', 'A1-special', 'A1-category',
              'A2-mainweapon', 'A2-subweapon', 'A2-special', 'A2-category',
              'A3-mainweapon', 'A3-subweapon', 'A3-special', 'A3-category',
              'A4-mainweapon', 'A4-subweapon', 'A4-special', 'A4-category',
              'B1-mainweapon', 'B1-subweapon', 'B1-special', 'B1-category',
              'B2-mainweapon', 'B2-subweapon', 'B2-special', 'B2-category',
              'B3-mainweapon', 'B3-subweapon', 'B3-special', 'B3-category',
              'B4-mainweapon', 'B4-subweapon', 'B4-special', 'B4-category']

In [7]:
def rank_to_value(x):
    rank = {'c-': 1, 'c': 2, 'c+': 3,
            'b-': 4, 'b': 5, 'b+': 6,
            'a-': 7, 'a': 8, 'a+': 9,
            's': 10, 's+': 11, 'x': 12
            }
    if x in rank:
        x = rank[x]
    else:
        x = 0
    return x

In [8]:
def rank_average(x, team):
    x[f"{team}-rank-ave"] = (x[f"{team}1-rank"] + x[f"{team}2-rank"] + x[f"{team}3-rank"] + x[f"{team}4-rank"]) / 4
    return x[f"{team}-rank-ave"]

In [9]:
def OneHot_encoder(x):
    import category_encoders as ce
    list_cols = ['mode', 'stage']
    ce_ohe = ce.OneHotEncoder(cols=list_cols, handle_unknown='impute')
    return ce_ohe.fit_transform(x)
    

In [10]:
def target_encoder(train_x, train_y, test_x, cat_cols):
    from sklearn.model_selection import KFold
    
    for c in cat_cols:
        data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        
        test_x[c] = test_x[c].map(target_mean)
        
        tmp = np.repeat(np.nan, train_x.shape[0])
        
        kf = KFold(n_splits=4, shuffle=True, random_state=1234)
        
        for idx_1, idx_2 in kf.split(train_x):
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)
        train_x[c] = tmp
            

In [11]:
def level_average(x, team):
    x[f"{team}-level-ave"] = (x[f"{team}1-level"] + x[f"{team}2-level"] + x[f"{team}3-level"] + x[f"{team}4-level"]) / 4
    return x[f"{team}-level-ave"]

In [12]:
def preprocess_weapon(weapon):
    weapon["category"] = weapon["category2"]
    weapon = weapon.drop(["category1", "category2"], axis=1)
    return weapon

In [13]:
def preprocess_stage(stagedata):
    return stagedata

In [14]:
def preprocess_data(train, test, stagedata):
    data = pd.concat([train, test])
    players = ["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4"]
    data = data.join(stagedata.set_index("stage"), on="stage")
    
    for player in players:
        data[f"{player}-rank"] = data[f"{player}-rank"].apply(rank_to_value)
        data[f"{player}-weapon"] = data[f"{player}-weapon"].fillna("Nothing")
        data[f"{player}-level"] = data[f"{player}-level"].fillna(0)
    data["A-rank-ave"] = rank_average(data, "A")
    data["B-rank-ave"] = rank_average(data, "B")
    data["A-level-ave"] = level_average(data, "A")
    data["B-level-ave"] = level_average(data, "B")
    data = OneHot_encoder(data)
    return data

In [15]:
processed_weapon = preprocess_weapon(df_weapon)

In [16]:
processed_stage = preprocess_stage(df_stage)

In [17]:
processed_data = preprocess_data(df_train, df_test, df_stage)

  from collections import Mapping
  elif pd.api.types.is_categorical(cols):


In [18]:
processed_data.columns

  and should_run_async(code)


Index(['id', 'period', 'game-ver', 'lobby-mode', 'lobby', 'mode_1', 'mode_2',
       'mode_3', 'mode_4', 'mode_5', 'stage_1', 'stage_2', 'stage_3',
       'stage_4', 'stage_5', 'stage_6', 'stage_7', 'stage_8', 'stage_9',
       'stage_10', 'stage_11', 'stage_12', 'stage_13', 'stage_14', 'stage_15',
       'stage_16', 'stage_17', 'stage_18', 'stage_19', 'stage_20', 'stage_21',
       'stage_22', 'stage_23', 'A1-weapon', 'A1-rank', 'A1-level', 'A2-weapon',
       'A2-rank', 'A2-level', 'A3-weapon', 'A3-rank', 'A3-level', 'A4-weapon',
       'A4-rank', 'A4-level', 'B1-weapon', 'B1-rank', 'B1-level', 'B2-weapon',
       'B2-rank', 'B2-level', 'B3-weapon', 'B3-rank', 'B3-level', 'B4-weapon',
       'B4-rank', 'B4-level', 'y', 'size', 'A-rank-ave', 'B-rank-ave',
       'A-level-ave', 'B-level-ave'],
      dtype='object')

In [19]:
processed_data.head()

Unnamed: 0,id,period,game-ver,lobby-mode,lobby,mode_1,mode_2,mode_3,mode_4,mode_5,...,B3-level,B4-weapon,B4-rank,B4-level,y,size,A-rank-ave,B-rank-ave,A-level-ave,B-level-ave
0,1,2019-10-15T20:00:00+00:00,5.0.1,regular,standard,1,0,0,0,0,...,68.0,sharp_neo,0,31.0,1.0,2855.0,0.0,0.0,70.0,38.25
1,2,2019-12-14T04:00:00+00:00,5.0.1,regular,standard,1,0,0,0,0,...,168.0,sputtery_clear,0,151.0,0.0,2391.0,0.0,0.0,149.0,130.0
2,3,2019-12-25T14:00:00+00:00,5.0.1,gachi,standard,0,1,0,0,0,...,160.0,dualsweeper_custom,8,126.0,0.0,2426.0,7.75,7.75,128.5,124.75
3,4,2019-11-11T14:00:00+00:00,5.0.1,regular,standard,1,0,0,0,0,...,194.0,hotblaster_custom,0,391.0,0.0,2237.4,0.0,0.0,174.25,261.75
4,5,2019-12-14T06:00:00+00:00,5.0.1,gachi,standard,0,1,0,0,0,...,246.0,wakaba,12,160.0,1.0,2390.0,12.0,12.0,157.0,138.0


In [20]:
def merge_weapon(data, weapondata):
    weapons = [p + "-weapon" for p in players]
    for weapon in weapons:
        temp_weapon_detail = data[[weapon]].join(weapondata.set_index("key"), on=weapon)
        weapon_detail = [weapon[:3] + col for col in temp_weapon_detail.columns]
        temp_weapon_detail.columns = weapon_detail
        data = pd.concat([data, temp_weapon_detail], axis=1)
        data = data.drop(weapon[:3] + weapon, axis=1)
        data = data.drop(weapon[:3] + 'i', axis=1)
        data = data.drop(weapon, axis=1)
    return data

  and should_run_async(code)


In [21]:
def drop_column(data):
    drop_col = ["id", "period", "game-ver", "lobby-mode", "lobby", ]
    data = data.drop(drop_col, axis=1)
    return data

In [22]:
def process_nan(data):
    data = data.fillna(0.0)
    return data

In [23]:
def create_master_table(data, weapon, stagedata):
    master_table = merge_weapon(data, weapon)
    master_table = drop_column(master_table)
    master_table = process_nan(master_table)
    train = master_table.iloc[:len(target), :]
    test = master_table.iloc[len(target):, :]
    test = test.drop('y', axis=1)
    train_x = train.drop('y', axis=1)
    train_y = train['y']
    target_encoder(train_x, train_y, test, te_cat_cols)
    train_x = process_nan(train_x)
    test = process_nan(test)
    return train_x, train_y, test

In [24]:
train_x, train_y, test = create_master_table(processed_data, processed_weapon, processed_stage)

In [25]:
train_x.isnull().sum().sum()


0

In [24]:
pd.set_option('display.max_rows', 500)
train_x.isna().any()

mode_1           False
mode_2           False
mode_3           False
mode_4           False
mode_5           False
stage_1          False
stage_2          False
stage_3          False
stage_4          False
stage_5          False
stage_6          False
stage_7          False
stage_8          False
stage_9          False
stage_10         False
stage_11         False
stage_12         False
stage_13         False
stage_14         False
stage_15         False
stage_16         False
stage_17         False
stage_18         False
stage_19         False
stage_20         False
stage_21         False
stage_22         False
stage_23         False
A1-rank          False
A1-level         False
A2-rank          False
A2-level         False
A3-rank          False
A3-level         False
A4-rank          False
A4-level         False
B1-rank          False
B1-level         False
B2-rank          False
B2-level         False
B3-rank          False
B3-level         False
B4-rank          False
B4-level   

In [25]:
import torch
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, models, transforms
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import time

In [26]:
class CustomLinear(nn.Module):
    def __init__(self, in_features, out_features, bias=True, p=0.5):
        super().__init__()
        self.fc1 = nn.Linear(in_features, out_features, bias)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p)
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.drop(x)
        return x

In [27]:
net = nn.Sequential(CustomLinear(len(train_x.columns), len(train_x.columns)),
                   nn.Linear(len(train_x.columns), 1))

In [61]:
class Net(nn.Module):
    def __init__(self, in_features, out_features):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(in_features, in_features)
        self.fc2 = nn.Linear(in_features, out_features)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [62]:
net = Net(len(train_x.columns), 1)

In [28]:
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=1234).split(train_x, train_y))

In [29]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

  and should_run_async(code)


In [30]:
def seed_everything(seed=1234):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [34]:
batch_size = 32
max_epoch = 6

In [35]:
train_preds = np.zeros((len(train_x)))
test_preds = np.zeros((len(test)))

seed_everything()

test_cuda = torch.tensor(test.values, dtype=torch.float32).cuda()
test_dataset = torch.utils.data.TensorDataset(test_cuda)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [36]:
for i, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(train_x.values[train_idx], dtype=torch.float32).cuda()
    y_train_fold = torch.tensor(train_y.values[train_idx, np.newaxis], dtype=torch.float32).cuda()
    x_val_fold = torch.tensor(train_x.values[valid_idx], dtype=torch.float32).cuda()
    y_val_fold = torch.tensor(train_y.values[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
    model = net
    model.cuda()
    
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")
    optimizer = torch.optim.Adam(model.parameters())
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print(f'Fold {i + 1}')
    
    for epoch in range(max_epoch):
        model.train()
        avg_loss = 0.
        for x_batch, y_batch in train_loader:
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        model.eval()
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros(len(test))
        avg_val_loss = 0.
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch).detach()
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} '.format(
            epoch + 1, max_epoch, avg_loss, avg_val_loss))
        
    for i, (x_batch,) in enumerate(test_loader):
        y_pred = model(x_batch).detach()
        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)

Fold 1
Epoch 1/6 	 loss=22.1312 	 val_loss=22.1031 
Epoch 2/6 	 loss=22.1302 	 val_loss=22.1053 
Epoch 3/6 	 loss=22.1313 	 val_loss=22.1032 
Epoch 4/6 	 loss=22.1308 	 val_loss=22.1040 
Epoch 5/6 	 loss=22.1310 	 val_loss=22.1032 
Epoch 6/6 	 loss=22.1309 	 val_loss=22.1040 
Fold 2
Epoch 1/6 	 loss=22.1306 	 val_loss=22.1034 
Epoch 2/6 	 loss=22.1309 	 val_loss=22.1033 
Epoch 3/6 	 loss=22.1308 	 val_loss=22.1038 
Epoch 4/6 	 loss=22.1313 	 val_loss=22.1034 
Epoch 5/6 	 loss=22.1309 	 val_loss=22.1033 
Epoch 6/6 	 loss=22.1312 	 val_loss=22.1035 
Fold 3
Epoch 1/6 	 loss=22.1310 	 val_loss=22.1033 
Epoch 2/6 	 loss=22.1306 	 val_loss=22.1038 
Epoch 3/6 	 loss=22.1309 	 val_loss=22.1032 
Epoch 4/6 	 loss=22.1312 	 val_loss=22.1033 
Epoch 5/6 	 loss=22.1304 	 val_loss=22.1036 
Epoch 6/6 	 loss=22.1308 	 val_loss=22.1039 
Fold 4
Epoch 1/6 	 loss=22.1311 	 val_loss=22.1033 
Epoch 2/6 	 loss=22.1309 	 val_loss=22.1036 
Epoch 3/6 	 loss=22.1310 	 val_loss=22.1032 
Epoch 4/6 	 loss=22.1308 	 

In [37]:
from sklearn.metrics import accuracy_score

def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = accuracy_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {"threshold": best_threshold, "accuracy_score": best_score}
    return  search_result

In [38]:
search_result = threshold_search(train_y, train_preds)
search_result

{'threshold': 0.0, 'accuracy_score': 0.524703213610586}

In [41]:
y_test = test_preds > search_result['threshold']
search_result['threshold']

0.0

In [29]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [27]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_iterations': 1000,
    'early_stopping_rounds': 100,
}

In [30]:
FOLD_NUM = 10
kf = KFold(n_splits=FOLD_NUM, random_state=1234)

scores = []

pred_cv = np.zeros(len(test.index))

num_round = 1000

for i, (tdx, vdx) in enumerate(kf.split(train_x, train_y)):
    print(f'Fold : {i}')
    # 訓練用データと検証用データに分割
    X_train, X_valid, y_train, y_valid = train_x.iloc[tdx], train_x.iloc[vdx], train_y.values[tdx], train_y.values[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    
    # 学習の実行
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=100)

    # 検証データに対する予測値を求めて、勝敗（０　or　１）に変換
    va_pred = np.round(model.predict(X_valid,num_iteration=model.best_iteration))
    
    # accuracyスコアを計算
    score_ = accuracy_score(y_valid, va_pred)
    
    # フォールド毎の検証時のスコアを格納
    scores.append(score_)
    
    #テストデータに対する予測値を求める
    submission = model.predict(test,num_iteration=model.best_iteration)
    
    #テストデータに対する予測値をフォールド数で割って蓄積
    #(フォールド毎の予測値の平均値を求めることと同じ)
    pred_cv += submission/FOLD_NUM

# 最終的なテストデータに対する予測値を勝敗（０　or　１）に変換
pred_cv = np.round(pred_cv)

# 最終的なaccuracyスコアを平均値で出力
print('')
print('################################')
print('CV_score:'+ str(np.mean(scores)))

Fold : 0
[LightGBM] [Info] Number of positive: 31248, number of negative: 28264
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6259
[LightGBM] [Info] Number of data points in the train set: 59512, number of used features: 105
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.525071 -> initscore=0.100366
[LightGBM] [Info] Start training from score 0.100366
Training until validation scores don't improve for 100 rounds
[100]	train's binary_logloss: 0.634707	valid's binary_logloss: 0.683444
Early stopping, best iteration is:
[43]	train's binary_logloss: 0.660821	valid's binary_logloss: 0.683108
Fold : 1
[LightGBM] [Info] Number of positive: 31254, number of negative: 28258
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6259
[LightGBM] [Info] Number of data points in the train set: 59512, number of used features: 105
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.525171 -> initscore=0.100771
[LightGBM] [Info]

In [31]:
# 提出用ファイルを作成する
pd.DataFrame({"id": range(len(pred_cv)), "y": pred_cv }).to_csv("../data/07_model_output/submission.csv", index=False)