In [222]:
import pandas as pd
import numpy as np

In [223]:
# Column-name groups for the eight player slots, ordered A1-A4 then B1-B4.
_PLAYER_SLOTS = [team + str(slot) for team in 'AB' for slot in range(1, 5)]
WEAPON_COLUMNS = [player + '-weapon' for player in _PLAYER_SLOTS]
RANK_COLUMNS = [player + '-rank' for player in _PLAYER_SLOTS]
LEVEL_COLUMNS = [player + '-level' for player in _PLAYER_SLOTS]

In [224]:
# Load the train and test splits, using the "id" column as the row index.
train_data, test_data = [
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train_data.csv", "data/test_data.csv")
]

In [225]:
# Preview the loaded training frame.
train_data

Unnamed: 0_level_0,period,game-ver,lobby-mode,lobby,mode,stage,A1-weapon,A1-rank,A1-level,A2-weapon,...,B2-weapon,B2-rank,B2-level,B3-weapon,B3-rank,B3-level,B4-weapon,B4-rank,B4-level,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2019-10-15T20:00:00+00:00,5.0.1,regular,standard,nawabari,sumeshi,sshooter_becchu,,139,soytuber_custom,...,hokusai_becchu,,26.0,herocharger_replica,,68.0,sharp_neo,,31.0,1
2,2019-12-14T04:00:00+00:00,5.0.1,regular,standard,nawabari,arowana,parashelter_sorella,,198,jetsweeper,...,squiclean_b,,118.0,campingshelter,,168.0,sputtery_clear,,151.0,0
3,2019-12-25T14:00:00+00:00,5.0.1,gachi,standard,hoko,ama,nzap89,a-,114,quadhopper_black,...,nzap85,a+,163.0,prime_becchu,a-,160.0,dualsweeper_custom,a,126.0,0
4,2019-11-11T14:00:00+00:00,5.0.1,regular,standard,nawabari,engawa,bamboo14mk1,,336,splatroller_becchu,...,liter4k,,189.0,promodeler_mg,,194.0,hotblaster_custom,,391.0,0
5,2019-12-14T06:00:00+00:00,5.0.1,gachi,standard,hoko,chozame,bold_7,x,299,hissen_hue,...,sputtery_hue,x,45.0,bucketslosher_soda,x,246.0,wakaba,x,160.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66121,2019-10-11T10:00:00+00:00,5.0.1,gachi,standard,area,devon,hokusai_becchu,a,66,soytuber,...,nzap85,a+,272.0,splatcharger,a,38.0,wakaba,a+,283.0,1
66122,2019-12-04T00:00:00+00:00,5.0.1,gachi,standard,asari,otoro,hokusai_becchu,s+,204,prime_becchu,...,prime_becchu,s+,167.0,pablo,s+,188.0,nzap89,s+,170.0,1
66123,2019-10-22T00:00:00+00:00,5.0.1,gachi,standard,hoko,bbass,prime_collabo,a+,116,promodeler_rg,...,bold_neo,a+,70.0,nzap85,a+,191.0,hissen_hue,a+,139.0,1
66124,2019-12-11T00:00:00+00:00,5.0.1,gachi,standard,area,fujitsubo,heroroller_replica,s+,210,sharp_neo,...,heroroller_replica,s+,395.0,rapid_elite_deco,s+,223.0,sharp_neo,s+,194.0,0


## 欠損値補完

In [226]:
def complete(data):
    """Fill missing per-player values in ``data`` in place.

    Weapon columns get the placeholder ``'NULL'``, level columns get ``0``
    and rank columns get ``'n'``. The frame is modified directly and nothing
    is returned.
    """
    fill_plan = (
        (WEAPON_COLUMNS, 'NULL'),
        (LEVEL_COLUMNS, 0),
        (RANK_COLUMNS, 'n'),
    )
    for columns, placeholder in fill_plan:
        for column in columns:
            data[column] = data[column].fillna(placeholder)

In [227]:
# Fill missing weapon/level/rank values in both frames (complete() assigns into its argument).
complete(train_data)
complete(test_data)

## エンコーダー

In [228]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer

In [229]:
# List the distinct values of the "mode" column as a single-column array.
train_data["mode"].unique().reshape(-1, 1)

array([['nawabari'],
       ['hoko'],
       ['yagura'],
       ['area'],
       ['asari']], dtype=object)

### Mode

In [230]:
# TODO: should sparse be set to True?
# Binarizer for the "mode" column, fitted on the modes present in the training data.
mode_encoder = LabelBinarizer()
mode_encoder.fit(train_data["mode"].unique())

LabelBinarizer()

In [231]:
def make_mode(data):
    """Encode mode labels with ``mode_encoder``; one output column per fitted class."""
    indicator = mode_encoder.transform(data)
    return pd.DataFrame(indicator, columns=mode_encoder.classes_)

### Stage

In [232]:
# Binarizer for the "stage" column, fitted on the stages present in the training data.
stage_encoder = LabelBinarizer().fit(train_data["stage"].unique())


def make_stage(data):
    """Encode stage labels with ``stage_encoder``; one output column per fitted class."""
    indicator = stage_encoder.transform(data)
    return pd.DataFrame(indicator, columns=stage_encoder.classes_)

In [233]:
# Sanity-check the stage encoding on the training data.
make_stage(train_data["stage"])

Unnamed: 0,ajifry,ama,anchovy,arowana,battera,bbass,chozame,devon,engawa,fujitsubo,...,kombu,manta,mongara,mozuku,mutsugoro,otoro,shottsuru,sumeshi,tachiuo,zatou
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66120,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
66121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
66122,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66123,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Weapon
全部で140種類？

In [234]:
# One shared binarizer for all eight player weapon columns.
weapon_encoder = LabelBinarizer()
# Stack every weapon column of the training data into one Series, drop missing
# values and keep the distinct weapon names.
all_weapons = pd.concat([train_data[x] for x in WEAPON_COLUMNS]).dropna().unique()
weapon_encoder.fit(all_weapons)

LabelBinarizer()

In [235]:
def make_weapon(data):
    """Encode weapon names with ``weapon_encoder``; one output column per fitted class."""
    indicator = weapon_encoder.transform(data)
    return pd.DataFrame(indicator, columns=weapon_encoder.classes_)

### Rank
手動で決めてしまう

In [236]:
# TODO: the numeric values are assigned by feel for now.
# Ranks listed from lowest to highest; the position in the list is the score
# ('n' is the placeholder used for a missing rank and maps to 0).
_RANK_ORDER = ['n', 'c-', 'c', 'c+', 'b-', 'b', 'b+', 'a-', 'a', 'a+', 's', 's+', 'x']
rank_map = {rank: score for score, rank in enumerate(_RANK_ORDER)}


def encode_rank(rank):
    """Return the ordinal score for ``rank``, or 0 when the rank is not in ``rank_map``."""
    return rank_map.get(rank, 0)

## 標準化

### Level

In [237]:
from sklearn.preprocessing import StandardScaler

In [238]:
# Scaler used to standardise player levels.
level_scaler = StandardScaler()

In [239]:
# Pool the level values of all eight player columns into one (n, 1) array so
# that a single mean/variance is fitted across every player slot.
levels = np.concatenate([train_data[x].values for x in LEVEL_COLUMNS], axis=0).reshape(-1, 1)
level_scaler.fit(levels)

StandardScaler()

In [240]:
# Spot-check: standardised value for a level of 200.
level_scaler.transform([[200]])

array([[0.38124705]])

### Rank

In [241]:
# Scaler used to standardise the numeric rank scores.
rank_scaler = StandardScaler()

In [242]:
# Element-wise version of encode_rank so it can be applied to whole arrays.
rank_mapper = np.vectorize(encode_rank)

In [243]:
# Pool the rank labels of all eight player columns, map them to numeric scores
# and fit one scaler on the resulting (n, 1) array.
rank_scaler = rank_scaler.fit(rank_mapper(np.concatenate([train_data[x].values for x in RANK_COLUMNS], axis=0)).reshape(-1, 1))

In [244]:
# Spot-check of the fitted scaler. NOTE(review): 21 is outside the 0-12 range
# that rank_map produces, so this is only an arbitrary probe value.
rank_scaler.transform([[21]])

array([[2.66136804]])

## データ作成

In [245]:
# nawabariとそれ意外で違う値
def make_weapon_bias(data, player):
    weapon_col = player + '-weapon'
    level_col = player + '-level'
    rank_col = player + '-rank'
    
    weapon_data = make_weapon(data[weapon_col])
    
    # nawabariなら1,それ意外は0
    nawabari_data = np.where(data['mode'] == 'nawabari', 1, 0).reshape(-1, 1)
    level_data = level_scaler.transform(data[level_col].values.reshape(-1, 1)) * nawabari_data
    
    # nawabariなら0,それ意外は1
    nawabari_inv_data = nawabari_data * -1 + 1
    rank_data = rank_scaler.transform(rank_mapper(data['A1-rank']).reshape(-1, 1)) * nawabari_inv_data
    weapon_data = weapon_data.values * (level_data + rank_data)
    return pd.DataFrame(weapon_data, columns=weapon_encoder.classes_)

In [246]:
# Spot-check the weighted weapon matrix for player A1 on the first three training rows.
make_weapon_bias(train_data[0:3], 'A1')

Unnamed: 0,52gal,52gal_becchu,52gal_deco,96gal,96gal_deco,NULL,bamboo14mk1,bamboo14mk2,bamboo14mk3,barrelspinner,...,spygadget_sorella,squiclean_a,squiclean_b,squiclean_g,sshooter,sshooter_becchu,sshooter_collabo,variableroller,variableroller_foil,wakaba
0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.225176,-0.0,-0.0,-0.0,-0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [247]:
# Spot-check: standardised rank scores for player A1 over the training data.
rank_scaler.transform(rank_mapper(train_data['A1-rank']).reshape(-1, 1))

array([[-1.77430725],
       [-1.77430725],
       [-0.29574882],
       ...,
       [ 0.12669645],
       [ 0.54914171],
       [ 0.33791908]])

In [263]:
def make_data(data, with_y=False):
    """Assemble the model feature matrix for ``data``.

    The features are, side by side: the encoded ``mode``, the encoded
    ``stage``, the summed weighted weapon matrices of team A (A1-A4) and the
    summed weighted weapon matrices of team B (B1-B4).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to featurise (training or test data).
    with_y : bool, default False
        When true, also return the ``y`` column of ``data``.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with a fresh positional index.
    y : pandas.Series
        Only when ``with_y`` is true. It keeps the index of ``data``, which
        differs from the positional index of ``X``.
    """
    def team_bias(team):
        # Sum the four players' weighted weapon matrices in slot order 1..4.
        frames = [make_weapon_bias(data, team + str(slot)) for slot in range(1, 5)]
        return sum(frames[1:], frames[0])

    # Encode the frame that was passed in. The previous code always encoded the
    # global train_data here, so test features received training rows and the
    # concat below was misaligned.
    mode_data = make_mode(data['mode'])
    stage_data = make_stage(data['stage'])
    a_data = team_bias('A')
    b_data = team_bias('B')

    # NOTE(review): a_data and b_data are both labelled with
    # weapon_encoder.classes_, so X contains duplicate column names - confirm
    # that downstream consumers accept this.
    X = pd.concat([mode_data, stage_data, a_data, b_data], axis=1)
    if with_y:
        y = data['y']
        return X, y
    return X

In [264]:
# Build the training features together with the target column.
train_X, train_y = make_data(train_data, with_y=True)

In [265]:
# Build the test features (no target is requested).
test_X = make_data(test_data)

In [269]:
# Preview the first rows of the assembled training features.
train_X.head()

Unnamed: 0,area,asari,hoko,nawabari,yagura,ajifry,ama,anchovy,arowana,battera,...,spygadget_sorella,squiclean_a,squiclean_b,squiclean_g,sshooter,sshooter_becchu,sshooter_collabo,variableroller,variableroller_foil,wakaba
0,0,0,0,1,0,0,0,0,0,0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


## 学習

## テスト

In [1]:
from datetime import datetime

In [2]:
# NOTE(review): `xgb` is not imported anywhere in the visible notebook, so this
# cell raises NameError; presumably `import xgboost as xgb` is missing from the
# imports - confirm.
dtest = xgb.DMatrix(test_X)

NameError: name 'xgb' is not defined

In [3]:
# NOTE(review): `model` is not defined anywhere in the visible notebook (the
# training section is empty), so this cell raises NameError on a fresh run.
test_pred_proba = model.predict(dtest)

NameError: name 'model' is not defined

In [30]:
# Threshold the model outputs at 0.5: values above become 1, everything else 0.
test_pred = np.where(test_pred_proba > 0.5, 1, 0)

In [4]:
# Index the submission by the test ids. Predictions follow the row order of
# test_data (loaded with index_col="id"); a default positional index would not
# carry those ids.
submit_df = pd.DataFrame({'y': test_pred}, index=test_data.index)
submit_df.index.name = 'id'
# Put a timestamp in the file name so repeated runs do not overwrite each other.
# The previous code called the non-existent datetime.format() and passed the
# result as to_csv's second positional argument instead of formatting the name.
submit_df.to_csv('submission_{}.csv'.format(datetime.now().strftime("%Y%m%d_%H%M%S")))

NameError: name 'pd' is not defined