In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column names for each player-slot prefix (A1-A4, then B1-B4), one list per attribute.
WEAPON_COLUMNS = [f'{side}{slot}-weapon' for side in 'AB' for slot in range(1, 5)]
RANK_COLUMNS = [f'{side}{slot}-rank' for side in 'AB' for slot in range(1, 5)]
LEVEL_COLUMNS = [f'{side}{slot}-level' for side in 'AB' for slot in range(1, 5)]

In [3]:
# Load the train and test CSVs, using each file's "id" column as the row index.
train_data = pd.read_csv("data/train_data.csv", index_col="id")
test_data = pd.read_csv("data/test_data.csv", index_col="id")

In [4]:
train_data

Unnamed: 0_level_0,period,game-ver,lobby-mode,lobby,mode,stage,A1-weapon,A1-rank,A1-level,A2-weapon,...,B2-weapon,B2-rank,B2-level,B3-weapon,B3-rank,B3-level,B4-weapon,B4-rank,B4-level,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2019-10-15T20:00:00+00:00,5.0.1,regular,standard,nawabari,sumeshi,sshooter_becchu,,139,soytuber_custom,...,hokusai_becchu,,26.0,herocharger_replica,,68.0,sharp_neo,,31.0,1
2,2019-12-14T04:00:00+00:00,5.0.1,regular,standard,nawabari,arowana,parashelter_sorella,,198,jetsweeper,...,squiclean_b,,118.0,campingshelter,,168.0,sputtery_clear,,151.0,0
3,2019-12-25T14:00:00+00:00,5.0.1,gachi,standard,hoko,ama,nzap89,a-,114,quadhopper_black,...,nzap85,a+,163.0,prime_becchu,a-,160.0,dualsweeper_custom,a,126.0,0
4,2019-11-11T14:00:00+00:00,5.0.1,regular,standard,nawabari,engawa,bamboo14mk1,,336,splatroller_becchu,...,liter4k,,189.0,promodeler_mg,,194.0,hotblaster_custom,,391.0,0
5,2019-12-14T06:00:00+00:00,5.0.1,gachi,standard,hoko,chozame,bold_7,x,299,hissen_hue,...,sputtery_hue,x,45.0,bucketslosher_soda,x,246.0,wakaba,x,160.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66121,2019-10-11T10:00:00+00:00,5.0.1,gachi,standard,area,devon,hokusai_becchu,a,66,soytuber,...,nzap85,a+,272.0,splatcharger,a,38.0,wakaba,a+,283.0,1
66122,2019-12-04T00:00:00+00:00,5.0.1,gachi,standard,asari,otoro,hokusai_becchu,s+,204,prime_becchu,...,prime_becchu,s+,167.0,pablo,s+,188.0,nzap89,s+,170.0,1
66123,2019-10-22T00:00:00+00:00,5.0.1,gachi,standard,hoko,bbass,prime_collabo,a+,116,promodeler_rg,...,bold_neo,a+,70.0,nzap85,a+,191.0,hissen_hue,a+,139.0,1
66124,2019-12-11T00:00:00+00:00,5.0.1,gachi,standard,area,fujitsubo,heroroller_replica,s+,210,sharp_neo,...,heroroller_replica,s+,395.0,rapid_elite_deco,s+,223.0,sharp_neo,s+,194.0,0


## 欠損値補完

In [5]:
# Preview only: the filled Series is displayed but not assigned, so train_data is unchanged.
train_data['A1-level'].fillna(-1)

id
1        139
2        198
3        114
4        336
5        299
        ... 
66121     66
66122    204
66123    116
66124    210
66125    132
Name: A1-level, Length: 66125, dtype: int64

In [6]:
def complete(data):
    """Fill missing weapon, level and rank values of ``data`` in place.

    Weapons become the string 'NULL', levels become -1 and ranks become 'n'.
    The frame is modified directly; nothing is returned.
    """
    placeholders = (
        (WEAPON_COLUMNS, 'NULL'),
        (LEVEL_COLUMNS, -1),
        (RANK_COLUMNS, 'n'),
    )
    for columns, placeholder in placeholders:
        for column in columns:
            data[column] = data[column].fillna(placeholder)

In [7]:
# Fill missing values in both frames; complete() assigns the filled columns back onto its argument.
complete(train_data)
complete(test_data)

## エンコーダー構築

### Mode

In [8]:
# Integer-encode game modes; the encoder is fitted on the values seen in the training data only.
mode_encoder = LabelEncoder()
mode_encoder.fit(list(train_data["mode"].unique()))

LabelEncoder()

### Stage

In [9]:
# Integer-encode stages; the encoder is fitted on the values seen in the training data only.
stage_encoder = LabelEncoder()
stage_encoder.fit(list(train_data["stage"].unique()))
# Spot check: show the integer code assigned to one stage name.
stage_encoder.transform(['sumeshi'])

array([20])

## Weapon
全部で140種類？

In [10]:
weapon_encoder = LabelEncoder()
# Fit one shared encoder on the weapons found in every weapon column of the training data.
# The columns come from WEAPON_COLUMNS so this cannot drift from complete() and make_data().
weapon_encoder.fit(list(pd.concat([train_data[column] for column in WEAPON_COLUMNS]).unique()))

LabelEncoder()

In [11]:
weapon_encoder.classes_

array(['52gal', '52gal_becchu', '52gal_deco', '96gal', '96gal_deco',
       'NULL', 'bamboo14mk1', 'bamboo14mk2', 'bamboo14mk3',
       'barrelspinner', 'barrelspinner_deco', 'barrelspinner_remix',
       'bold', 'bold_7', 'bold_neo', 'bottlegeyser', 'bottlegeyser_foil',
       'bucketslosher', 'bucketslosher_deco', 'bucketslosher_soda',
       'campingshelter', 'campingshelter_camo', 'campingshelter_sorella',
       'carbon', 'carbon_deco', 'clashblaster', 'clashblaster_neo',
       'dualsweeper', 'dualsweeper_custom', 'dynamo', 'dynamo_becchu',
       'dynamo_tesla', 'explosher', 'explosher_custom', 'furo',
       'furo_deco', 'h3reelgun', 'h3reelgun_cherry', 'h3reelgun_d',
       'heroblaster_replica', 'herobrush_replica', 'herocharger_replica',
       'heromaneuver_replica', 'heroroller_replica',
       'heroshelter_replica', 'heroshooter_replica',
       'heroslosher_replica', 'herospinner_replica', 'hissen',
       'hissen_hue', 'hokusai', 'hokusai_becchu', 'hokusai_hue',
       

## Rank
手動で決めてしまう

In [12]:
# TODO: these scores are assigned by feel for now.
rank_map = {'n': 0, 'c-': 1, 'c': 2, 'c+': 3, 'b-': 4, 'b': 5, 'b+': 6, 'a-': 10, 'a': 11, 'a+': 12, 's': 15, 's+': 18, 'x': 21}


def encode_rank(rank):
    """Return the numeric score for a rank label.

    Labels that are not keys of ``rank_map`` score 0, the same value as
    the 'n' placeholder.
    """
    return rank_map.get(rank, 0)


def make_rank_data(data, labels):
    """Return, per row, the sum of the encoded rank scores.

    Args:
        data: DataFrame holding rank-label columns.
        labels: Column names to include; names that are not columns of
            ``data`` are ignored.

    Returns:
        A Series indexed like ``data`` with the summed scores.
    """
    selected = data[data.columns.intersection(labels)]
    # Encode column by column with Series.map: DataFrame.applymap is deprecated
    # in recent pandas, while this form works on old and new versions alike.
    encoded = selected.apply(lambda column: column.map(encode_rank))
    return encoded.sum(axis=1)

## Level

In [13]:
# Build the per-row level total.
def make_level_data(data, labels):
    """Return, per row, the sum of the columns of ``data`` named in ``labels``.

    Names in ``labels`` that are not columns of ``data`` are ignored.
    """
    level_columns = data.columns.intersection(labels)
    return data[level_columns].sum(axis=1)

## データ作成
まず、mode, stage, weapon, rank, level を特徴量として使う

In [14]:
# Columns that make_data() removes without encoding them.
DROP_COLUMNS = ['period', 'game-ver', 'lobby', 'lobby-mode']

In [15]:
def make_data(data, with_y=False):
    """Build the model feature frame from a completed raw frame.

    'mode' and 'stage' are replaced by their label-encoded integers. Every
    weapon, level and rank column gets a '<name>-enc' counterpart (weapons
    label-encoded, levels copied as-is, ranks scored via ``encode_rank``),
    after which the raw per-player columns and ``DROP_COLUMNS`` are removed.

    Args:
        data: Raw DataFrame; it is not modified.
        with_y: When True, also split off the 'y' column.

    Returns:
        The feature frame, or a ``(features, y)`` tuple when ``with_y`` is True.
    """
    features = data.copy()
    features['mode'] = mode_encoder.transform(data['mode'])
    features['stage'] = stage_encoder.transform(data['stage'])

    for source in WEAPON_COLUMNS:
        features[source + '-enc'] = weapon_encoder.transform(data[source])
    for source in LEVEL_COLUMNS:
        features[source + '-enc'] = data[source]
    for source in RANK_COLUMNS:
        features[source + '-enc'] = data[source].apply(encode_rank)

    # Remove the raw inputs in a single call now that the encoded copies exist.
    obsolete = DROP_COLUMNS + WEAPON_COLUMNS + LEVEL_COLUMNS + RANK_COLUMNS
    features = features.drop(columns=obsolete)

    if not with_y:
        return features
    return features.drop(columns='y'), data['y']

In [16]:
train_X, train_y = make_data(train_data, with_y=True)

In [17]:
test_X = make_data(test_data)

In [18]:
train_X.head()

Unnamed: 0_level_0,mode,stage,A1-weapon-enc,A2-weapon-enc,A3-weapon-enc,A4-weapon-enc,B1-weapon-enc,B2-weapon-enc,B3-weapon-enc,B4-weapon-enc,...,B3-level-enc,B4-level-enc,A1-rank-enc,A2-rank-enc,A3-rank-enc,A4-rank-enc,B1-rank-enc,B2-rank-enc,B3-rank-enc,B4-rank-enc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,20,135,112,89,50,13,51,41,110,...,68.0,31.0,0,0,0,0,0,0,0,0
2,3,3,92,57,21,84,77,132,20,126,...,168.0,151.0,0,0,0,0,0,0,0,0
3,2,1,85,99,94,57,13,84,94,28,...,160.0,126.0,10,11,11,11,10,12,10,11
4,3,8,6,117,31,94,114,67,96,54,...,194.0,391.0,0,0,0,0,0,0,0,0
5,2,6,13,49,38,116,34,127,19,139,...,246.0,160.0,21,21,21,21,21,21,21,21


In [19]:
make_rank_data(train_data[0:1], ['A1-rank', 'A2-rank', 'A3-rank', 'A4-rank'])

id
1    0
dtype: int64

## 学習

In [20]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [21]:
dtrain = xgb.DMatrix(train_X, label=train_y)

In [22]:
# Booster parameters and the number of boosting rounds passed to xgb.train below.
params = {'objective': 'binary:logistic', 'random_state': 71}
num_round = 1000

In [23]:
model = xgb.train(params, dtrain, num_round)

In [24]:
train_pred_proba = model.predict(dtrain)

In [25]:
train_pred = np.where(train_pred_proba > 0.5, 1, 0)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
# NOTE(review): train_pred comes from predicting on dtrain itself, so this is
# training-set accuracy, not a held-out estimate.
accuracy_score(train_y, train_pred)

0.9957353497164462

## テスト

In [28]:
dtest = xgb.DMatrix(test_X)

In [29]:
test_pred_proba = model.predict(dtest)

In [30]:
test_pred = np.where(test_pred_proba > 0.5, 1, 0)

In [31]:
# Index the predictions by the test set's own ids. Without an explicit index the
# frame gets a 0-based RangeIndex, and the written "id" column would only match
# the test CSV if its ids happened to be 0..n-1.
submit_df = pd.DataFrame({'y': test_pred}, index=test_X.index)
submit_df.index.name = 'id'
submit_df.to_csv('gbdt_submission.csv')