In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

# Apply seaborn's 'darkgrid' plot style globally for this notebook session.
sns.set_style('darkgrid')

In [2]:
# Experiment configuration, parsed from an inline YAML document into a nested dict.
# Top-level keys: name, competition, seed, training, lgbm_params, feature.
# The cells visible in this notebook only read `name` (checkpoint directory),
# `seed` (fix_seed) and `feature` (bag-of-words extractor arguments).
settings = yaml.safe_load('''
name: '212_eda'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
feature:
    name_bow_pca_dim: 10
    name_bow_word_th1: 5
    name_bow_word_th2: 3
    name_bow_th1_upper: 130
    name_bow_th2_upper: 1000
''')

  and should_run_async(code)


In [3]:
# Session setup: seed the run from the experiment settings, then let pandas
# render up to 500 rows before truncating a displayed frame.
fix_seed(settings['seed'])
pd.options.display.max_rows = 500

In [4]:
# Input data directory and the per-experiment checkpoint directory
# (named after settings['name']).
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates '../ckpt' when it is missing; exist_ok=True makes
# the cell safe to re-run and avoids the exists()/mkdir() race.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Read the train / test tables and the sample submission from datadir, then
# show the train and test shapes as the cell output.
df_train, df_test, df_submission = (
    pd.read_csv(datadir / csv_name)
    for csv_name in ('id_train.csv', 'id_test.csv', 'atmaCup8_sample-submission.csv')
)
(df_train.shape, df_test.shape)

((8358, 17), (8360, 12))

In [6]:
# First five training rows, transposed so each column reads as one line.
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
# First five test rows, transposed so each column reads as one line.
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
# First five rows of the sample submission file.
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [9]:
# Per-region boolean flags on df_train: True when the title has any sales
# (value > 0) in that region.
df_train['has_na_sales'] = df_train['NA_Sales'].gt(0)
df_train['has_eu_sales'] = df_train['EU_Sales'].gt(0)
df_train['has_jp_sales'] = df_train['JP_Sales'].gt(0)
df_train['has_other_sales'] = df_train['Other_Sales'].gt(0)
# Disabled export of the flagged frame:
# df_train.to_csv(datadir / 'country_train.csv', index=False)

  and should_run_async(code)


In [10]:
# Needs review: fill df_test's Global_Sales with the values saved by an earlier
# experiment (the '187_name_count' checkpoint CSV).
# NOTE(review): the assignment aligns on index labels, so it assumes the
# prediction file has the same row order / default index as df_test — confirm.
df_best_pred = pd.read_csv('../ckpt/187_name_count/187_name_count.csv')
df_test.loc[:, 'Global_Sales'] = df_best_pred.loc[:, 'Global_Sales']

  and should_run_async(code)


In [10]:
# Working copies of the train / test frames tagged with an `is_test` marker,
# plus their row-wise concatenation (train rows first, original labels kept).
train = df_train.assign(is_test=False)
test = df_test.assign(is_test=True)
df_whole = pd.concat([train, test])

  and should_run_async(code)


In [11]:
# Auxiliary frames handed to the feature extractors. 'main' is the split being
# featurised and 'another' is the opposite split; every entry is an
# independent copy.
train_others = dict(main=df_train.copy(), another=df_test.copy())
test_others = dict(main=df_test.copy(), another=df_train.copy())

# Build features from Name

In [13]:
from mykaggle.feature.le import LE
from mykaggle.feature.name_bow2 import NameBOW2
from mykaggle.feature.name_bow3 import NameBOW3
from mykaggle.feature.name_series_count import NameSeriesCount

fhp = settings['feature']

# Hyper-parameters shared by every name bag-of-words extractor, in the
# positional order the constructors receive them after the leading flag.
bow_args = (
    fhp['name_bow_pca_dim'],
    fhp['name_bow_word_th1'],
    fhp['name_bow_word_th2'],
    fhp['name_bow_th1_upper'],
    fhp['name_bow_th2_upper'],
)

# One extractor per split; the leading boolean is True for the train variant
# and False for the test variant (presumably a train-mode flag — verify
# against the NameBOW2 / NameBOW3 signatures).
nbow_train = NameBOW2(True, *bow_args)
nbow_test = NameBOW2(False, *bow_args)
nbow3_train = NameBOW3(True, *bow_args)
nbow3_test = NameBOW3(False, *bow_args)
nsc_train = NameSeriesCount(train=True)
nsc_test = NameSeriesCount(train=False)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [14]:
# Run the NameBOW3 extractors on an id-only frame for each split.
# Missing values are zero-filled for the test features only.
# The NameSeriesCount extractors (nsc_train / nsc_test) are currently not applied:
#   df_f_train = nsc_train(df_f_train, train_others)
#   df_f_test = nsc_test(df_f_test, test_others)
df_f_train = nbow3_train(train[['id']].copy(), train_others)
df_f_test = nbow3_test(test[['id']].copy(), test_others).fillna(0)

  and should_run_async(code)


In [15]:
# Shapes of the train / test feature frames (id column plus generated features).
df_f_train.shape, df_f_test.shape

  and should_run_async(code)


((8359, 11), (8360, 11))

In [16]:
# Feature matrices as NumPy arrays: every column except the first one
# (df_f_train / df_f_test are built starting from the `id` column).
word_train = df_f_train.iloc[:, 1:].to_numpy()
word_test = df_f_test.iloc[:, 1:].to_numpy()
(word_train.shape, word_test.shape)

((8359, 10), (8360, 10))

In [17]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity

In [15]:
# Collect the test rows whose Publisher is 'Unknown'. Missing publishers are
# folded into 'Unknown' first; the two prints show the count before and after.
print(df_test[df_test['Publisher'] == 'Unknown']['Publisher'].shape)
df_test['Publisher'] = df_test['Publisher'].fillna('Unknown')
print(df_test[df_test['Publisher'] == 'Unknown']['Publisher'].shape)
unk_index = df_test[df_test['Publisher'] == 'Unknown'].index
# unk_index holds index *labels*, so select with .loc rather than the
# positional .iloc. reset_index() keeps the original labels in an 'index'
# column so they can be restored later.
df_test_unk = df_test.loc[unk_index, :].reset_index()
# Later cells read word_test_unk, so it must be defined here. word_test is a
# plain array, hence the labels are converted to row positions first.
word_test_unk = word_test[df_test.index.get_indexer(unk_index)]
df_test_unk.shape

(255,)
(255,)


(255, 14)

In [19]:
# Peek at the feature rows selected for the Unknown-publisher test titles.
word_test_unk

array([[-0.03196166, -0.00631504, -0.00934762, ...,  0.00642338,
        -0.00468251, -0.01375352],
       [-0.05378218, -0.02017085, -0.01802754, ...,  0.0566798 ,
        -0.01118895, -0.00037687],
       [-0.01725487, -0.01183572, -0.00577344, ...,  0.01129381,
        -0.00726148, -0.01938195],
       ...,
       [-0.03013965, -0.01146357,  0.01415896, ...,  0.00163465,
        -0.0091265 , -0.01596059],
       [-0.0332226 , -0.00575323, -0.00857906, ...,  0.00293759,
        -0.0076021 , -0.01359298],
       [-0.0331198 , -0.0062172 , -0.00833887, ...,  0.00330704,
        -0.00767522, -0.0148032 ]])

In [20]:
# Pairwise measures between every training feature row and every
# Unknown-publisher test feature row; each result is (n_train, n_unknown_test).
# NOTE: despite its name, cos_dist holds cosine *similarity* (larger = closer),
# whereas eucl_dist and manh_dist are true distances (smaller = closer).
eucl_dist = euclidean_distances(word_train, word_test_unk)
manh_dist = manhattan_distances(word_train, word_test_unk)
cos_dist = cosine_similarity(word_train, word_test_unk)

In [21]:
# For each Unknown-publisher test title, the row position of the training
# title with the smallest Euclidean distance.
np.argmin(eucl_dist, axis=0)

  and should_run_async(code)


array([2143, 1368, 3179, 7233, 5301, 3177, 5096, 4083, 2322, 6175, 5462,
       2897, 5462,  536, 1934, 4467, 4467, 1422,  897, 5891, 6912, 2492,
       2147, 5150, 8237,  990, 1966, 2918,  693, 1738, 2024, 1316, 2020,
       4635, 7882, 3908, 5207, 5008,   46, 5946, 6667, 2555, 5462, 2843,
       3438, 5333,   66, 3549, 6477, 7976, 5462, 7371, 5462, 4113, 5207,
       5207, 5207, 4700,  823, 7882,  803,  702, 4459, 2950, 6711, 5462,
       2589, 5462, 8014, 5462, 7482, 5462, 1924, 6164, 4677, 1328, 1739,
       1739, 5462, 5462,  657, 5207, 5462, 4786, 5607, 1063, 5607, 7712,
       5207, 5592, 3261, 2825, 6391, 3976, 6291, 3012, 4020, 1473, 6743,
       1808, 5669,  997, 5669, 5669, 1031, 2474,  515, 3115, 5462, 5462,
       5462,  515, 5425, 3597, 2293, 5207, 1190, 4939, 2550, 2474, 3302,
       5963, 6091, 7413, 8043,  646,  325, 4031, 7882, 7030, 5389, 4964,
        210, 3424, 5223, 5223, 5223, 5223, 5223, 5223, 5223, 5223, 5223,
       5223, 5223, 5223, 5223,  210, 3383, 5223, 42

In [22]:
# Nearest training title (smallest Euclidean distance) for each
# Unknown-publisher test title, keeping only the columns needed for a
# side-by-side comparison. reset_index() exposes the training row label as an
# 'index' column.
nearest_train_rows = eucl_dist.argmin(axis=0)
df_train_unk = df_train.iloc[nearest_train_rows][['Name', 'Publisher', 'Platform']].reset_index()
df_train_unk.shape

(255, 4)

In [23]:
# Side-by-side view: nearest training title (left) vs. the Unknown-publisher
# test title it was matched to (right), joined on the shared 0..n-1 index.
pd.concat([df_train_unk, df_test_unk[['Name', 'Publisher', 'Platform']]], axis=1)

Unnamed: 0,index,Name,Publisher,Platform,Name.1,Publisher.1,Platform.1
0,2143,Growlanser Generations,Atlus,PS2,Cubix Robots for Everyone: Clash 'n' Bash,Unknown,GBA
1,1368,GT Advance Championship Racing,THQ,GBA,Dragon Ball Z: Budokai Tenkaichi 2 (JP sales),Unknown,Wii
2,3179,Nicktoons: Battle for Volcano Island,THQ,DS,Nicktoons: Battle for Volcano Island,Unknown,GC
3,7233,The Legend of Zelda: A Link Between Worlds,Nintendo,3DS,The Legend of Zelda: The Minish Cap(weekly JP ...,Unknown,GBA
4,5301,Pinball Hall of Fame: The Williams Collection,Crave Entertainment,X360,Twisted Metal: Small Brawl,Unknown,PS
5,3177,Teen Titans,THQ,GC,Teen Titans,Unknown,GBA
6,5096,SpongeBob's Truth or Square,THQ,DS,Jurassic Park: The Game,Unknown,X360
7,4083,Operation Darkness,Success,X360,WCW Backstage Assault,Unknown,N64
8,2322,Famicom Mini: Nazo no Murasame-Jou,Nintendo,GBA,Action Man-Operation Extreme,Unknown,PS
9,6175,Monster Hunter Frontier Online: Season 9.0,Capcom,X360,Prinny: Can I Really Be The Hero? (US sales),Unknown,PSP


In [47]:
# Spot check: training titles whose Name contains 'Nicktoons'.
df_train[df_train['Name'].str.contains('Nicktoons')]

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,Critic_Count,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales
2295,Nicktoons: Freeze Frame Frenzy,GBA,2004.0,Action,THQ,28,11,0,1,40,...,4.0,tbd,,Altron,E,2295,True,True,False,True
2297,Nicktoons: Movin',PS2,2004.0,Action,THQ,16,12,0,4,32,...,,tbd,,Mass Media,E,2297,True,True,False,True
2776,Nicktoons: Unite!,PS2,2005.0,Adventure,THQ,34,1,0,6,41,...,,,,,,2776,True,True,False,True
2778,Nicktoons: Unite!,GC,2005.0,Adventure,THQ,27,7,0,1,35,...,,,,,,2778,True,True,False,True
3179,Nicktoons: Battle for Volcano Island,DS,2006.0,Action,THQ,27,0,0,2,28,...,6.0,tbd,,Natsume,E,3179,True,False,False,True
3265,Nicktoons: Unite!,DS,2006.0,Adventure,THQ,10,0,0,1,11,...,,,,,,3265,True,False,False,True
3934,Nicktoons: Attack of the Toybots,PS2,2007.0,Platform,THQ,24,19,0,6,50,...,,tbd,,Blue Tongue,E,3934,True,True,False,True
3935,Nicktoons: Attack of the Toybots,Wii,2007.0,Platform,THQ,40,0,0,3,43,...,4.0,6,8.0,Blue Tongue Entertainment,E,3935,True,False,False,True
3948,Nicktoons: Attack of the Toybots,DS,2007.0,Platform,THQ,4,0,0,0,5,...,4.0,tbd,,Natsume,E,3948,True,False,False,False
4317,SpongeBob SquarePants featuring Nicktoons: Glo...,PS2,2008.0,Action,THQ,28,22,0,7,57,...,,tbd,,Incinerator Games,E,4317,True,True,False,True


In [17]:
# Restore the original df_test row labels (kept in the 'index' column by the
# earlier reset_index()) as the index of df_test_unk.
# The guard makes the cell idempotent: once 'index' has been consumed, a
# re-run is a no-op instead of a KeyError.
if 'index' in df_test_unk.columns:
    df_test_unk = df_test_unk.set_index('index')

  and should_run_async(code)


In [21]:
# Impute 'Unknown' test publishers from training titles sharing a name prefix.
#
# For each row of df_test_unk the prefix is relaxed step by step until some
# training title starts with it:
#   1. series   - the text before the first ':' (only when the name has a ':')
#   2. trigram  - the first three space-separated words of the current prefix
#   3. bigram   - the first two space-separated words of the current prefix
# When a match exists, the most frequent Publisher among the matched training
# rows is written into df_test_unk in place.
# Counters: `count` is the number of imputed rows; the others record which
# step produced the match.
count = 0
count_series = 0
count_trigram = 0
count_bigram = 0
count_unigram = 0  # unigram fallback is disabled; kept so the summary keeps its shape


def _train_rows_with_prefix(prefix):
    """Return the df_train rows whose Name starts with `prefix` (literal match)."""
    # na=False keeps the mask strictly boolean when a training Name is missing.
    return df_train[df_train['Name'].str.startswith(prefix, na=False)]


for i, name in zip(df_test_unk.index, df_test_unk['Name']):
    if not isinstance(name, str):
        # A missing name has no prefix to match on.
        continue
    df_tmp = pd.DataFrame()
    if ':' in name:
        name = name.split(':')[0]
        df_tmp = _train_rows_with_prefix(name)
        if df_tmp.shape[0] != 0:
            count_series += 1
    if df_tmp.shape[0] == 0 and len(name.split(' ')) >= 3:
        name = ' '.join(name.split(' ')[:3])
        df_tmp = _train_rows_with_prefix(name)
        if df_tmp.shape[0] != 0:
            count_trigram += 1
    if df_tmp.shape[0] == 0 and len(name.split(' ')) >= 2:
        name = ' '.join(name.split(' ')[:2])
        df_tmp = _train_rows_with_prefix(name)
        if df_tmp.shape[0] != 0:
            count_bigram += 1
    # A first-word-only fallback (skipping 'The') was tried and is disabled.

    if df_tmp.shape[0] > 0:
        publisher_counts = df_tmp['Publisher'].value_counts()
        if publisher_counts.empty:
            # Every matched training row has a missing Publisher.
            continue
        count += 1
        # value_counts() sorts by frequency, so the first label is the most
        # common publisher. Reading it from the index does not depend on the
        # column name reset_index() would produce, which changed in pandas 2.0.
        most_publisher = publisher_counts.index[0]
        current_publisher = df_test_unk.loc[i, 'Publisher']
        if current_publisher != 'Unknown':
            # Happens when the cell is re-run on an already imputed frame.
            raise RuntimeError(
                f"Publisher of test row {i!r} is already {current_publisher!r}; "
                "rebuild df_test_unk before re-running this cell"
            )
        df_test_unk.loc[i, 'Publisher'] = most_publisher
print(count, count_series, count_trigram, count_bigram, count_unigram)

90 38 20 32 0


In [22]:
# Show the Unknown-publisher test subset after the prefix-based imputation.
df_test_unk

Unnamed: 0_level_0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5,Cubix Robots for Everyone: Clash 'n' Bash,GBA,,Action,Unknown,,,,,,,8364,5.408323
6,Dragon Ball Z: Budokai Tenkaichi 2 (JP sales),Wii,,Action,Atari,,,,,,,8365,4.898906
8,Nicktoons: Battle for Volcano Island,GC,,Action,THQ,,,7,4.0,Blue Tongue Entertainment,E,8367,5.430704
9,The Legend of Zelda: The Minish Cap(weekly JP ...,GBA,,Action,Nintendo,,,,,,,8368,4.78398
12,Twisted Metal: Small Brawl,PS,,Action,Sony Computer Entertainment,51.0,14.0,7.6,49.0,Incognito Inc.,T,8371,18.809165
13,Teen Titans,GBA,,Action,THQ,61.0,9.0,tbd,,Artificial Mind and Movement,E10+,8372,9.472561
16,Jurassic Park: The Game,X360,,Action,Vivendi Games,60.0,9.0,3.6,138.0,Telltale Games,T,8375,18.779847
17,WCW Backstage Assault,N64,,Action,Unknown,,,,,,,8376,14.427532
19,Action Man-Operation Extreme,PS,,Action,Unknown,,,,,,,8378,4.685558
22,Prinny: Can I Really Be The Hero? (US sales),PSP,,Action,Nippon Ichi Software,,,,,,,8381,6.148178


In [20]:
# Organize the 90 matched rows

In [23]:
# Reload the raw test table, overwrite Publisher on the rows present in
# df_test_unk (matched by index label), and save it as 'id_test_unknown1.csv'.
imputed_publishers = df_test_unk['Publisher']
df_final_test = pd.read_csv(datadir / 'id_test.csv')
df_final_test.loc[imputed_publishers.index, 'Publisher'] = imputed_publishers
df_final_test.to_csv(datadir / 'id_test_unknown1.csv', index=False)

  and should_run_async(code)


In [24]:
# Same write-back for the 'country_prob_test.csv' table: overwrite Publisher
# on the rows present in df_test_unk (matched by index label) and save the
# result as 'country_prob_test_unknown1.csv'. Rebinds df_final_test.
imputed_publishers = df_test_unk['Publisher']
df_final_test = pd.read_csv(datadir / 'country_prob_test.csv')
df_final_test.loc[imputed_publishers.index, 'Publisher'] = imputed_publishers
df_final_test.to_csv(datadir / 'country_prob_test_unknown1.csv', index=False)

In [25]:
# Sanity check on df_final_test: shape of the rows still labelled 'Unknown'
# and shape of the rows whose Publisher is missing.
df_final_test[df_final_test.loc[:, 'Publisher'] == 'Unknown'].shape, df_final_test[df_final_test.loc[:, 'Publisher'].isna()].shape

((165, 16), (0, 16))

In [130]:
# Repeat of the 'Unknown' / missing Publisher shape check on df_final_test
# (the result depends on which table df_final_test currently holds).
df_final_test[df_final_test.loc[:, 'Publisher'] == 'Unknown'].shape, df_final_test[df_final_test.loc[:, 'Publisher'].isna()].shape

((165, 12), (0, 12))

In [None]:
# Re-save 'country_prob_test.csv' as 'country_prob_test_unknown.csv'.
# With the Publisher write-back below commented out, no column is modified
# before saving, so Publisher values are exactly those read from the input file.
df_final_test = pd.read_csv(datadir / 'country_prob_test.csv')
# df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk.loc[:, 'Publisher']
df_final_test.to_csv(datadir / 'country_prob_test_unknown.csv', index=False)

In [None]:
# Build a dict mapping a 0-based row position to the Publisher value found at
# that position in df_pivot_train.
# NOTE(review): df_pivot_train is not defined in any cell visible here, so this
# cell raises NameError on a fresh kernel unless it is created elsewhere — confirm.
publisher_map = df_pivot_train[['Publisher']].copy()
# reset_index() replaces the existing index with 0..n-1; only Publisher is kept.
publisher_map = publisher_map.reset_index()[['Publisher']]
# The name is reused: from here on publisher_map is a dict, not a DataFrame.
publisher_map = {k: v for k, v in zip(publisher_map.index, publisher_map['Publisher'])}