In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

sns.set_style('darkgrid')

In [2]:
# Experiment configuration, kept inline as YAML and parsed into a plain dict:
# run name, competition id, random seed, training-loop settings and LightGBM parameters.
# The literal below is parsed at runtime, so its text must stay exactly as written.
settings = yaml.safe_load('''
name: '111_eda'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
''')

  and should_run_async(code)


In [3]:
# Pass the configured seed to the project helper.
# NOTE(review): presumably seeds the random/numpy RNGs for reproducibility — confirm in mykaggle.util.routine.
fix_seed(settings['seed'])

In [4]:
# Input data directory and a per-experiment checkpoint directory named after the run.
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates '../ckpt' on a fresh checkout; exist_ok=True removes the
# check-then-create race and keeps the cell safely re-runnable.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Read the three competition tables from the data directory, then show the
# (rows, columns) of the train and test frames.
df_train, df_test, df_submission = (
    pd.read_csv(datadir / filename)
    for filename in ('id_train.csv', 'id_test.csv', 'atmaCup8_sample-submission.csv')
)
df_train.shape, df_test.shape

((8359, 17), (8360, 12))

In [6]:
# Preview the first training rows, transposed so each column is shown as a row.
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
# Preview the first test rows, transposed so each column is shown as a row.
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
# Preview the sample submission file.
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [9]:
# Work on copies of the raw frames, tagging each split with `is_test` so rows stay
# distinguishable once both are stacked into a single frame.
train = df_train.assign(is_test=False)
test = df_test.assign(is_test=True)
whole = pd.concat([train, test])

  and should_run_async(code)


In [10]:
# Companion frames passed as `others` when transforming the training split:
# 'main' is a copy of the raw train frame, 'another' a copy of the raw test frame.
train_others = dict(main=df_train.copy(), another=df_test.copy())
# Companion frames passed as `others` when transforming the test split:
# 'main' is a copy of the raw test frame, 'another' a copy of the raw train frame.
test_others = dict(main=df_test.copy(), another=df_train.copy())

# Platform x Year_of_Release

In [11]:
from mykaggle.feature.year_rank5 import YearRank5
# One extractor instance per split; both calls run with caching switched off.
year_rank_train, year_rank_test = YearRank5(train=True), YearRank5(train=False)
no_cache = {'use_cache': False, 'save_cache': False}
train = year_rank_train(train, others=train_others, **no_cache)
test = year_rank_test(test, others=test_others, **no_cache)
# Rebuild the combined frame from the transformed splits.
whole = pd.concat([train, test])

In [12]:
# Inspect the last rows of the combined frame (these come from the test split, which is concatenated last).
whole.tail()

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,is_test,year_rank_rate,year_rank_plus
8355,SD Gundam G Generation Genesis,PSV,2016.0,Strategy,Namco Bandai Games,,,,,,,,,,,,16714,True,0.983471,2016.983471
8356,SD Gundam G Generation Genesis,PS4,2016.0,Strategy,Namco Bandai Games,,,,,,,,,,,,16715,True,0.987603,2016.987603
8357,Battle Worlds: Kronos,PC,2016.0,Strategy,Nordic Games,,,,,,71.0,15.0,7.1,71.0,KING Art,,16716,True,0.991736,2016.991736
8358,Codename: Panzers Complete Collection,PC,2016.0,Strategy,Nordic Games,,,,,,,,,,,,16717,True,0.995868,2016.995868
8359,Imagine: Makeup Artist,DS,2020.0,Simulation,Ubisoft,,,,,,,,tbd,,Ubisoft,E,16718,True,0.0,2020.0


In [13]:
# Smallest and largest `year_rank_plus` observed for each platform across train + test.
pf_to_yor = (
    whole.groupby('Platform')['year_rank_plus']
    .agg(['min', 'max'])
    .reset_index()
    .rename(columns={'min': 'platform_year_rank_min', 'max': 'platform_year_rank_max'})
)
# Drop the columns left by a previous run first: merging twice would otherwise yield
# suffixed duplicates (`*_x` / `*_y`) and break the lookups below on re-execution.
train = train.drop(columns=['platform_year_rank_min', 'platform_year_rank_max'], errors='ignore')
train = pd.merge(train, pf_to_yor, how='left', on='Platform')
# Distance of each title from its platform's first and last observed release position.
train['diff_now_and_platform_min'] = train['year_rank_plus'] - train['platform_year_rank_min']
train['diff_now_and_platform_max'] = train['platform_year_rank_max'] - train['year_rank_plus']

  and should_run_async(code)


In [14]:
# Check the training frame after the platform min/max columns and their differences were added.
train.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,Developer,Rating,id,is_test,year_rank_rate,year_rank_plus,platform_year_rank_min,platform_year_rank_max,diff_now_and_platform_min,diff_now_and_platform_max
0,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,180,97,0,28,306,...,Traveller's Tales,E10+,0,False,,,2006.0,2016.458678,,
1,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,151,61,0,21,234,...,Traveller's Tales,E10+,1,False,,,2006.0,2016.458678,,
2,LEGO Batman: The Videogame,PSP,,Action,Warner Bros. Interactive Entertainment,56,44,0,27,128,...,Traveller's Tales,E10+,2,False,,,2004.039604,2015.388158,,
3,Combat,2600,,Action,Atari,117,7,0,1,125,...,,,3,False,,,1980.0,1989.5,,
4,LEGO Harry Potter: Years 5-7,Wii,,Action,Warner Bros. Interactive Entertainment,69,42,0,12,124,...,Traveller's Tales,E10+,4,False,,,2006.0,2016.458678,,


# Sales 順で見る

In [15]:
# Twenty most frequent publishers in the training split.
train['Publisher'].value_counts().head(20)

  and should_run_async(code)


THQ                                       715
Nintendo                                  706
Sony Computer Entertainment               687
Sega                                      638
Take-Two Interactive                      422
Capcom                                    386
Atari                                     367
Tecmo Koei                                348
Warner Bros. Interactive Entertainment    235
Square Enix                               234
Disney Interactive Studios                218
Microsoft Game Studios                    191
505 Games                                 191
D3Publisher                               184
Vivendi Games                             164
Idea Factory                              133
Nippon Ichi Software                      106
Majesco Entertainment                      92
LucasArts                                  90
Bethesda Softworks                         76
Name: Publisher, dtype: int64

In [16]:
# Twenty most frequent publishers in the test split, for comparison with the training split.
test['Publisher'].value_counts().head(20)

Electronic Arts                 1356
Activision                       985
Namco Bandai Games               939
Ubisoft                          933
Konami Digital Entertainment     834
Unknown                          201
Midway Games                     198
Eidos Interactive                198
Acclaim Entertainment            186
Codemasters                      150
Deep Silver                      121
Zoo Digital Publishing           104
Rising Star Games                 87
Hudson Soft                       81
5pb                               62
Infogrames                        62
Virgin Interactive                62
Empire Interactive                53
PQube                             45
GT Interactive                    45
Name: Publisher, dtype: int64

In [17]:
# Twenty training titles with the highest Global_Sales, largest first.
train.sort_values('Global_Sales', ascending=False).head(20)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,Developer,Rating,id,is_test,year_rank_rate,year_rank_plus,platform_year_rank_min,platform_year_rank_max,diff_now_and_platform_min,diff_now_and_platform_max
3588,Wii Sports,Wii,2006.0,Sports,Nintendo,4136,2896,377,844,8253,...,Nintendo,E,3588,False,0.864706,2006.864706,2006.0,2016.458678,0.864706,9.593972
170,Super Mario Bros.,NES,1985.0,Platform,Nintendo,2908,358,681,77,4024,...,,,170,False,0.222222,1985.222222,1983.428571,1994.608108,1.793651,9.385886
4725,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,1568,1276,379,329,3552,...,Nintendo,E,4725,False,0.602721,2008.602721,2006.0,2016.458678,2.602721,7.855957
5593,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,1561,1093,328,295,3277,...,Nintendo,E,5593,False,0.827586,2009.827586,2006.0,2016.458678,3.827586,6.631091
594,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,1127,889,1022,100,3137,...,,,594,False,0.5,1996.5,1988.363636,2001.945701,8.136364,5.445701
214,Tetris,GB,1989.0,Puzzle,Nintendo,2320,225,422,57,3026,...,,,214,False,0.285714,1989.285714,1988.363636,2001.945701,0.922078,12.659987
3390,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,1128,914,650,288,2980,...,Nintendo,E,3390,False,0.476471,2006.476471,1985.111111,2020.0,21.365359,13.523529
3331,Wii Play,Wii,2006.0,Misc,Nintendo,1396,918,293,284,2892,...,Nintendo,E,3331,False,0.360784,2006.360784,2006.0,2016.458678,0.360784,10.097893
5352,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,1444,694,470,224,2832,...,Nintendo,E,5352,False,0.481322,2009.481322,2006.0,2016.458678,3.481322,6.977356
164,Duck Hunt,NES,1984.0,Shooter,Nintendo,2693,63,28,47,2831,...,,,164,False,0.555556,1984.555556,1983.428571,1994.608108,1.126984,10.052553


# Name を見る

In [None]:
# Raw titles from both splits as a NumPy array, displayed for a quick look.
names = whole['Name'].to_numpy()
names

In [None]:
def preprocess(x: str) -> str:
    """Normalize a game title before n-gram counting.

    Commas are removed; ':' , '!' and the possessive "'s" are each replaced by a
    space; every run of spaces is then collapsed to a single space and the result
    is lower-cased.

    Args:
        x: Raw title.

    Returns:
        The normalized, lower-cased title.
    """
    x = x.replace(',', '')
    for token in (':', '!', '\'s'):
        x = x.replace(token, ' ')
    # A single replace('  ', ' ') pass leaves a double space behind whenever three or
    # more spaces are adjacent (e.g. "Go!: now"), so repeat until none remain.
    while '  ' in x:
        x = x.replace('  ', ' ')
    return x.lower()

# Normalize every title. Missing names become an empty string first; a bare astype(str)
# would turn them into the literal word 'nan', which would then be counted as a token.
whole['processed_name'] = whole['Name'].fillna('').astype(str).apply(preprocess)

In [None]:
# Distinct normalized titles, shown as a one-column frame.
pd.DataFrame({'processed_name': whole['processed_name'].unique()})

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit three count vectorizers on the distinct normalized titles:
# unigrams (default), bigrams only, and 2- to 4-grams.
unique_names = whole['processed_name'].unique()
vec1 = CountVectorizer()
vec2 = CountVectorizer(ngram_range=(2, 2))
vec3 = CountVectorizer(ngram_range=(2, 4))
res1, res2, res3 = (vec.fit_transform(unique_names) for vec in (vec1, vec2, vec3))

In [None]:
# Total occurrences of each unigram across all distinct titles. Summing the sparse
# matrix directly avoids materializing the dense (titles x vocabulary) array.
word_1gram_cnt = np.asarray(res1.sum(axis=0)).ravel()
word_1gram_cnt.shape

In [None]:
# Unigrams occurring more than 100 times. The vocabulary is built once instead of twice.
# get_feature_names() was removed in scikit-learn 1.2, so its replacement is preferred
# when available; the fallback keeps older installations working.
vocab1 = np.asarray(
    vec1.get_feature_names_out()
    if hasattr(vec1, 'get_feature_names_out')
    else vec1.get_feature_names()
)
frequent_1grams = vocab1[word_1gram_cnt > 100]
frequent_1grams.shape, frequent_1grams

In [None]:
# Total occurrences of each bigram across all distinct titles. Summing the sparse
# matrix directly avoids materializing the dense (titles x vocabulary) array.
word_2gram_cnt = np.asarray(res2.sum(axis=0)).ravel()
word_2gram_cnt.shape

In [None]:
# Histogram of unigram frequencies: each (k, m) pair means m distinct unigrams
# have a total count of exactly k; pairs are ordered by m, largest first.
Counter(word_1gram_cnt).most_common()

In [None]:
# Histogram of bigram frequencies: each (k, m) pair means m distinct bigrams
# have a total count of exactly k; pairs are ordered by m, largest first.
Counter(word_2gram_cnt).most_common()

In [None]:
# Unigrams occurring more than 150 times.
# get_feature_names() was removed in scikit-learn 1.2, so its replacement is preferred
# when available; the fallback keeps older installations working.
vocab1 = np.asarray(
    vec1.get_feature_names_out()
    if hasattr(vec1, 'get_feature_names_out')
    else vec1.get_feature_names()
)
words1 = vocab1[word_1gram_cnt > 150]
len(words1), words1

In [None]:
# Bigrams occurring more than 100 times.
# get_feature_names() was removed in scikit-learn 1.2, so its replacement is preferred
# when available; the fallback keeps older installations working.
vocab2 = np.asarray(
    vec2.get_feature_names_out()
    if hasattr(vec2, 'get_feature_names_out')
    else vec2.get_feature_names()
)
words2 = vocab2[word_2gram_cnt > 100]
len(words2), words2

In [None]:
# Total occurrences of each 2- to 4-gram across all distinct titles. This vocabulary is
# the largest of the three, so the sum is taken on the sparse matrix rather than on a
# dense copy of it.
word_2_4_gram_cnt = np.asarray(res3.sum(axis=0)).ravel()
word_2_4_gram_cnt.shape

In [None]:
# 2- to 4-grams occurring more than five times.
# get_feature_names() was removed in scikit-learn 1.2, so its replacement is preferred
# when available; the fallback keeps older installations working.
vocab3 = np.asarray(
    vec3.get_feature_names_out()
    if hasattr(vec3, 'get_feature_names_out')
    else vec3.get_feature_names()
)
words3 = vocab3[word_2_4_gram_cnt > 5]
len(words3), words3

In [None]:
# Dense count matrix restricted to the 2- to 4-grams that occur more than five times.
# It is kept under its own name so the `test` DataFrame is not overwritten, and the
# columns are selected while the matrix is still sparse so only the reduced matrix is
# densified. The shape check lives in the same cell as the assignment.
name_ngram_features = res3[:, word_2_4_gram_cnt > 5].toarray()
name_ngram_features.shape

In [None]:
# Rows whose normalized title contains the substring 'dragon'.
whole.loc[whole['processed_name'].str.contains('dragon')]

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Project the frequent 2- to 4-gram counts onto four principal components.
# Columns are selected while the matrix is still sparse so only the reduced matrix is
# densified; random_state pins any randomized solver PCA may choose, so re-running the
# cell reproduces the same components.
pca = PCA(n_components=4, random_state=settings['seed'])
pca_feature = pca.fit_transform(res3[:, word_2_4_gram_cnt > 5].toarray())

In [None]:
# Confirm the projected matrix has one row per distinct title and four components.
pca_feature.shape

In [None]:
# Two-component t-SNE estimator; in this cell it is only constructed and displayed,
# never fitted or assigned.
TSNE(n_components=2)

In [18]:
from mykaggle.feature.name_tfidf import NameTfidf
# Instantiate the project's NameTfidf feature extractor with its default arguments.
# NOTE(review): presumably TF-IDF over titles followed by PCA, judging by the output column names — confirm in mykaggle.feature.name_tfidf.
name_tfidf = NameTfidf()

  and should_run_async(code)


In [20]:
# Apply the extractor to the training frame and display the result. The return value
# is not assigned, so `train` keeps its current columns unless the extractor mutates
# its input — not visible from here.
name_tfidf(train, train_others)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,year_rank_rate,year_rank_plus,platform_year_rank_min,platform_year_rank_max,diff_now_and_platform_min,diff_now_and_platform_max,pca_0_name_tfidf_gt_count_5_3,pca_1_name_tfidf_gt_count_5_3,pca_2_name_tfidf_gt_count_5_3,pca_3_name_tfidf_gt_count_5_3
0,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,180,97,0,28,306,...,,,2006.000000,2016.458678,,,0.145092,0.008547,0.006841,0.133784
1,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,151,61,0,21,234,...,,,2006.000000,2016.458678,,,0.108966,0.004712,0.002260,0.103707
2,LEGO Batman: The Videogame,PSP,,Action,Warner Bros. Interactive Entertainment,56,44,0,27,128,...,,,2004.039604,2015.388158,,,0.145092,0.008547,0.006841,0.133784
3,Combat,2600,,Action,Atari,117,7,0,1,125,...,,,1980.000000,1989.500000,,,-0.032967,-0.010672,-0.010109,-0.005047
4,LEGO Harry Potter: Years 5-7,Wii,,Action,Warner Bros. Interactive Entertainment,69,42,0,12,124,...,,,2006.000000,2016.458678,,,-0.019856,-0.008365,-0.005516,0.013714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8354,Stellaris,PC,2016.0,Strategy,Paradox Interactive,0,4,0,0,4,...,0.992308,2016.992308,1985.800000,2016.996154,31.192308,0.003846,-0.031381,-0.008407,-0.008226,0.000110
8355,Total War Attila: Tyrants & Kings,PC,2016.0,Strategy,Koch Media,0,1,0,0,1,...,0.996154,2016.996154,1985.800000,2016.996154,31.196154,0.000000,-0.014175,-0.008996,-0.009340,-0.053256
8356,Brothers Conflict: Precious Baby,PSV,2017.0,Action,Idea Factory,0,0,1,0,1,...,0.000000,2017.000000,2011.084685,2017.666667,5.915315,0.666667,-0.032367,-0.009348,-0.009730,0.002313
8357,Phantasy Star Online 2 Episode 4: Deluxe Package,PS4,2017.0,Role-Playing,Sega,0,0,4,0,4,...,0.333333,2017.333333,2013.007547,2017.333333,4.325786,0.000000,-0.031939,-0.010412,0.004575,0.016523
