In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

sns.set_style('darkgrid')

In [2]:
settings = yaml.safe_load('''
name: '045_eda'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
''')

  and should_run_async(code)


In [3]:
fix_seed(settings['seed'])

In [4]:
# Input data is read from ../data; artifacts of this run go under ../ckpt/<run name>.
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates a missing ../ckpt directory (plain mkdir() would raise
# FileNotFoundError); exist_ok=True makes the cell safe to re-run.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
df_train = pd.read_csv(datadir / 'id_train.csv')
df_test = pd.read_csv(datadir / 'id_test.csv')
df_submission = pd.read_csv(datadir / 'atmaCup8_sample-submission.csv')
df_train.shape, df_test.shape

((8359, 17), (8360, 12))

In [6]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [9]:
# Tag every row with the split it came from, then stack both splits so they can be
# explored together. assign() returns a new frame, so df_train / df_test are untouched.
train = df_train.assign(is_test=False)
test = df_test.assign(is_test=True)
whole = pd.concat([train, test])

  and should_run_async(code)


# Publisher を見る

In [10]:
# Count of non-null Name values per (Publisher, Platform); absent combinations become 0.
pub_to_platform_count = (
    whole
    .pivot_table(index='Publisher', columns='Platform', values='Name', aggfunc='count')
    .reset_index()
    .fillna(0)
)
pub_to_platform_count.head()

Platform,Publisher,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,...,SAT,SCD,SNES,TG16,WS,Wii,WiiU,X360,XB,XOne
0,10TACLE Studios,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1C Company,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20th Century Fox Video Games,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2D Boy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3DO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Count of non-null Name values per (Publisher, Genre); absent combinations become 0.
pub_to_genre_count = (
    whole
    .pivot_table(index='Publisher', columns='Genre', values='Name', aggfunc='count')
    .reset_index()
    .fillna(0)
)
pub_to_genre_count.head()

  and should_run_async(code)


Genre,Publisher,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,10TACLE Studios,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1C Company,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,20th Century Fox Video Games,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2D Boy,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3DO,17.0,3.0,1.0,0.0,1.0,1.0,0.0,1.0,5.0,0.0,6.0,1.0


In [12]:
# Count of non-null Name values per (Publisher, Year_of_Release); absent combinations become 0.
pub_to_year_count = (
    whole
    .pivot_table(index='Publisher', columns='Year_of_Release', values='Name', aggfunc='count')
    .reset_index()
    .fillna(0)
)
pub_to_year_count.head()

  and should_run_async(code)


Year_of_Release,Publisher,1980.0,1981.0,1982.0,1983.0,1984.0,1985.0,1986.0,1987.0,1988.0,...,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2020.0
0,10TACLE Studios,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1C Company,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20th Century Fox Video Games,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2D Boy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3DO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Count of non-null Name values per (Publisher, Developer); absent combinations become 0.
pub_to_dev_count = (
    whole
    .pivot_table(index='Publisher', columns='Developer', values='Name', aggfunc='count')
    .reset_index()
    .fillna(0)
)
pub_to_dev_count.head()

  and should_run_async(code)


Developer,Publisher,10tacle Studios,"10tacle Studios, Fusionsphere Systems","1C, 1C Company","1C, Ino-Co, 1C Company","1C, Various, 1C Company",1C: Maddox Games,1C:Ino-Co,1st Playable Productions,2015,...,id Software,"id Software, Nerve Software","id Software, Raven Software",n-Space,neo Software,odenis studio,oeFun,puzzle.tv,syn Sophia,zSlide
0,10TACLE Studios,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1C Company,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2D Boy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3DO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,505 Games,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from mykaggle.transform.pivot import PivotTransform
from sklearn.decomposition import PCA
columns = ['Platform', 'Genre', 'Year_of_Release', 'Developer']

  and should_run_async(code)


In [42]:
a = whole['Genre'].unique()
a

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy', nan], dtype=object)

In [16]:
b = np.array(['a', 'b', np.nan])

In [17]:
b[np.where(b.astype(str) != str(np.nan))]

array(['a', 'b'], dtype='<U3')

In [18]:
# Rebuild the Publisher x Genre count table through the project's PivotTransform helper.
# NOTE(review): presumably equivalent to the earlier pivot_table(...).fillna(0) cell for Genre
# — confirm against mykaggle.transform.pivot. This overwrites pub_to_genre_count.
pt = PivotTransform(['Publisher'], column='Genre', target='Name', aggs=['count'], fillna=0)
pub_to_genre_count = pt(whole)

In [19]:
# Project each Publisher's genre-count row onto 3 principal components.
pca = PCA(n_components=3)
genre_counts = pub_to_genre_count.drop(columns='Publisher').to_numpy()
# Append the component scores (columns 0, 1, 2) to the right of the count table.
output = pd.concat([pub_to_genre_count, pd.DataFrame(pca.fit_transform(genre_counts))], axis=1)

  and should_run_async(code)


In [20]:
list(range(-1, -3-1, -1))

  and should_run_async(code)


[-1, -2, -3]

In [21]:
output.iloc[:, [0, -3, -2, -1]]

Unnamed: 0,Publisher,0,1,2
0,10TACLE Studios,-8.851676,1.752332,-0.503820
1,1C Company,-8.634801,1.788274,-0.213881
2,20th Century Fox Video Games,-6.526206,-0.065907,-3.099918
3,2D Boy,-9.063381,1.958172,-0.795066
4,3DO,6.643973,-3.566374,-7.714002
...,...,...,...,...
576,id Software,-8.864584,2.038753,-1.187493
577,imageepoch Inc.,-8.835152,1.649518,-0.092220
578,inXile Entertainment,-8.947292,1.788550,-0.271028
579,"mixi, Inc",-8.530315,1.497282,-1.465199


# Developer を見る

In [22]:
# Number of rows with a missing Developer in (whole, train, test), in that order.
tuple(frame.loc[:, 'Developer'].isna().sum() for frame in (whole, train, test))

  and should_run_async(code)


(6623, 3489, 3134)

In [23]:
# Distinct Developer values seen for each Publisher (NaN stays in the set as a member).
pub_to_dev = whole.groupby('Publisher')['Developer'].apply(set).reset_index()
# Size of each set, i.e. the number of distinct Developer values, NaN included.
pub_to_dev['num_publisher_to_unique_dev'] = pub_to_dev['Developer'].map(len)
pub_to_dev

Unnamed: 0,Publisher,Developer,num_publisher_to_unique_dev
0,10TACLE Studios,"{nan, WideScreen Games}",2
1,1C Company,"{Avalon Style, Katauri Interactive, THQ, 1C, 1...",3
2,20th Century Fox Video Games,{nan},1
3,2D Boy,{2D Boy},1
4,3DO,"{nan, 3DO, Bam Entertainment, Pandemic Studios...",5
...,...,...,...
576,id Software,{id Software},1
577,imageepoch Inc.,{nan},1
578,inXile Entertainment,{InXile Entertainment},1
579,"mixi, Inc",{nan},1


In [24]:
# Developer values per Publisher, one entry per row (duplicates and NaN are kept, so not unique).
whole.groupby('Publisher')['Developer'].apply(list)

  and should_run_async(code)


Publisher
10TACLE Studios                                      [WideScreen Games, nan, nan]
1C Company                      [Katauri Interactive, Avalon Style, THQ, 1C, 1...
20th Century Fox Video Games                            [nan, nan, nan, nan, nan]
2D Boy                                                                   [2D Boy]
3DO                             [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
                                                      ...                        
id Software                                                         [id Software]
imageepoch Inc.                                                        [nan, nan]
inXile Entertainment                                       [InXile Entertainment]
mixi, Inc                                                                   [nan]
responDESIGN                                                           [nan, nan]
Name: Developer, Length: 581, dtype: object

In [25]:
whole['Developer'].unique().shape

(1697,)

In [26]:
# Publisher values per Developer, one entry per row (duplicates are kept).
publishers_by_dev = whole.groupby('Developer')['Publisher']
dev_to_pub = publishers_by_dev.apply(list).reset_index()
dev_to_pub

Unnamed: 0,Developer,Publisher
0,10tacle Studios,[Atari]
1,"10tacle Studios, Fusionsphere Systems","[Koch Media, Deep Silver]"
2,"1C, 1C Company","[Blue Byte, Ubisoft]"
3,"1C, Ino-Co, 1C Company","[Paradox Interactive, Unknown]"
4,"1C, Various, 1C Company",[Ascaron Entertainment GmbH]
...,...,...
1691,odenis studio,[Ghostlight]
1692,oeFun,[O3 Entertainment]
1693,puzzle.tv,[Telegames]
1694,syn Sophia,"[Nintendo, Nintendo]"


In [27]:
# Distinct Publisher values seen for each Developer.
dev_to_pub = whole.groupby('Developer')['Publisher'].apply(set).reset_index()
# Size of each set: how many different Publisher values a Developer appears with.
dev_to_pub['num_publisher'] = dev_to_pub['Publisher'].map(len)

  and should_run_async(code)


In [28]:
dev_to_pub['num_publisher'].shape, dev_to_pub.loc[dev_to_pub['num_publisher'] > 1, 'num_publisher'].shape

((1696,), (672,))

In [29]:
devs = whole.loc[:, 'Developer'].unique()
devs

  and should_run_async(code)


array(["Traveller's Tales", nan, 'Ryu ga Gotoku Studios', ...,
       'EA Sports, EA Vancouver', 'EA Canada, EA Vancouver',
       'The Digital Lounge, Dino Dini'], dtype=object)

In [30]:
from typing import NamedTuple

class A(NamedTuple):
    """Scratch NamedTuple with three int fields, used to check how json.dumps serializes it."""
    a: int = 0  # defaults to 0
    b: int = 3  # defaults to 3
    c: int = 2  # defaults to 2
    
a = A(1, 3, 2)

In [31]:
import json
json.dumps({'condition': a})

'{"condition": [1, 3, 2]}'

In [32]:
isinstance(a, list)

False

# Publisher -> Genre

In [33]:
train.loc[:, 'Genre'].unique()

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [34]:
# Distinct Genre values seen for each Publisher.
# NOTE: the name pub_to_dev is reused here although the frame now holds genres, not developers.
pub_to_dev = whole.groupby('Publisher')['Genre'].apply(set).reset_index()
# Size of each set, i.e. the number of distinct Genre values per Publisher.
pub_to_dev['num_publisher_to_unique_genre'] = pub_to_dev['Genre'].map(len)
pub_to_dev

Unnamed: 0,Publisher,Genre,num_publisher_to_unique_genre
0,10TACLE Studios,"{Adventure, Strategy, Puzzle}",3
1,1C Company,"{Role-Playing, Racing, Strategy}",3
2,20th Century Fox Video Games,"{Action, Shooter}",2
3,2D Boy,{Puzzle},1
4,3DO,"{Strategy, Shooter, Sports, Platform, Adventur...",9
...,...,...,...
576,id Software,{Shooter},1
577,imageepoch Inc.,"{Role-Playing, Adventure}",2
578,inXile Entertainment,{Role-Playing},1
579,"mixi, Inc",{Action},1


In [35]:
# Genre values per Publisher, one entry per row (duplicates kept); still stored as pub_to_dev.
genres_by_pub = whole.groupby('Publisher')['Genre']
pub_to_dev = genres_by_pub.apply(list).reset_index()
pub_to_dev['Genre']

  and should_run_async(code)


0                          [Adventure, Puzzle, Strategy]
1                       [Role-Playing, Racing, Strategy]
2              [Action, Action, Shooter, Action, Action]
3                                               [Puzzle]
4      [Action, Action, Action, Action, Action, Actio...
                             ...                        
576                                            [Shooter]
577                            [Adventure, Role-Playing]
578                                       [Role-Playing]
579                                             [Action]
580                                     [Sports, Sports]
Name: Genre, Length: 581, dtype: object

In [36]:
from collections import Counter
def counter(x):
    """Return the most frequent element of ``x``.

    Args:
        x: Iterable of hashable values (e.g. the list of genres of one publisher).

    Returns:
        The element with the highest count; among ties, the one encountered first.
        ``None`` when ``x`` is empty (the previous version raised ``IndexError``).
    """
    # most_common(1) avoids sorting every distinct value just to read the top one.
    top = Counter(x).most_common(1)
    return top[0][0] if top else None

In [37]:
pub_to_dev['Genre'].apply(counter)

0         Adventure
1      Role-Playing
2            Action
3            Puzzle
4            Action
           ...     
576         Shooter
577       Adventure
578    Role-Playing
579          Action
580          Sports
Name: Genre, Length: 581, dtype: object

# Publisher x Year_of_Release

In [38]:
# Earliest and latest Year_of_Release per Publisher, plus the span between the two.
pub_to_yor = (
    whole.groupby('Publisher')['Year_of_Release']
    .agg(['min', 'max'])
    .reset_index()
    .assign(publisher_years=lambda yor: yor['max'] - yor['min'])
)
pub_to_yor

Unnamed: 0,Publisher,min,max,publisher_years
0,10TACLE Studios,2006.0,2007.0,1.0
1,1C Company,2009.0,2011.0,2.0
2,20th Century Fox Video Games,1981.0,1982.0,1.0
3,2D Boy,2008.0,2008.0,0.0
4,3DO,1998.0,2003.0,5.0
...,...,...,...,...
576,id Software,1992.0,1992.0,0.0
577,imageepoch Inc.,2014.0,2014.0,0.0
578,inXile Entertainment,2015.0,2015.0,0.0
579,"mixi, Inc",2015.0,2015.0,0.0


In [39]:
pub_to_yor.iloc[:, 2] 

  and should_run_async(code)


0      2007.0
1      2011.0
2      1982.0
3      2008.0
4      2003.0
        ...  
576    1992.0
577    2014.0
578    2015.0
579    2015.0
580    2005.0
Name: max, Length: 581, dtype: float64

# Name を見る

In [40]:
names = whole.loc[:, 'Name'].values
names

array(['LEGO Batman: The Videogame',
       'LEGO Indiana Jones: The Original Adventures',
       'LEGO Batman: The Videogame', ..., 'Battle Worlds: Kronos',
       'Codename: Panzers Complete Collection', 'Imagine: Makeup Artist'],
      dtype=object)

In [41]:
# Preview only the first entries: printing every Name emits one output line per row of
# train + test and buries the rest of the notebook.
NUM_NAMES_TO_SHOW = 50
for name in names[:NUM_NAMES_TO_SHOW]:
    print(name)

LEGO Batman: The Videogame
LEGO Indiana Jones: The Original Adventures
LEGO Batman: The Videogame
Combat
LEGO Harry Potter: Years 5-7
LEGO Harry Potter: Years 5-7
Yakuza 4
LEGO Harry Potter: Years 5-7
The Lord of the Rings: War in the North
The Lord of the Rings: War in the North
The Chronicles of Narnia: The Lion, The Witch and The Wardrobe
LEGO Harry Potter: Years 5-7
The Golden Compass
Circus Atari
Maze Craze: A Game of Cops 'n Robbers
Robert Ludlum's The Bourne Conspiracy
LEGO Harry Potter: Years 5-7
Robert Ludlum's The Bourne Conspiracy
The Golden Compass
Tomb Raider (2013)
Slot Machine
The Chronicles of Narnia: The Lion, The Witch and The Wardrobe
Flag Capture
LEGO Harry Potter: Years 5-7
LEGO Harry Potter: Years 5-7
Happy Feet Two
Happy Feet Two
Happy Feet Two
Zero: Tsukihami no Kamen
Happy Feet Two
Tornado
Alex Rider: Stormbreaker
Captain America: Super Soldier
Captain America: Super Soldier
Adventure
Runaway: A Twist of Fate
Street Fighter IV
Samurai Shodown Anthology
Rock Ban