In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

# Apply seaborn's 'darkgrid' style to every plot drawn in this notebook.
sns.set_style('darkgrid')

In [2]:
# Experiment configuration, parsed from an inline YAML document into a dict.
# Of these keys, the cells shown here read settings['seed'] (seeding) and
# settings['name'] (checkpoint directory name).
settings = yaml.safe_load('''
name: '215_eda'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
feature:
    name_bow_pca_dim: 10
    name_bow_word_th1: 5
    name_bow_word_th2: 3
    name_bow_th1_upper: 130
    name_bow_th2_upper: 1000
''')

  and should_run_async(code)


In [3]:
# Pass the configured seed to the project helper (presumably seeds the RNGs in use — confirm).
fix_seed(settings['seed'])
# Allow up to 500 rows before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 500)

In [4]:
# Input data directory and the per-experiment checkpoint directory.
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates '../ckpt/' when it does not exist yet (a plain
# mkdir() raises FileNotFoundError in that case); exist_ok=True makes the call
# safe to re-run without a separate exists() check.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the train set, the test set and the sample submission from datadir.
df_train = pd.read_csv(datadir / 'id_train.csv')
df_test = pd.read_csv(datadir / 'id_test_unknown1.csv')
df_submission = pd.read_csv(datadir / 'atmaCup8_sample-submission.csv')
# Show (rows, columns) of train and test as the cell output.
df_train.shape, df_test.shape

((8359, 17), (8360, 12))

In [6]:
# Preview the first five train rows, transposed so each column becomes a row.
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
# Preview the first five test rows, transposed so each column becomes a row.
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
# Preview the first five rows of the sample submission.
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [9]:
# For each regional sales column, add a boolean column telling whether the
# title recorded any sales (> 0) in that region.
region_flag_columns = {
    'has_na_sales': 'NA_Sales',
    'has_eu_sales': 'EU_Sales',
    'has_jp_sales': 'JP_Sales',
    'has_other_sales': 'Other_Sales',
}
for flag_col, sales_col in region_flag_columns.items():
    df_train[flag_col] = df_train[sales_col] > 0
# df_train.to_csv(datadir / 'country_train.csv', index=False)

  and should_run_async(code)


In [10]:
# Needs review.
# Load the predictions saved by experiment 187_name_count and copy their
# Global_Sales into the test frame.
# NOTE(review): the assignment aligns on the row index, so this assumes the
# prediction file lists rows in the same order as df_test — confirm.
df_best_pred = pd.read_csv('../ckpt/187_name_count/187_name_count.csv')
df_test.loc[:, 'Global_Sales'] = df_best_pred.loc[:, 'Global_Sales']

  and should_run_async(code)


In [11]:
# Build tagged copies of train and test (is_test marks the origin of each row)
# and stack them into one frame; the source frames are left untouched.
train = df_train.assign(is_test=False)
test = df_test.assign(is_test=True)
df_whole = pd.concat([train, test])

In [12]:
# Paired views of the two datasets: 'main' is the frame itself and 'another'
# is the opposite split. Both dicts hold independent copies.
train_others = {
    'main': df_train.copy(),
    'another': df_test.copy()
}
test_others = {
    'main': df_test.copy(),
    'another': df_train.copy(),
}

# Name の一致度から Publisher を探す

In [14]:
# Confirm that no test row has a NaN Publisher any more (expects an empty frame).
df_test[df_test.loc[:, 'Publisher'].isna()]

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales


In [20]:
# Two test rows have a NaN Name; show them, then replace NaN with the literal
# string 'NaN' so the .str operations used later do not hit missing values.
display(df_test[df_test.loc[:, 'Name'].isna()])
df_test['Name'] = df_test['Name'].fillna('NaN')

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
278,,GEN,1993.0,,Acclaim Entertainment,,,,,,,8637,48.062438
279,,GEN,1993.0,,Acclaim Entertainment,,,,,,,8638,23.178798


In [23]:
# 165 test rows still have Publisher == 'Unknown'.
# Take an explicit copy: this frame is written to later (df_test_unk.loc[i,
# 'Publisher'] = ...), and assigning into a bare slice of df_test is the
# chained-assignment pattern pandas warns about.
df_test_unk = df_test[df_test.loc[:, 'Publisher'] == 'Unknown'].copy()
df_test_unk.shape

  and should_run_async(code)


(165, 13)

In [22]:
# Looking at the Madden NFL titles shows their publisher is Electronic Arts.
# Some of these titles also appear more than once within test.
df_test[df_test.loc[:, 'Name'].str.contains('Madden NFL')]

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
145,Madden NFL 2004,PS2,,Sports,Electronic Arts,94.0,29.0,8.5,140.0,EA Tiburon,E,8504,114.953023
148,Madden NFL 07,PSP,,Sports,Unknown,78.0,18.0,6.6,16.0,EA Tiburon,E,8507,39.277323
152,Madden NFL 11,Wii,,Sports,Unknown,75.0,6.0,5.4,7.0,EA Tiburon,E,8511,28.425216
153,Madden NFL 06,X360,,Sports,Electronic Arts,74.0,48.0,4.9,44.0,EA Tiburon,E,8512,92.190305
154,Madden NFL 2002,XB,,Sports,Unknown,90.0,16.0,8.1,8.0,EA Sports,E,8513,25.059178
546,Madden NFL 97,PS,1996.0,Sports,Electronic Arts,,,,,,,8905,80.530725
699,Madden NFL 98,PS,1997.0,Sports,Electronic Arts,,,,,,,9058,80.763161
883,Madden NFL 2000,PS,1998.0,Sports,Electronic Arts,,,,,,,9242,90.062777
884,Madden NFL 99,PS,1998.0,Sports,Electronic Arts,,,,,,,9243,93.108214
892,Madden NFL 99,N64,1998.0,Sports,Electronic Arts,,,,,,,9251,27.854975


In [27]:
# Test rows whose publisher is known: everything except the 'Unknown' rows.
df_test_known = df_test.drop(df_test_unk.index, axis=0)
df_test_known.shape

(8195, 13)

In [34]:
# Infer the publisher of each 'Unknown' test row from test rows whose publisher
# is known, by prefix-matching on Name. Prefixes are tried from most to least
# specific, stopping at the first that matches anything:
#   1. the series name (text before the first ':'),
#   2. the first three space-separated words,
#   3. the first two space-separated words.
# The most frequent publisher among the matches is written to df_test_unk.
# Single-word names are never matched (the unigram fallback is disabled).
# NOTE: startswith() is a plain prefix test, so a short prefix can also pick up
# unrelated titles (the 'Blitz' lookup also returns 'Blitzkrieg 2 Anthology');
# the majority vote below is what absorbs such stray matches.
count = 0
count_series = 0
count_trigram = 0
count_bigram = 0
count_unigram = 0  # never incremented; kept so the summary line keeps five fields

known_names = df_test_known['Name']
for i, name in zip(df_test_unk.index, df_test_unk['Name']):
    df_tmp = pd.DataFrame()
    if ':' in name:
        # `name` is narrowed in place, so the fallbacks below operate on the
        # series prefix rather than on the full title.
        name = name.split(':')[0]
        df_tmp = df_test_known[known_names.str.startswith(name)]
        if not df_tmp.empty:
            count_series += 1
    if df_tmp.empty and len(name.split(' ')) >= 3:
        name = ' '.join(name.split(' ')[:3])
        df_tmp = df_test_known[known_names.str.startswith(name)]
        if not df_tmp.empty:
            count_trigram += 1
    if df_tmp.empty and len(name.split(' ')) >= 2:
        name = ' '.join(name.split(' ')[:2])
        df_tmp = df_test_known[known_names.str.startswith(name)]
        if not df_tmp.empty:
            count_bigram += 1

    if df_tmp.empty:
        continue

    print(name)
    display(df_tmp)
    count += 1
    # value_counts() is sorted by frequency, so the first index label is the
    # most common publisher. Reading the label from the index (instead of
    # reset_index().loc[0, 'index']) works on every pandas version: since
    # pandas 2.0 the reset column is named after the Series, not 'index'.
    publisher_counts = df_tmp['Publisher'].value_counts()
    if publisher_counts.empty:
        # Every candidate has a NaN publisher: nothing to vote on.
        continue
    most_publisher = publisher_counts.index[0]
    if df_test_unk.loc[i, 'Publisher'] != 'Unknown':
        # A bare `raise` here has no active exception and only yields
        # "RuntimeError: No active exception to reraise"; say what went wrong.
        raise RuntimeError(
            f'row {i} already has a publisher: {df_test_unk.loc[i, "Publisher"]!r}'
        )
    df_test_unk.loc[i, 'Publisher'] = most_publisher
print(count, count_series, count_trigram, count_bigram, count_unigram)

WCW Backstage Assault


  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
965,WCW Backstage Assault,PS,1999.0,Action,Electronic Arts,40.0,9.0,,,Kodiak Interactive,T,9324,18.693782


Rock Revolution


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4523,Rock Revolution,X360,2008.0,Misc,Konami Digital Entertainment,38.0,28.0,5.1,17.0,"Zoe Mode, HB Studios Multimedia",T,12882,45.658812
4549,Rock Revolution,DS,2008.0,Misc,Konami Digital Entertainment,,,6.8,8.0,"Zoe Mode, HB Studios Multimedia",E10+,12908,48.470551


Rock Revolution


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4523,Rock Revolution,X360,2008.0,Misc,Konami Digital Entertainment,38.0,28.0,5.1,17.0,"Zoe Mode, HB Studios Multimedia",T,12882,45.658812
4549,Rock Revolution,DS,2008.0,Misc,Konami Digital Entertainment,,,6.8,8.0,"Zoe Mode, HB Studios Multimedia",E10+,12908,48.470551


Build-A-Bear Workshop


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4134,Build-A-Bear Workshop,DS,2007.0,Simulation,Game Factory,61.0,11.0,4.8,5.0,Neko Entertainment,E,12493,14.324265
4804,Build-A-Bear Workshop: A Friend Fur All Seasons,Wii,2008.0,Simulation,Game Factory,,,tbd,,Neko Entertainment,E,13163,11.684287
5958,Build-A-Bear Workshop: Welcome to Hugsville,DS,2010.0,Misc,Game Factory,,,tbd,,The Game Factory,E,14317,18.443461


National Geographic Challenge!


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
6590,National Geographic Challenge!,Wii,2011.0,Misc,Black Bean Games,,,,,,,14949,25.230245
6606,National Geographic Challenge!,PS3,2011.0,Misc,Black Bean Games,,,,,,,14965,20.179679


AKB1/48


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5932,AKB1/48: Idol to Koishitara...,PSP,2010.0,Misc,Namco Bandai Games,,,,,,,14291,11.141156
6540,AKB1/48: Idol to Guam de Koishitara...,PSP,2011.0,Misc,Namco Bandai Games,,,,,,,14899,32.872968


NASCAR Thunder 2003


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1780,NASCAR Thunder 2003,GC,2002.0,Racing,Electronic Arts,85.0,12.0,6.6,5.0,EA Sports,E,10139,45.693696
1785,NASCAR Thunder 2003,XB,2002.0,Racing,Electronic Arts,84.0,13.0,tbd,,EA Sports,E,10144,39.356461


The Dukes of


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1022,The Dukes of Hazzard: Racing for Home,PS,1999.0,Racing,Ubisoft,,,,,,,9381,57.055556
2574,The Dukes of Hazzard: Return of the General Lee,PS2,2004.0,Racing,Ubisoft,52.0,17.0,6.8,8.0,Ratbag,E,10933,47.875574
2586,The Dukes of Hazzard: Return of the General Lee,XB,2004.0,Racing,Ubisoft,49.0,18.0,7.0,4.0,Ratbag,E,10945,18.642177


Smashing Drive


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1812,Smashing Drive,XB,2002.0,Racing,Namco Bandai Games,42.0,9.0,tbd,,Namco,T,10171,10.043202
2594,Smashing Drive,GBA,2004.0,Racing,Zoo Digital Publishing,,,tbd,,Raylight Studios,E,10953,14.388557


Rayman Arena


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1437,Rayman Arena,PS2,2001.0,Racing,Ubisoft,63.0,10.0,8.6,8.0,Ubisoft,E,9796,59.114224
1792,Rayman Arena,GC,2002.0,Racing,Ubisoft,60.0,10.0,7.2,6.0,Ubisoft,E,10151,40.673757


Yu-Gi-Oh! 5D's Wheelie


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5406,Yu-Gi-Oh! 5D's Wheelie Breakers,Wii,2009.0,Racing,Konami Digital Entertainment,,,,,,,13765,21.668944


Freaky Flyers


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2214,Freaky Flyers,PS2,2003.0,Racing,Midway Games,66.0,20.0,tbd,,Midway Studios - San Diego,T,10573,31.820538
2233,Freaky Flyers,XB,2003.0,Racing,Midway Games,64.0,22.0,tbd,,Midway,T,10592,14.176486


Legacy of


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1,Legacy of Kain: Soul Reaver,PS,,Action,Eidos Interactive,91.0,17.0,9.0,132.0,Crystal Dynamics,T,8360,25.764409
2071,Legacy of Kain: Defiance,PS2,2003.0,Action,Eidos Interactive,75.0,34.0,9.3,49.0,Crystal Dynamics,M,10430,54.660893
2086,Legacy of Kain: Defiance,XB,2003.0,Action,Eidos Interactive,74.0,30.0,7.9,21.0,Crystal Dynamics,M,10445,20.659466
2113,Legacy of Kain: Defiance,PC,2003.0,Action,Eidos Interactive,70.0,11.0,8.7,67.0,Nixxes Software,M,10472,2.414508


Brothers in Arms


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4742,Brothers in Arms: Hell's Highway,PS3,2008.0,Shooter,Ubisoft,76.0,48.0,8.0,70.0,Gearbox Software,M,13101,78.40126
4744,Brothers in Arms: Hell's Highway,X360,2008.0,Shooter,Ubisoft,76.0,65.0,8.3,149.0,Gearbox Software,M,13103,92.595749
4758,Brothers in Arms: Double Time,Wii,2008.0,Shooter,Ubisoft,45.0,13.0,6.6,17.0,Gearbox Software,M,13117,27.549807
4769,Brothers in Arms: Hell's Highway,PC,2008.0,Shooter,Ubisoft,79.0,22.0,7.9,130.0,Gearbox Software,M,13128,6.540755
7201,Brothers in Arms: Furious 4,PS3,2012.0,Shooter,Ubisoft,,,,,Gearbox Software,M,15560,13.282793


Charm Girls Club


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5290,Charm Girls Club: Pajama Party,Wii,2009.0,Misc,Electronic Arts,,,tbd,,Electronic Arts,E,13649,21.303583
5564,Charm Girls Club: My Perfect Prom,DS,2009.0,Simulation,Electronic Arts,,,tbd,,Griptonite Games,E,13923,20.905514


Charm Girls Club


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5290,Charm Girls Club: Pajama Party,Wii,2009.0,Misc,Electronic Arts,,,tbd,,Electronic Arts,E,13649,21.303583
5564,Charm Girls Club: My Perfect Prom,DS,2009.0,Simulation,Electronic Arts,,,tbd,,Griptonite Games,E,13923,20.905514


Triple Play


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
556,Triple Play 97,PS,1996.0,Sports,Electronic Arts Victor,,,,,,,8915,62.829568
701,Triple Play 98,PS,1997.0,Sports,Electronic Arts,,,,,,,9060,74.015314
1079,Triple Play 2000,PS,1999.0,Sports,Electronic Arts,,,,,,,9438,77.899693
1100,Triple Play 2000,N64,1999.0,Sports,Electronic Arts,,,,,,,9459,18.491543
1281,Triple Play 2001,PS,2000.0,Sports,Electronic Arts,,,,,,,9640,72.559008
1923,Triple Play 2002,PS2,2002.0,Sports,Electronic Arts,65.0,17.0,tbd,,Pandemic Studios,E,10282,49.891352
1953,Triple Play 2002,XB,2002.0,Sports,Electronic Arts,64.0,17.0,tbd,,Pandemic Studios,E,10312,20.589911


Madden NFL 07


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
3620,Madden NFL 07,PS2,2006.0,Sports,Electronic Arts,84.0,29.0,8.2,39.0,EA Sports,E,11979,126.365339
3623,Madden NFL 07,X360,2006.0,Sports,Electronic Arts,80.0,54.0,6.0,148.0,EA Tiburon,E,11982,160.531715
3627,Madden NFL 07,XB,2006.0,Sports,Electronic Arts,83.0,27.0,8.7,6.0,EA Tiburon,E,11986,50.803673
3635,Madden NFL 07,GC,2006.0,Sports,Electronic Arts,82.0,14.0,9.2,5.0,EA Tiburon,E,11994,36.733642
3642,Madden NFL 07,PS3,2006.0,Sports,Electronic Arts,76.0,25.0,4.2,22.0,EA Tiburon,E,12001,60.140128
3646,Madden NFL 07,Wii,2006.0,Sports,Electronic Arts,81.0,35.0,8.0,16.0,EA Canada,E,12005,76.062697
3667,Madden NFL 07,DS,2006.0,Sports,Electronic Arts,70.0,9.0,6.5,10.0,Exient Entertainment,E,12026,40.154546
3709,Madden NFL 07,GBA,2006.0,Sports,Electronic Arts,68.0,4.0,9.3,4.0,Exient Entertainment,E,12068,5.476317


MLB SlugFest 20-03


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1960,MLB SlugFest 20-03,XB,2002.0,Sports,Midway Games,79.0,9.0,7.8,5.0,Gratuitous Games,E,10319,20.550965
1972,MLB SlugFest 20-03,GC,2002.0,Sports,Midway Games,80.0,10.0,7.5,4.0,Gratuitous Games,E,10331,11.602864


Madden NFL 11


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
6192,Madden NFL 11,X360,2010.0,Sports,Electronic Arts,84.0,47.0,5.7,120.0,EA Tiburon,E,14551,151.253217
6195,Madden NFL 11,PS3,2010.0,Sports,Electronic Arts,83.0,36.0,6.1,68.0,EA Tiburon,E,14554,123.466327
6221,Madden NFL 11,PS2,2010.0,Sports,Electronic Arts,,,7,4.0,EA Tiburon,E,14580,74.851439
6234,Madden NFL 11,PSP,2010.0,Sports,Electronic Arts,,,tbd,,EA Tiburon,E,14593,36.762983


Madden NFL 2002


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1523,Madden NFL 2002,PS2,2001.0,Sports,Electronic Arts,94.0,23.0,7.9,46.0,EA Sports,E,9882,177.273428
1530,Madden NFL 2002,PS,2001.0,Sports,Electronic Arts,88.0,9.0,8,7.0,EA Sports,E,9889,90.745167
1541,Madden NFL 2002,GC,2001.0,Sports,Electronic Arts,89.0,12.0,tbd,,EA Sports,E,9900,34.075998
1548,Madden NFL 2002,N64,2001.0,Sports,Electronic Arts,,,,,,,9907,29.216992
1553,Madden NFL 2002,GBA,2001.0,Sports,Electronic Arts,,,tbd,,BudCat,E,9912,17.191156


All-Star Baseball


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
904,All-Star Baseball 99,N64,1998.0,Sports,Acclaim Entertainment,,,,,,,9263,25.014829
1094,All-Star Baseball 2000,N64,1999.0,Sports,Acclaim Entertainment,,,,,,,9453,20.74521
1304,All-Star Baseball 2001,N64,2000.0,Sports,Acclaim Entertainment,,,,,,,9663,19.884515
1543,All-Star Baseball 2002,PS2,2001.0,Sports,Acclaim Entertainment,77.0,16.0,7.6,8.0,Acclaim Studios Austin,E,9902,44.423758
1563,All-Star Baseball 2002,GC,2001.0,Sports,Acclaim Entertainment,66.0,17.0,5.6,9.0,Acclaim,E,9922,22.515179
1924,All-Star Baseball 2003,PS2,2002.0,Sports,Acclaim Entertainment,81.0,15.0,8.4,17.0,Acclaim Studios Austin,E,10283,65.622364
1952,All-Star Baseball 2003,XB,2002.0,Sports,Acclaim Entertainment,79.0,15.0,8.1,7.0,Acclaim,E,10311,31.786657
1969,All-Star Baseball 2003,GC,2002.0,Sports,Acclaim Entertainment,83.0,16.0,8,6.0,Acclaim,E,10328,25.110935
1970,All-Star Baseball 2003,GBA,2002.0,Sports,Acclaim Entertainment,77.0,6.0,tbd,,Software Creations,E,10329,11.754639
2344,All-Star Baseball 2004,PS2,2003.0,Sports,Acclaim Entertainment,78.0,19.0,7.5,11.0,Acclaim Studios Austin,E,10703,48.432046


NBA Starting Five


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2028,NBA Starting Five,XB,2002.0,Sports,Konami Digital Entertainment,48.0,4.0,5.5,4.0,Konami,E,10387,5.328802


All-Star Baseball


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
904,All-Star Baseball 99,N64,1998.0,Sports,Acclaim Entertainment,,,,,,,9263,25.014829
1094,All-Star Baseball 2000,N64,1999.0,Sports,Acclaim Entertainment,,,,,,,9453,20.74521
1304,All-Star Baseball 2001,N64,2000.0,Sports,Acclaim Entertainment,,,,,,,9663,19.884515
1543,All-Star Baseball 2002,PS2,2001.0,Sports,Acclaim Entertainment,77.0,16.0,7.6,8.0,Acclaim Studios Austin,E,9902,44.423758
1563,All-Star Baseball 2002,GC,2001.0,Sports,Acclaim Entertainment,66.0,17.0,5.6,9.0,Acclaim,E,9922,22.515179
1924,All-Star Baseball 2003,PS2,2002.0,Sports,Acclaim Entertainment,81.0,15.0,8.4,17.0,Acclaim Studios Austin,E,10283,65.622364
1952,All-Star Baseball 2003,XB,2002.0,Sports,Acclaim Entertainment,79.0,15.0,8.1,7.0,Acclaim,E,10311,31.786657
1969,All-Star Baseball 2003,GC,2002.0,Sports,Acclaim Entertainment,83.0,16.0,8,6.0,Acclaim,E,10328,25.110935
1970,All-Star Baseball 2003,GBA,2002.0,Sports,Acclaim Entertainment,77.0,6.0,tbd,,Software Creations,E,10329,11.754639
2344,All-Star Baseball 2004,PS2,2003.0,Sports,Acclaim Entertainment,78.0,19.0,7.5,11.0,Acclaim Studios Austin,E,10703,48.432046


Street Hoops


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
1920,Street Hoops,PS2,2002.0,Sports,Activision,,,,,,,10279,88.483407
1946,Street Hoops,XB,2002.0,Sports,Activision,58.0,22.0,8.0,4.0,Black Ops Entertainment,T,10305,20.642063


NHL Hitz Pro


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2369,NHL Hitz Pro,PS2,2003.0,Sports,Midway Games,79.0,23.0,8.9,13.0,Next Level Games,E,10728,35.40134
2390,NHL Hitz Pro,XB,2003.0,Sports,Midway Games,81.0,23.0,8.8,12.0,Next Level Games,E,10749,7.591916


Cabela's Alaskan Adventure


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
3681,Cabela's Alaskan Adventure,X360,2006.0,Sports,Activision,48.0,11.0,5.6,17.0,FUN Labs,T,12040,15.836919


The Game of


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2913,The Game of Life / Yahtzee / Payday,GBA,2005.0,Misc,Zoo Digital Publishing,,,,,,,11272,37.225959


Disney Sports Football


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2037,Disney Sports Football,GC,2002.0,Sports,Konami Digital Entertainment,59.0,8.0,tbd,,KCEO,E,10396,6.09686


Blitz


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
2713,Blitz: The League,XB,2004.0,Sports,Midway Games,78.0,30.0,7.7,14.0,Midway,M,11072,37.65601
3674,Blitz: The League,X360,2006.0,Sports,Midway Games,69.0,23.0,6.5,14.0,Midway,M,12033,19.666441
3691,Blitz: Overtime,PSP,2006.0,Sports,Midway Games,,,7.5,4.0,Midway,M,12050,3.838764
4935,Blitz: The League II,PS3,2008.0,Sports,Midway Games,62.0,25.0,7.5,15.0,,,13294,17.593018
4946,Blitz: The League II,X360,2008.0,Sports,Midway Games,68.0,34.0,7.2,13.0,Midway,M,13305,16.135636
4994,Blitzkrieg 2 Anthology,PC,2008.0,Strategy,Ascaron Entertainment GmbH,,,tbd,,"1C, Various, 1C Company",,13353,1.453377


God Eater


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7417,God Eater 2,PSV,2013.0,Role-Playing,Namco Bandai Games,,,,,,,15776,17.667557
7420,God Eater 2,PSP,2013.0,Role-Playing,Namco Bandai Games,,,,,,,15779,19.160328
7835,God Eater Resurrection,PSV,2015.0,Action,Namco Bandai Games,,,8.8,17.0,"Shift, Bandai Namco Games",T,16194,22.03517
7860,God Eater Resurrection,PS4,2015.0,Action,Namco Bandai Games,70.0,18.0,7.8,44.0,"Shift, Bandai Namco Games",T,16219,16.117082
7992,God Eater Off Shot: Lindow-hen Twin Pack & Ani...,PS4,2015.0,Misc,Namco Bandai Games,,,,,,,16351,4.608421
8022,God Eater 2: Rage Burst,PSV,2015.0,Role-Playing,Namco Bandai Games,,,8.1,16.0,Shift,T,16381,10.566244
8032,God Eater 2: Rage Burst,PS4,2015.0,Role-Playing,Namco Bandai Games,69.0,35.0,8.0,79.0,Shift,T,16391,29.143688
8177,God Eater Off Shot: Soma Shikkuzaru-hen Twin P...,PS4,2016.0,Action,Namco Bandai Games,,,,,,,16536,3.475675
8186,God Eater Off Shot: Tachibana Sakuya-hen Twin ...,PS4,2016.0,Action,Namco Bandai Games,,,,,,,16545,3.025942
8245,God Eater Off Shot:Shiou-hen Twin Pack & Anima...,PS4,2016.0,Misc,Namco Bandai Games,,,,,,,16604,4.421093


Gekiatsu!! Pachi Game Tamashi


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7108,Gekiatsu!! Pachi Game Tamashi Max: Evangelion ...,PS3,2012.0,Misc,Fields,,,,,,,15467,1.906919


North American Hunting


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4894,North American Hunting Extravaganza,Wii,2008.0,Sports,Zushi Games,,,tbd,,Arcade Moon,T,13253,23.513063


Gummy Bears Mini


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
6299,Gummy Bears Mini Golf,DS,2010.0,Sports,Storm City Games,,,,,,,14658,4.345996


Gekiatsu!! Pachi Game


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7108,Gekiatsu!! Pachi Game Tamashi Max: Evangelion ...,PS3,2012.0,Misc,Fields,,,,,,,15467,1.906919


Gekiatsu!! Pachi Game


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7108,Gekiatsu!! Pachi Game Tamashi Max: Evangelion ...,PS3,2012.0,Misc,Fields,,,,,,,15467,1.906919


The Lost


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
585,The Lost World: Jurassic Park,PS,1997.0,Action,Electronic Arts,,,,,,,8944,75.799283
2183,The Lost Vikings,GBA,2003.0,Puzzle,Activision,75.0,16.0,9.1,15.0,Mass Media,E,10542,52.978459
7062,The Lost Chronicles of Zerzura,PC,2012.0,Adventure,DTP Entertainment,76.0,9.0,6.7,15.0,Cranberry Production,,15421,2.64607


Two Worlds


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4040,Two Worlds,X360,2007.0,Role-Playing,SouthPeak Games,50.0,44.0,5.5,182.0,Reality Pump,M,12399,31.703763


Two Worlds


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4040,Two Worlds,X360,2007.0,Role-Playing,SouthPeak Games,50.0,44.0,5.5,182.0,Reality Pump,M,12399,31.703763


Two Worlds


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4040,Two Worlds,X360,2007.0,Role-Playing,SouthPeak Games,50.0,44.0,5.5,182.0,Reality Pump,M,12399,31.703763


Heavy Fire


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
6776,Heavy Fire: Afghanistan,Wii,2011.0,Shooter,Mastiff,,,tbd,,Teyon,T,15135,8.424202
6780,Heavy Fire: The Chosen Few 3D,3DS,2011.0,Shooter,Mastiff,,,6.3,4.0,Teyon,T,15139,4.291341
6782,Heavy Fire: Afghanistan,PC,2011.0,Shooter,Mastiff,,,3.2,5.0,,,15141,6.927221


Fairy Tail


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5905,Fairy Tail: Portable Guild,PSP,2010.0,Fighting,Konami Digital Entertainment,,,,,,,14264,5.428599
6521,Fairy Tail: Portable Guild 2,PSP,2011.0,Fighting,Konami Digital Entertainment,,,,,,,14880,6.126859


Ukiyo no


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7894,Ukiyo no Shishi,PS3,2015.0,Action,Namco Bandai Games,,,,,,,16253,3.063663


Horse Life


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
4159,Horse Life,DS,2007.0,Simulation,Game Life,68.0,9.0,tbd,,Neko Entertainment,E,12518,8.800543
4820,Horse Life Adventures,Wii,2008.0,Simulation,Deep Silver,,,tbd,,Neko Entertainment,E,13179,21.910418
4826,Horse Life Adventures,DS,2008.0,Simulation,Deep Silver,,,tbd,,Neko Entertainment,E,13185,25.204486


Ao no


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
7021,Ao no Exorcist: Genkoku no Labyrinth,PSP,2012.0,Action,Namco Bandai Games,,,,,,,15380,4.171526


45 9 18 18 0


In [36]:
# Spot-check the first rows after the fill: matched rows now carry a publisher.
df_test_unk.head()

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,Global_Sales
5,Cubix Robots for Everyone: Clash 'n' Bash,GBA,,Action,Unknown,,,,,,,8364,5.408323
17,WCW Backstage Assault,N64,,Action,Electronic Arts,,,,,,,8376,14.427532
19,Action Man-Operation Extreme,PS,,Action,Unknown,,,,,,,8378,4.685558
23,Housekeeping,DS,,Action,Unknown,,,,,,,8382,3.495931
25,Super Duper Sumos,GBA,,Action,Unknown,57.0,5.0,tbd,,Handheld Games,E,8384,6.827206


In [None]:
# 90個を整理

In [39]:
# Reload the test CSV, copy the publishers inferred above into the rows indexed
# by df_test_unk, and print the number of 'Unknown' publishers before and after.
# NOTE(review): the result is written over the same file that was read, so
# re-running this cell starts from the already-updated file.
df_final_test = pd.read_csv(datadir / 'id_test_unknown1.csv')
print(df_final_test[df_final_test['Publisher'] == 'Unknown'].shape[0])
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk.loc[:, 'Publisher']
print(df_final_test[df_final_test['Publisher'] == 'Unknown'].shape[0])
df_final_test.to_csv(datadir / 'id_test_unknown1.csv', index=False)

165
120


In [42]:
# Apply the same publisher fill to country_prob_test_unknown1.csv and overwrite
# that file in place.
# NOTE(review): rows are addressed by df_test_unk.index, so this assumes the
# file has the same row order as id_test_unknown1.csv — confirm.
df_final_test = pd.read_csv(datadir / 'country_prob_test_unknown1.csv')
print(df_final_test[df_final_test['Publisher'] == 'Unknown'].shape[0])
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk.loc[:, 'Publisher']
print(df_final_test[df_final_test['Publisher'] == 'Unknown'].shape[0])
df_final_test.to_csv(datadir / 'country_prob_test_unknown1.csv', index=False)

165
120


# train と test で単語的に被ってるやつを消した。更に減らす

In [47]:
# Reload the test file rewritten above and split it again into rows with an
# unknown and a known publisher.
df_test = pd.read_csv(datadir / 'id_test_unknown1.csv')
# Explicit copy: later cells add a column to this frame and assign publishers
# into it, which should not be done on a bare slice of df_test.
df_test_unk = df_test[df_test['Publisher'] == 'Unknown'].copy()
df_test_known = df_test.drop(df_test_unk.index, axis=0)
# Reference pool for matching: all train rows plus the test rows with a known
# publisher. Row labels from the two sources may repeat after the concat.
df_known = pd.concat([df_train, df_test_known], axis=0)
# Replace missing names with the literal 'NaN' so .str operations are safe.
df_known['Name'] = df_known['Name'].fillna('NaN')

  and should_run_async(code)


In [46]:
# Display the test rows that still have an 'Unknown' publisher.
df_test_unk

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id
5,Cubix Robots for Everyone: Clash 'n' Bash,GBA,,Action,Unknown,,,,,,,8364
19,Action Man-Operation Extreme,PS,,Action,Unknown,,,,,,,8378
23,Housekeeping,DS,,Action,Unknown,,,,,,,8382
25,Super Duper Sumos,GBA,,Action,Unknown,57.0,5.0,tbd,,Handheld Games,E,8384
30,The Daring Game for Girls,DS,,Adventure,Unknown,,,tbd,,WXP,E,8389
31,The Daring Game for Girls,Wii,,Adventure,Unknown,,,tbd,,WXP,E,8390
32,The Hidden,3DS,,Adventure,Unknown,,,4.2,5.0,1st Playable Productions,E10+,8391
33,B.L.U.E.: Legend of Water,PS,,Adventure,Unknown,,,,,,,8392
36,wwe Smackdown vs. Raw 2006,PS2,,Fighting,Unknown,,,,,,,8395
42,Swords,Wii,,Fighting,Unknown,,,tbd,,Panic Button,T,8401


In [65]:
# List known rows whose Name contains 'Raw' (case-sensitive match).
df_known[df_known['Name'].str.contains('Raw')]

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,Critic_Count,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales
1190,Destruction Derby Raw,PS,2000.0,Racing,Sony Computer Entertainment,8.0,5.0,0.0,1.0,14.0,...,9.0,7.3,14.0,Studio 33,E,1190,True,True,False,True
1570,WWE SmackDown! vs. Raw,PS2,2002.0,Fighting,THQ,132.0,108.0,4.0,39.0,283.0,...,,,,,,1570,True,True,True,True
1575,WWF Raw,XB,2002.0,Fighting,THQ,40.0,13.0,0.0,2.0,55.0,...,33.0,6.6,15.0,Anchor,T,1575,True,True,False,True
1952,WWE Raw 2,XB,2003.0,Fighting,THQ,43.0,11.0,0.0,2.0,56.0,...,26.0,7.3,25.0,Anchor,T,1952,True,True,False,True
3214,Raw Danger! (JP sales),PS2,2006.0,Action,505 Games,0.0,0.0,6.0,0.0,6.0,...,,,,,,3214,False,False,True,False
3230,Raw Danger!,PS2,2006.0,Action,505 Games,2.0,1.0,0.0,0.0,3.0,...,,,,,,3230,True,True,False,False
3813,WWE SmackDown vs Raw 2008,PS2,2007.0,Fighting,THQ,92.0,0.0,1.0,141.0,234.0,...,11.0,7.2,31.0,Yuke's,T,3813,True,False,True,True
3814,WWE SmackDown vs Raw 2008,X360,2007.0,Fighting,THQ,92.0,38.0,0.0,13.0,143.0,...,41.0,6.8,48.0,Yuke's,T,3814,True,True,False,True
3815,WWE SmackDown vs Raw 2008,PS3,2007.0,Fighting,THQ,62.0,49.0,1.0,20.0,133.0,...,27.0,7.3,41.0,Yuke's,T,3815,True,True,True,True
3816,WWE SmackDown vs Raw 2008,PSP,2007.0,Fighting,THQ,45.0,47.0,0.0,28.0,119.0,...,11.0,7.8,15.0,Yuke's,T,3816,True,True,False,True


## stopword 消して lower にしたら増えそう -> 4件発見

In [67]:
import texthero
from texthero import preprocessing

def _clean(series: pd.Series) -> pd.Series:
    """Normalise a text column with a fixed texthero pipeline.

    The preprocessing steps are applied in this order: ``fillna``,
    ``lowercase``, ``remove_digits``, ``remove_punctuation``,
    ``remove_diacritics`` and ``remove_whitespace``.

    Args:
        series: Text column to normalise.

    Returns:
        Whatever ``texthero.clean`` returns for ``series`` with that pipeline.
    """
    normalisation_steps = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace,
    ]
    cleaned = texthero.clean(series, pipeline=normalisation_steps)
    return cleaned

# Add the normalised title column to the known frame first, then to the
# unknown-publisher frame.
for frame in (df_known, df_test_unk):
    frame['clean_name'] = _clean(frame['Name'])

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [69]:
# Fill 'Unknown' publishers by prefix-matching each cleaned test title against
# the cleaned known titles, then adopting the most frequent publisher among
# the matching rows.
count = 0
count_series = 0
count_trigram = 0
count_bigram = 0
count_unigram = 0  # the unigram fallback is disabled; kept so the summary line keeps five fields

for i, name in zip(df_test_unk.index, df_test_unk['clean_name']):
    df_tmp = pd.DataFrame()
    # NOTE(review): `_clean` runs preprocessing.remove_punctuation, so clean_name
    # presumably never contains ':' and this series branch does not fire
    # (count_series stays 0) -- confirm whether it should test the raw 'Name'.
    if ':' in name:
        name = name.split(':')[0]
        df_tmp = df_known[df_known['clean_name'].str.startswith(name)]
        if df_tmp.shape[0] != 0:
            count_series += 1
    # No match yet: retry with the first three words of the title.
    if df_tmp.shape[0] == 0 and len(name.split(' ')) >= 3:
        name = ' '.join(name.split(' ')[:3])
        df_tmp = df_known[df_known['clean_name'].str.startswith(name)]
        if df_tmp.shape[0] != 0:
            count_trigram += 1
    # Still no match: retry with the first two words.
    if df_tmp.shape[0] == 0 and len(name.split(' ')) >= 2:
        name = ' '.join(name.split(' ')[:2])
        df_tmp = df_known[df_known['clean_name'].str.startswith(name)]
        if df_tmp.shape[0] != 0:
            count_bigram += 1

    if df_tmp.shape[0] > 0:
        print(name)
        display(df_tmp)
        count += 1
        publisher_counts = df_tmp['Publisher'].value_counts()
        if publisher_counts.empty:
            # Every matched row has a missing publisher: nothing to adopt.
            continue
        # value_counts() sorts by frequency, so the first index label is the
        # most frequent publisher. Reading it from the index avoids relying on
        # the column names reset_index() produces, which differ across pandas
        # versions.
        most_publisher = publisher_counts.index[0]
        if df_test_unk.loc[i, 'Publisher'] != 'Unknown':
            # Sanity check: only rows still labelled 'Unknown' may be filled.
            raise RuntimeError(
                f"row {i!r} already has publisher "
                f"{df_test_unk.loc[i, 'Publisher']!r}; expected 'Unknown'"
            )
        df_test_unk.loc[i, 'Publisher'] = most_publisher
print(count, count_series, count_trigram, count_bigram, count_unigram)

wwe smackdown vs


  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name
1570,WWE SmackDown! vs. Raw,PS2,2002.0,Fighting,THQ,132.0,108.0,4.0,39.0,283.0,...,,,,,1570,True,True,True,True,wwe smackdown vs raw
2801,WWE SmackDown! vs. RAW 2006,PS2,2005.0,Fighting,THQ,145.0,111.0,4.0,33.0,294.0,...,,,,,2801,True,True,True,True,wwe smackdown vs raw
2809,WWE SmackDown! vs. RAW 2006,PSP,2005.0,Fighting,THQ,36.0,0.0,0.0,3.0,40.0,...,8.5,37.0,Yuke's,T,2809,True,False,False,True,wwe smackdown vs raw
3297,WWE SmackDown vs. RAW 2007,PS2,2006.0,Fighting,THQ,140.0,88.0,3.0,26.0,258.0,...,8.6,57.0,Yuke's,T,3297,True,True,True,True,wwe smackdown vs raw
3301,WWE SmackDown vs. RAW 2007,PSP,2006.0,Fighting,THQ,33.0,20.0,0.0,14.0,66.0,...,8.1,21.0,Yuke's,T,3301,True,True,False,True,wwe smackdown vs raw
3303,WWE SmackDown vs. RAW 2007,X360,2006.0,Fighting,THQ,44.0,3.0,0.0,4.0,50.0,...,7.8,46.0,Yuke's,T,3303,True,True,False,True,wwe smackdown vs raw
3813,WWE SmackDown vs Raw 2008,PS2,2007.0,Fighting,THQ,92.0,0.0,1.0,141.0,234.0,...,7.2,31.0,Yuke's,T,3813,True,False,True,True,wwe smackdown vs raw
3814,WWE SmackDown vs Raw 2008,X360,2007.0,Fighting,THQ,92.0,38.0,0.0,13.0,143.0,...,6.8,48.0,Yuke's,T,3814,True,True,False,True,wwe smackdown vs raw
3815,WWE SmackDown vs Raw 2008,PS3,2007.0,Fighting,THQ,62.0,49.0,1.0,20.0,133.0,...,7.3,41.0,Yuke's,T,3815,True,True,True,True,wwe smackdown vs raw
3816,WWE SmackDown vs Raw 2008,PSP,2007.0,Fighting,THQ,45.0,47.0,0.0,28.0,119.0,...,7.8,15.0,Yuke's,T,3816,True,True,False,True,wwe smackdown vs raw


beyond the


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name
477,Beyond The Beyond,PS,1995.0,Role-Playing,Sony Computer Entertainment,11.0,8.0,35.0,4.0,56.0,...,,,,,477,True,True,True,True,beyond the beyond


m m s


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name
4010,M&M's Kart Racing,Wii,2007.0,Racing,Zoo Digital Publishing,,,,,,...,,,,,12369,,,,,m m s kart racing
4431,M&M's Adventure,DS,2008.0,Adventure,Zoo Digital Publishing,,,,,,...,,,,,12790,,,,,m m s adventure
4671,M&M's Kart Racing,DS,2008.0,Racing,Zoo Digital Publishing,,,,,,...,,,,,13030,,,,,m m s kart racing
5320,M&M's Beach Party,Wii,2009.0,Misc,Zoo Digital Publishing,,,,,,...,,,,,13679,,,,,m m s beach party


chibi robo


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name
2777,Chibi-Robo! Plug into Adventure!,GC,2005.0,Adventure,Nintendo,23.0,6.0,9.0,1.0,39.0,...,,,,,2777,True,True,True,True,chibi robo plug into adventure
7983,Chibi-Robo! Zip Lash,3DS,2015.0,Platform,Nintendo,9.0,8.0,6.0,2.0,24.0,...,,,,,7983,True,True,True,True,chibi robo zip lash


4 0 2 2 0


In [71]:
# Shape of the unknown-publisher frame vs. the rows in it still labelled 'Unknown'.
still_unknown = df_test_unk['Publisher'] == 'Unknown'
df_test_unk.shape, df_test_unk.loc[still_unknown].shape

  and should_run_async(code)


((120, 13), (116, 13))

In [72]:
# Copy the publishers held in df_test_unk into id_test_unknown1.csv, printing
# the number of 'Unknown' rows before and after. The file is overwritten in place.
id_test_path = datadir / 'id_test_unknown1.csv'
df_final_test = pd.read_csv(id_test_path)
print(len(df_final_test.loc[df_final_test['Publisher'] == 'Unknown']))
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk['Publisher']
print(len(df_final_test.loc[df_final_test['Publisher'] == 'Unknown']))
df_final_test.to_csv(id_test_path, index=False)

120
116


In [73]:
# Same write-back for country_prob_test_unknown1.csv: copy the publishers from
# df_test_unk by index label, report the 'Unknown' count before and after, and
# overwrite the file in place.
country_prob_path = datadir / 'country_prob_test_unknown1.csv'
df_final_test = pd.read_csv(country_prob_path)
print(int((df_final_test['Publisher'] == 'Unknown').sum()))
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk['Publisher']
print(int((df_final_test['Publisher'] == 'Unknown').sum()))
df_final_test.to_csv(country_prob_path, index=False)

120
116


# 次, 1文字目見ちゃうか?

In [137]:
# Reload the test set and rebuild the split between rows whose publisher is
# still 'Unknown' and rows with a known publisher.
df_test = pd.read_csv(datadir / 'id_test_unknown1.csv')
# .copy() makes df_test_unk an explicit, independent frame: later cells add a
# column to it and assign into it with .loc, which on a filtered selection of
# df_test triggers pandas' SettingWithCopyWarning.
df_test_unk = df_test[df_test['Publisher'] == 'Unknown'].copy()
df_test_known = df_test.drop(df_test_unk.index, axis=0)
# Known titles = all training rows plus the test rows with a known publisher.
df_known = pd.concat([df_train, df_test_known], axis=0)
# Replace missing names with the literal string 'NaN' so the .str accessors
# used on this column return booleans for every row.
df_known['Name'] = df_known['Name'].fillna('NaN')

In [138]:
# Recompute the normalised title column on the rebuilt frames
# (known titles first, then the unknown-publisher rows).
for frame in (df_known, df_test_unk):
    frame['clean_name'] = _clean(frame['Name'])

In [139]:
# Unknown-publisher test rows whose raw title contains a colon.
has_colon = df_test_unk['Name'].str.contains(':')
df_test_unk.loc[has_colon]

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,clean_name
5,Cubix Robots for Everyone: Clash 'n' Bash,GBA,,Action,Unknown,,,,,,,8364,cubix robots for everyone clash n bash
33,B.L.U.E.: Legend of Water,PS,,Adventure,Unknown,,,,,,,8392,b l u e legend of water
122,Tribes: Aerial Assault,PS2,,Shooter,Unknown,73.0,23.0,8.8,47.0,Midway Studios - Austin,T,8481,tribes aerial assault
126,Combat Elite: WWII Paratroopers,PS2,,Shooter,Unknown,54.0,12.0,tbd,,BattleBorne,T,8485,combat elite wwii paratroopers
128,Combat Elite: WWII Paratroopers,XB,,Shooter,Unknown,56.0,11.0,tbd,,BattleBorne,T,8487,combat elite wwii paratroopers
2494,Nicktoons Collection: Game Boy Advance Video V...,GBA,2004.0,Misc,Unknown,,,,,,,10853,nicktoons collection game boy advance video vo...
2505,Sonic X: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,Unknown,,,,,,,10864,sonic x game boy advance video volume
2511,Nicktoons Collection: Game Boy Advance Video V...,GBA,2004.0,Misc,Unknown,,,,,,,10870,nicktoons collection game boy advance video vo...
2939,Nicktoons Collection: Game Boy Advance Video V...,GBA,2005.0,Misc,Unknown,,,,,,,11298,nicktoons collection game boy advance video vo...
4297,UFO: Trilogy,PC,2007.0,Strategy,Unknown,,,,,,,12656,ufo trilogy


In [140]:
# Known titles whose cleaned name contains the lower-cased keyword.
keyword = 'Thomas'.lower()
df_known.loc[df_known['clean_name'].str.contains(keyword)]

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Score,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name
7893,Thomas and Friends: Steaming around Sodor,3DS,2015.0,Action,Avanquest Software,0.0,2.0,0.0,0.0,2.0,...,tbd,,Avanquest Software,,7893,False,True,False,False,thomas and friends steaming around sodor
564,Frank Thomas Big Hurt Baseball,PS,1996.0,Sports,Acclaim Entertainment,,,,,,...,,,,,8923,,,,,frank thomas big hurt baseball


In [141]:
def colon_latter(x: str):
    """Return the text between the first and second colon of ``x``.

    The segment is lower-cased and stripped of surrounding whitespace.
    When ``x`` contains no colon an empty string is returned.
    """
    if ':' not in x:
        return ''
    # Only the segment right after the first colon is kept; anything after a
    # second colon is discarded.
    _, after_first_colon, *_ = x.split(':')
    return after_first_colon.lower().strip()

def colon_former(x: str):
    """Return the text before the first colon of ``x``.

    The segment is lower-cased and stripped of surrounding whitespace.
    When ``x`` contains no colon an empty string is returned.
    """
    if ':' not in x:
        return ''
    before_first_colon = x.split(':', 1)[0]
    return before_first_colon.lower().strip()
# Derive the colon-based title parts for every known title, plus a flag
# telling whether the part before the colon has at most two space-separated tokens.
known_names = df_known['Name']
df_known['Name_colon_latter'] = known_names.map(colon_latter)
df_known['Name_colon_former'] = known_names.map(colon_former)
df_known['Name_colon_former_lt_2'] = df_known['Name_colon_former'].map(
    lambda former: len(former.split(' ')) <= 2
)

  and should_run_async(code)


In [118]:
# Exploration only: for each unknown-publisher title containing ':', display
# the known titles whose 'Name_colon_latter' starts with the same first two
# words as this title's text after the first colon. Nothing is written back
# to df_test_unk in this cell.
count = 0
count_series = 0
# The n-gram fallbacks are not used in this cell; their counters stay at 0 and
# are kept only so the summary line keeps its five fields.
count_trigram = 0
count_bigram = 0
count_unigram = 0

for name, original in zip(df_test_unk['clean_name'], df_test_unk['Name']):
    df_tmp = pd.DataFrame()
    if ':' in original:
        name_latter = original.split(':')[1].strip()
        name_latter = ' '.join(name_latter.lower().split(' ')[:2])
        # An empty prefix (e.g. a title ending in ':') would make
        # startswith('') match every known row, so only search when there is
        # something to match on.
        if name_latter:
            df_tmp = df_known[df_known['Name_colon_latter'].str.startswith(name_latter)]
        if df_tmp.shape[0] != 0:
            count_series += 1

    if df_tmp.shape[0] > 0:
        print(name)
        display(df_tmp)
        count += 1
print(count, count_series, count_trigram, count_bigram, count_unigram)

b l u e legend of water


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
595,Super Mario RPG: Legend of the Seven Stars,SNES,1996.0,Role-Playing,Nintendo,66.0,0.0,145.0,3.0,214.0,...,,,,595,True,False,True,True,super mario rpg legend of the seven stars,legend of the seven stars
3320,Kengo: Legend of The 9,X360,2006.0,Fighting,Majesco Entertainment,6.0,1.0,0.0,1.0,7.0,...,14.0,Genki,M,3320,True,True,False,True,kengo legend of the,legend of the 9
6682,Jewel Link Chronicles: Legend of Athena,DS,2011.0,Puzzle,GSP,0.0,6.0,0.0,1.0,7.0,...,,,,6682,False,True,False,True,jewel link chronicles legend of athena,legend of athena
628,Croc: Legend of the Gobbos,PS,1997.0,Platform,Fox Interactive,,,,,,...,,,,8987,,,,,croc legend of the gobbos,legend of the gobbos
1354,Jackie Chan Adventures: Legend of the Dark Hand,GBA,2001.0,Action,Activision,,,,,,...,,,,9713,,,,,jackie chan adventures legend of the dark hand,legend of the dark hand
3338,Pirates: Legend of the Black Buccaneer,PS2,2006.0,Adventure,10TACLE Studios,,,,,,...,5.0,WideScreen Games,T,11697,,,,,pirates legend of the black buccaneer,legend of the black buccaneer
7888,Gravity Falls: Legend of the Gnome Gemulets,3DS,2015.0,Action,Ubisoft,,,,,,...,10.0,Ubisoft Osaka,E,16247,,,,,gravity falls legend of the gnome gemulets,legend of the gnome gemulets


nicktoons collection game boy advance video volume


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
2495,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10854,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 1
2497,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10856,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 2
2500,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10859,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 1
2501,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10860,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 2
2506,Dora the Explorer: Game Boy Advance Video Volu...,GBA,2004.0,Misc,Take-Two Interactive,,,,,,...,,,,10865,,,,,dora the explorer game boy advance video volume,game boy advance video volume 1
2507,Cartoon Network Collection: Game Boy Advance V...,GBA,2004.0,Misc,Crave Entertainment,,,,,,...,,,,10866,,,,,cartoon network collection game boy advance vi...,game boy advance video volume 1
2508,All Grown Up!: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,THQ,,,,,,...,,,,10867,,,,,all grown up game boy advance video volume,game boy advance video volume 1
2512,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10871,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 3
2515,Dragon Ball GT: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,Atari,,,,,,...,,,,10874,,,,,dragon ball gt game boy advance video volume,game boy advance video volume 1
2518,Teenage Mutant Ninja Turtles: Game Boy Advance...,GBA,2004.0,Misc,Palcom,,,,,,...,,,,10877,,,,,teenage mutant ninja turtles game boy advance ...,game boy advance video volume 1


sonic x game boy advance video volume


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
2495,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10854,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 1
2497,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10856,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 2
2500,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10859,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 1
2501,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10860,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 2
2506,Dora the Explorer: Game Boy Advance Video Volu...,GBA,2004.0,Misc,Take-Two Interactive,,,,,,...,,,,10865,,,,,dora the explorer game boy advance video volume,game boy advance video volume 1
2507,Cartoon Network Collection: Game Boy Advance V...,GBA,2004.0,Misc,Crave Entertainment,,,,,,...,,,,10866,,,,,cartoon network collection game boy advance vi...,game boy advance video volume 1
2508,All Grown Up!: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,THQ,,,,,,...,,,,10867,,,,,all grown up game boy advance video volume,game boy advance video volume 1
2512,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10871,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 3
2515,Dragon Ball GT: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,Atari,,,,,,...,,,,10874,,,,,dragon ball gt game boy advance video volume,game boy advance video volume 1
2518,Teenage Mutant Ninja Turtles: Game Boy Advance...,GBA,2004.0,Misc,Palcom,,,,,,...,,,,10877,,,,,teenage mutant ninja turtles game boy advance ...,game boy advance video volume 1


nicktoons collection game boy advance video volume


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
2495,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10854,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 1
2497,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10856,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 2
2500,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10859,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 1
2501,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10860,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 2
2506,Dora the Explorer: Game Boy Advance Video Volu...,GBA,2004.0,Misc,Take-Two Interactive,,,,,,...,,,,10865,,,,,dora the explorer game boy advance video volume,game boy advance video volume 1
2507,Cartoon Network Collection: Game Boy Advance V...,GBA,2004.0,Misc,Crave Entertainment,,,,,,...,,,,10866,,,,,cartoon network collection game boy advance vi...,game boy advance video volume 1
2508,All Grown Up!: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,THQ,,,,,,...,,,,10867,,,,,all grown up game boy advance video volume,game boy advance video volume 1
2512,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10871,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 3
2515,Dragon Ball GT: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,Atari,,,,,,...,,,,10874,,,,,dragon ball gt game boy advance video volume,game boy advance video volume 1
2518,Teenage Mutant Ninja Turtles: Game Boy Advance...,GBA,2004.0,Misc,Palcom,,,,,,...,,,,10877,,,,,teenage mutant ninja turtles game boy advance ...,game boy advance video volume 1


nicktoons collection game boy advance video volume


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
2495,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10854,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 1
2497,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10856,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 2
2500,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10859,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 1
2501,The Fairly Odd Parents: Game Boy Advance Video...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10860,,,,,the fairly odd parents game boy advance video ...,game boy advance video volume 2
2506,Dora the Explorer: Game Boy Advance Video Volu...,GBA,2004.0,Misc,Take-Two Interactive,,,,,,...,,,,10865,,,,,dora the explorer game boy advance video volume,game boy advance video volume 1
2507,Cartoon Network Collection: Game Boy Advance V...,GBA,2004.0,Misc,Crave Entertainment,,,,,,...,,,,10866,,,,,cartoon network collection game boy advance vi...,game boy advance video volume 1
2508,All Grown Up!: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,THQ,,,,,,...,,,,10867,,,,,all grown up game boy advance video volume,game boy advance video volume 1
2512,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,2004.0,Misc,THQ,,,,,,...,,,,10871,,,,,spongebob squarepants game boy advance video v...,game boy advance video volume 3
2515,Dragon Ball GT: Game Boy Advance Video Volume 1,GBA,2004.0,Misc,Atari,,,,,,...,,,,10874,,,,,dragon ball gt game boy advance video volume,game boy advance video volume 1
2518,Teenage Mutant Ninja Turtles: Game Boy Advance...,GBA,2004.0,Misc,Palcom,,,,,,...,,,,10877,,,,,teenage mutant ninja turtles game boy advance ...,game boy advance video volume 1


ufo trilogy


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
5514,Metroid Prime: Trilogy,Wii,2009.0,Shooter,Nintendo,42.0,5.0,0.0,14.0,61.0,...,316.0,Retro Studios,T,5514,True,True,False,True,metroid prime trilogy,trilogy


tantei opera milky holmes


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
7069,Detective Opera: Milky Holmes 2,PSP,2012.0,Adventure,BushiRoad,,,,,,...,,,,15428,,,,,detective opera milky holmes,milky holmes 2


power gig rise of the sixstring


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
1256,Battlezone: Rise of the Black Dogs,N64,2000.0,Strategy,Crave Entertainment,6.0,1.0,0.0,0.0,7.0,...,,,,1256,True,True,False,False,battlezone rise of the black dogs,rise of the black dogs
1277,Azurik: Rise of Perathia,XB,2001.0,Action,Microsoft Game Studios,18.0,5.0,0.0,1.0,24.0,...,14.0,Adrenium,T,1277,True,True,False,True,azurik rise of perathia,rise of perathia
1869,Terminator 3: Rise of the Machines,PS2,2003.0,Action,Atari,28.0,22.0,0.0,7.0,57.0,...,23.0,Black Ops Entertainment,T,1869,True,True,False,True,terminator rise of the machines,rise of the machines
1896,Terminator 3: Rise of the Machines,XB,2003.0,Action,Atari,11.0,3.0,0.0,1.0,14.0,...,10.0,Black Ops Entertainment,T,1896,True,True,False,True,terminator rise of the machines,rise of the machines
1919,Terminator 3: Rise of the Machines,GBA,2003.0,Action,Atari,4.0,2.0,0.0,0.0,6.0,...,,Taniko,T,1919,True,True,False,False,terminator rise of the machines,rise of the machines
2142,Final Fantasy XI: Rise of the Zilart,PS2,2003.0,Role-Playing,Square Enix,0.0,0.0,13.0,0.0,13.0,...,,,,2142,False,False,True,False,final fantasy xi rise of the zilart,rise of the zilart
2704,The Incredibles: Rise of the Underminer,GBA,2005.0,Action,THQ,28.0,11.0,0.0,1.0,40.0,...,,Helixe,E,2704,True,True,False,True,the incredibles rise of the underminer,rise of the underminer
2706,The Incredibles: Rise of the Underminer,PS2,2005.0,Action,THQ,18.0,14.0,0.0,5.0,36.0,...,8.0,Heavy Iron Studios,E10+,2706,True,True,False,True,the incredibles rise of the underminer,rise of the underminer
2730,The Incredibles: Rise of the Underminer,GC,2005.0,Action,THQ,12.0,3.0,0.0,0.0,16.0,...,7.0,Heavy Iron Studios,E10+,2730,True,True,False,False,the incredibles rise of the underminer,rise of the underminer
2736,The Incredibles: Rise of the Underminer,DS,2005.0,Action,THQ,13.0,1.0,0.0,1.0,15.0,...,,Helixe,E10+,2736,True,True,False,True,the incredibles rise of the underminer,rise of the underminer


power gig rise of the sixstring


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
1256,Battlezone: Rise of the Black Dogs,N64,2000.0,Strategy,Crave Entertainment,6.0,1.0,0.0,0.0,7.0,...,,,,1256,True,True,False,False,battlezone rise of the black dogs,rise of the black dogs
1277,Azurik: Rise of Perathia,XB,2001.0,Action,Microsoft Game Studios,18.0,5.0,0.0,1.0,24.0,...,14.0,Adrenium,T,1277,True,True,False,True,azurik rise of perathia,rise of perathia
1869,Terminator 3: Rise of the Machines,PS2,2003.0,Action,Atari,28.0,22.0,0.0,7.0,57.0,...,23.0,Black Ops Entertainment,T,1869,True,True,False,True,terminator rise of the machines,rise of the machines
1896,Terminator 3: Rise of the Machines,XB,2003.0,Action,Atari,11.0,3.0,0.0,1.0,14.0,...,10.0,Black Ops Entertainment,T,1896,True,True,False,True,terminator rise of the machines,rise of the machines
1919,Terminator 3: Rise of the Machines,GBA,2003.0,Action,Atari,4.0,2.0,0.0,0.0,6.0,...,,Taniko,T,1919,True,True,False,False,terminator rise of the machines,rise of the machines
2142,Final Fantasy XI: Rise of the Zilart,PS2,2003.0,Role-Playing,Square Enix,0.0,0.0,13.0,0.0,13.0,...,,,,2142,False,False,True,False,final fantasy xi rise of the zilart,rise of the zilart
2704,The Incredibles: Rise of the Underminer,GBA,2005.0,Action,THQ,28.0,11.0,0.0,1.0,40.0,...,,Helixe,E,2704,True,True,False,True,the incredibles rise of the underminer,rise of the underminer
2706,The Incredibles: Rise of the Underminer,PS2,2005.0,Action,THQ,18.0,14.0,0.0,5.0,36.0,...,8.0,Heavy Iron Studios,E10+,2706,True,True,False,True,the incredibles rise of the underminer,rise of the underminer
2730,The Incredibles: Rise of the Underminer,GC,2005.0,Action,THQ,12.0,3.0,0.0,0.0,16.0,...,7.0,Heavy Iron Studios,E10+,2730,True,True,False,False,the incredibles rise of the underminer,rise of the underminer
2736,The Incredibles: Rise of the Underminer,DS,2005.0,Action,THQ,13.0,1.0,0.0,1.0,15.0,...,,Helixe,E10+,2736,True,True,False,True,the incredibles rise of the underminer,rise of the underminer


thomas friends hero of the rails


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
5559,Harvest Moon: Hero of Leaf Valley,PSP,2009.0,Simulation,Rising Star Games,,,,,,...,12.0,Marvelous Entertainment,E,13918,,,,,harvest moon hero of leaf valley,hero of leaf valley


thomas friends hero of the rails


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
5559,Harvest Moon: Hero of Leaf Valley,PSP,2009.0,Simulation,Rising Star Games,,,,,,...,12.0,Marvelous Entertainment,E,13918,,,,,harvest moon hero of leaf valley,hero of leaf valley


meikyuu cross blood reloaded


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
6757,Goldeneye 007: Reloaded,PS3,2011.0,Shooter,Activision,,,,,,...,52.0,Eurocom Entertainment Software,T,15116,,,,,goldeneye reloaded,reloaded
6760,Goldeneye 007: Reloaded,X360,2011.0,Shooter,Activision,,,,,,...,96.0,Eurocom Entertainment Software,T,15119,,,,,goldeneye reloaded,reloaded


black rock shooter the game


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
3253,24: The Game,PS2,2006.0,Adventure,Sony Computer Entertainment,15.0,12.0,0.0,4.0,30.0,...,40.0,SCE Studio Cambridge,M,3253,True,True,False,True,the game,the game
3983,Napoleon Dynamite: The Game,PSP,2007.0,Puzzle,Crave Entertainment,3.0,0.0,0.0,0.0,4.0,...,7.0,7 Studios,E10+,3983,True,False,False,False,napoleon dynamite the game,the game
3986,Napoleon Dynamite: The Game,DS,2007.0,Puzzle,Crave Entertainment,2.0,0.0,0.0,0.0,3.0,...,4.0,7 Studios,E10+,3986,True,False,False,False,napoleon dynamite the game,the game
4463,Igor: The Game,DS,2008.0,Adventure,Legacy Interactive,4.0,0.0,0.0,0.0,4.0,...,,Artefacts Studio,E,4463,True,False,False,False,igor the game,the game
4552,Peppa Pig: The Game,DS,2008.0,Misc,Pinnacle,0.0,81.0,0.0,6.0,87.0,...,,,,4552,False,True,False,True,peppa pig the game,the game
5244,Wipeout: The Game,Wii,2009.0,Misc,Mindscape,194.0,0.0,0.0,12.0,206.0,...,,,,5244,True,False,False,True,wipeout the game,the game
5269,Peppa Pig: The Game,Wii,2009.0,Misc,Pinnacle,0.0,34.0,0.0,3.0,37.0,...,,,,5269,False,True,False,True,peppa pig the game,the game
5393,Countdown: The Game,DS,2009.0,Puzzle,Mindscape,0.0,14.0,0.0,1.0,15.0,...,,,,5393,False,True,False,True,countdown the game,the game
5406,Countdown: The Game,Wii,2009.0,Puzzle,Mindscape,0.0,5.0,0.0,0.0,5.0,...,,,,5406,False,True,False,False,countdown the game,the game
6061,Despicable Me: The Game,Wii,2010.0,Platform,D3Publisher,16.0,8.0,0.0,2.0,27.0,...,,,,6061,True,True,False,True,despicable me the game,the game


demolition company gold edition


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,User_Count,Developer,Rating,id,has_na_sales,has_eu_sales,has_jp_sales,has_other_sales,clean_name,Name_colon_latter
4276,Age of Empires III: Gold Edition,PC,2007.0,Strategy,Microsoft Game Studios,0.0,2.0,0.0,0.0,3.0,...,6.0,Ensemble Studios,T,4276,False,True,False,False,age of empires iii gold edition,gold edition
6226,Serious Sam HD: Gold Edition,PC,2010.0,Shooter,Devolver Digital,0.0,2.0,0.0,0.0,2.0,...,,,,6226,False,True,False,False,serious sam hd gold edition,gold edition
6355,Tropico 3: Gold Edition,PC,2010.0,Strategy,Kalypso Media,0.0,1.0,0.0,0.0,1.0,...,,Haemimont,T,6355,False,True,False,False,tropico gold edition,gold edition
6816,Serious Sam HD: Gold Edition,X360,2011.0,Shooter,Mastertronic,0.0,2.0,0.0,0.0,2.0,...,,,,6816,False,True,False,False,serious sam hd gold edition,gold edition
1489,Soldier of Fortune: Gold Edition,PS2,2001.0,Shooter,Codemasters,,,,,,...,17.0,Pipe Dream Interactive,M,9848,,,,,soldier of fortune gold edition,gold edition
2052,RollerCoaster Tycoon: Gold Edition,PC,2002.0,Strategy,Infogrames,,,,,,...,,Infogrames,E,10411,,,,,rollercoaster tycoon gold edition,gold edition


14 14 0 0 0


In [142]:
# For unknown-publisher titles whose text before the first ':' has at most two
# space-separated tokens, look up known colon-titles (also with a short
# pre-colon part) whose cleaned name starts with the same first word, and adopt
# their most frequent publisher when more than one matching row shares it.
count = 0
count_series = 0
# The n-gram fallbacks are not used in this cell; their counters stay at 0 and
# are kept only so the summary line keeps its five fields.
count_trigram = 0
count_bigram = 0
count_unigram = 0

for i, name, original in zip(df_test_unk.index, df_test_unk['clean_name'], df_test_unk['Name']):
    df_tmp = pd.DataFrame()
    if ':' in original:
        name_former = original.split(':')[0]
        if len(name_former.lower().split(' ')) <= 2:
            name_former = name_former.lower().split(' ')[0]
            # The trailing space makes the prefix match a whole first word.
            df_tmp = df_known[
                df_known['Name_colon_former_lt_2']
                & df_known['Name'].str.contains(':')
                & df_known['clean_name'].str.startswith(name_former + ' ')
            ]
        if df_tmp.shape[0] != 0:
            count_series += 1

    if df_tmp.shape[0] > 0:
        print(name)
        count += 1
        publisher_counts = df_tmp['Publisher'].value_counts()
        if publisher_counts.empty:
            # Every matched row has a missing publisher: nothing to adopt.
            continue
        # value_counts() sorts by frequency, so position 0 holds the most
        # frequent publisher (index label) and its count (value). Reading both
        # directly avoids relying on the column names reset_index() produces,
        # which differ across pandas versions.
        top_publisher = publisher_counts.index[0]
        top_count = publisher_counts.iloc[0]
        # NOTE(review): if 'Unknown' itself is the most frequent publisher among
        # the matches, the row is simply left as 'Unknown' -- consider excluding
        # it from the candidates.
        if top_count > 1:
            if df_test_unk.loc[i, 'Publisher'] != 'Unknown':
                # Sanity check: only rows still labelled 'Unknown' may be filled.
                raise RuntimeError(
                    f"row {i!r} already has publisher "
                    f"{df_test_unk.loc[i, 'Publisher']!r}; expected 'Unknown'"
                )
            df_test_unk.loc[i, 'Publisher'] = top_publisher
print(count, count_series, count_trigram, count_bigram, count_unigram)

combat elite wwii paratroopers
combat elite wwii paratroopers
nicktoons collection game boy advance video volume
Unknown
sonic x game boy advance video volume
Unknown
nicktoons collection game boy advance video volume
Unknown
nicktoons collection game boy advance video volume
Unknown
kid adventures sky captain
power gig rise of the sixstring
Unknown
power gig rise of the sixstring
Unknown
vampire mansion linda hyde
dark parables the exiled prince
real crimes the unicorn killer
sengoku otome legend battle
Unknown
13 13 0 0 0


In [143]:
# Shape of the unknown-publisher frame vs. the rows in it still labelled 'Unknown'.
still_unknown = df_test_unk['Publisher'] == 'Unknown'
df_test_unk.shape, df_test_unk.loc[still_unknown].shape

((116, 13), (109, 13))

In [146]:
# Read id_test_unknown1.csv, copy the publishers from df_test_unk by index
# label, print the 'Unknown' count before and after, and save the result under
# a new name (id_test_unknown.csv).
df_final_test = pd.read_csv(datadir / 'id_test_unknown1.csv')
print(len(df_final_test.loc[df_final_test['Publisher'] == 'Unknown']))
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk['Publisher']
print(len(df_final_test.loc[df_final_test['Publisher'] == 'Unknown']))
id_test_out_path = datadir / 'id_test_unknown.csv'
df_final_test.to_csv(id_test_out_path, index=False)

109
109


In [147]:
# Read country_prob_test_unknown1.csv, copy the publishers from df_test_unk by
# index label, print the 'Unknown' count before and after, and save the result
# under a new name (country_prob_test_unknown.csv).
df_final_test = pd.read_csv(datadir / 'country_prob_test_unknown1.csv')
print(int((df_final_test['Publisher'] == 'Unknown').sum()))
df_final_test.loc[df_test_unk.index, 'Publisher'] = df_test_unk['Publisher']
print(int((df_final_test['Publisher'] == 'Unknown').sum()))
country_prob_out_path = datadir / 'country_prob_test_unknown.csv'
df_final_test.to_csv(country_prob_out_path, index=False)

109
109
