In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

sns.set_style('darkgrid')

In [2]:
# Experiment configuration, written as inline YAML and parsed into a plain dict.
# Only `name` (checkpoint directory) and `seed` are read by the cells visible in
# this notebook; `training` and `lgbm_params` are not referenced here.
# NOTE(review): `metric: "None"` is the string "None", not a YAML null —
# presumably intentional for LightGBM; confirm against the training code.
settings = yaml.safe_load('''
name: '225_eda_pseudo_sales'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
''')

  and should_run_async(code)


In [3]:
# Seed randomness through the project helper, using the configured seed.
# NOTE(review): which libraries fix_seed actually seeds is not visible here.
fix_seed(settings['seed'])

In [4]:
# Input data directory and the per-experiment checkpoint directory.
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True creates '../ckpt' itself on a fresh checkout; exist_ok=True makes
# the cell safe to re-run and removes the check-then-create race.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Read the training set, the test set and the sample submission from datadir.
df_train, df_test, df_submission = (
    pd.read_csv(datadir / csv_name)
    for csv_name in ('id_train.csv', 'id_test_unknown.csv', 'atmaCup8_sample-submission.csv')
)
# Show (rows, columns) for train and test.
df_train.shape, df_test.shape

((8358, 17), (8360, 12))

In [6]:
# First five training rows, transposed so every column is shown as a row.
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
# First five test rows, transposed so every column is shown as a row.
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
# First rows of the sample submission (a single Global_Sales column).
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [30]:
# Per-region boolean flags: True when the title has strictly positive sales
# in that region. These are added to df_train in place.
df_train.loc[:, 'has_na_sales'] = df_train['NA_Sales'].gt(0)
df_train.loc[:, 'has_eu_sales'] = df_train['EU_Sales'].gt(0)
df_train.loc[:, 'has_jp_sales'] = df_train['JP_Sales'].gt(0)
df_train.loc[:, 'has_other_sales'] = df_train['Other_Sales'].gt(0)
# Export currently disabled:
# df_train.to_csv(datadir / 'region_train.csv', index=False)

  and should_run_async(code)


In [10]:
# Working copies of train/test, tagged with their origin, then stacked.
train = df_train.copy()
test = df_test.copy()
train['is_test'] = False
test['is_test'] = True
# ignore_index=True gives `whole` a unique 0..N-1 index. Without it the train
# and test row labels overlap, so label-based selection or any code that treats
# labels as positions silently hits the wrong rows.
whole = pd.concat([train, test], ignore_index=True)

  and should_run_async(code)


In [11]:
# Auxiliary frame dictionaries passed to the project feature classes
# (see the NameSeriesCount call further down). 'main' is the frame being
# featurised and 'another' is the opposite split; each entry is an independent copy.
# NOTE(review): the exact contract of these keys lives in mykaggle — confirm there.
train_others = {
    'main': df_train.copy(),
    'another': df_test.copy()
}
test_others = {
    'main': df_test.copy(),
    'another': df_train.copy(),
}

# 国のセールスとカテゴリ (Regional sales by category)

In [None]:
# The four regional sales columns plus the global total, as named in the training CSV.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

In [None]:
# Instantiate the project's TECountryRate feature.
# NOTE(review): the meaning of the positional `True` is defined in mykaggle and
# not visible here; `tecr` is not used by any cell shown in this notebook.
from mykaggle.feature.te_country_rate import TECountryRate
tecr = TECountryRate(True)

In [None]:
# Column-wise totals of every sales column over the whole training set.
train.loc[:, sales_columns].sum(axis=0)

In [None]:
# Sales totals per publisher, extended with '<column>_rate' columns holding
# each total divided by that publisher's Global_Sales total.
df_pub_sales = (
    train.groupby('Publisher')[sales_columns]
    .sum()
    .pipe(lambda totals: totals.join(
        totals.div(totals['Global_Sales'], axis=0).add_suffix('_rate')
    ))
)
df_pub_sales.head()

In [None]:
# Is there a discrepancy between Global_Sales and the sum of the regions?
# -> roughly half of the groups are slightly off.
region_columns = ['EU_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales']
# Vectorised replacement for the row-by-row iterrows loop; skipna=False keeps
# NaN propagation identical to np.sum over the four values.
regional_total = df_pub_sales[region_columns].sum(axis=1, skipna=False)
gap = df_pub_sales['Global_Sales'] - regional_total
diff = gap[df_pub_sales['Global_Sales'] != regional_total].tolist()
# np.max / np.min raise on an empty list, so report an explicit "no mismatch"
# result instead of crashing when every group adds up.
(len(diff), np.max(diff), np.min(diff)) if diff else (0, None, None)

In [None]:
# Platform: same totals-and-rates table, grouped by Platform.
# (The variable name df_pub_sales is reused from the publisher cell.)
df_pub_sales = (
    train.groupby('Platform')[sales_columns]
    .sum()
    .pipe(lambda totals: totals.join(
        totals.div(totals['Global_Sales'], axis=0).add_suffix('_rate')
    ))
)
df_pub_sales.head()

In [None]:
# Genre: same totals-and-rates table, grouped by Genre.
# (The variable name df_pub_sales is reused from the publisher cell.)
df_pub_sales = (
    train.groupby('Genre')[sales_columns]
    .sum()
    .pipe(lambda totals: totals.join(
        totals.div(totals['Global_Sales'], axis=0).add_suffix('_rate')
    ))
)
df_pub_sales

In [None]:
# Year: same totals-and-rates table, grouped by Year_of_Release.
# (The variable name df_pub_sales is reused from the publisher cell.)
df_pub_sales = (
    train.groupby('Year_of_Release')[sales_columns]
    .sum()
    .pipe(lambda totals: totals.join(
        totals.div(totals['Global_Sales'], axis=0).add_suffix('_rate')
    ))
)
df_pub_sales

In [15]:
# Names of earlier experiments whose prediction CSVs are loaded below; each
# name is both a sub-directory of ../ckpt/ and the stem of the CSV inside it.
# NOTE(review): '221_other_sales' lacks the 'has_' prefix the other three use —
# presumably that is the real directory name; confirm on disk.
pred_sales = [
    '224_has_jp_sales',
    '222_has_na_sales',
    '223_has_eu_sales',
    '221_other_sales'
]

In [None]:
# Test set: collect the per-region sales predictions

In [16]:
# Read each experiment's test-set prediction file, '<name>/<name>.csv' under
# the checkpoint root, and place the frames side by side.
basedir = Path('../ckpt/')
dfs = [pd.read_csv(basedir / pred / f'{pred}.csv') for pred in pred_sales]

df_sales = pd.concat(dfs, axis=1)

In [17]:
# Inspect the combined test-set predictions (one column per region flag).
df_sales

Unnamed: 0,has_jp_sales,has_na_sales,has_eu_sales,has_other_sales
0,0.034041,0.996483,0.953062,0.684615
1,0.530014,0.982488,0.966240,0.896287
2,0.131849,0.990622,0.941580,0.795452
3,0.419454,0.988447,0.990163,0.955910
4,0.564849,0.974045,0.986734,0.958058
...,...,...,...,...
8355,0.929947,0.041290,0.035178,0.094312
8356,0.916582,0.050759,0.062712,0.286170
8357,0.010980,0.013907,0.974339,0.037507
8358,0.014382,0.013175,0.943309,0.032373


In [18]:
# Attach the predicted region probabilities to the test rows and save them.
# `df_sales` is rebound to the *training* predictions further down, and
# concat(axis=1) aligns on the index and silently pads with NaN, so fail
# loudly if this cell is re-run against a frame of the wrong length.
if len(df_sales) != len(df_test):
    raise ValueError(
        f'df_sales has {len(df_sales)} rows but df_test has {len(df_test)}; '
        'reload the test-set predictions before running this cell'
    )
df_test_sales = pd.concat([df_test, df_sales], axis=1)
df_test_sales.to_csv(datadir / 'region_prob_test.csv', index=False)
df_test_sales.head()

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,has_jp_sales,has_na_sales,has_eu_sales,has_other_sales
0,Hitman 2: Silent Assassin,XB,,Action,Eidos Interactive,84.0,23.0,8.0,19.0,Io Interactive,M,8359,0.034041,0.996483,0.953062,0.684615
1,Legacy of Kain: Soul Reaver,PS,,Action,Eidos Interactive,91.0,17.0,9.0,132.0,Crystal Dynamics,T,8360,0.530014,0.982488,0.96624,0.896287
2,Metal Gear Solid 2: Substance,XB,,Action,Konami Digital Entertainment,87.0,28.0,8.5,39.0,KCEJ,M,8361,0.131849,0.990622,0.94158,0.795452
3,Silent Hill: Homecoming,X360,,Action,Konami Digital Entertainment,70.0,54.0,6.9,180.0,Double Helix Games,M,8362,0.419454,0.988447,0.990163,0.95591
4,Silent Hill: Homecoming,PS3,,Action,Konami Digital Entertainment,71.0,41.0,6.9,143.0,Double Helix Games,M,8363,0.564849,0.974045,0.986734,0.958058


In [None]:
# Training set: collect the per-region sales predictions

In [19]:
# Read each experiment's training-set prediction file,
# '<name>/train_<name>.csv' under the checkpoint root, and place the frames
# side by side. This rebinds df_sales, which previously held test predictions.
basedir = Path('../ckpt/')
dfs = [pd.read_csv(basedir / pred / f'train_{pred}.csv') for pred in pred_sales]

df_sales = pd.concat(dfs, axis=1)

  and should_run_async(code)


In [20]:
# Inspect the combined training-set predictions (one column per region flag).
df_sales

Unnamed: 0,has_jp_sales,has_na_sales,has_eu_sales,has_other_sales
0,0.026652,0.997914,0.981053,0.973977
1,0.028343,0.997332,0.989190,0.957597
2,0.026891,0.998058,0.970664,0.971480
3,0.014780,0.987698,0.925745,0.284165
4,0.032148,0.997392,0.982094,0.971330
...,...,...,...,...
8353,0.013658,0.028792,0.994310,0.220813
8354,0.012668,0.015650,0.982969,0.029630
8355,0.954928,0.020139,0.024980,0.022404
8356,0.924988,0.234916,0.199957,0.317849


In [21]:
# Region-flag column names shared by df_train (boolean flags) and df_sales (predictions).
has_sales = ['has_jp_sales', 'has_na_sales', 'has_eu_sales', 'has_other_sales']

  and should_run_async(code)


In [22]:
# Training frame whose region-flag columns hold predicted probabilities
# instead of the boolean flags, saved for downstream use.
# df_sales is also used for the test predictions earlier on; guard against a
# stale frame, since a length mismatch would otherwise yield silent NaNs.
if len(df_sales) != len(df_train):
    raise ValueError(
        f'df_sales has {len(df_sales)} rows but df_train has {len(df_train)}; '
        'reload the training-set predictions before running this cell'
    )
df_train_sales = df_train.copy()
for c in has_sales:
    # Plain column assignment replaces the column together with its dtype.
    # Writing floats through `.loc[:, c]` into an existing boolean column
    # relies on silent upcasting, which newer pandas deprecates.
    df_train_sales[c] = df_sales[c]
df_train_sales.to_csv(datadir / 'region_prob_train.csv', index=False)

In [24]:
# (rows, columns) of the training frame carrying predicted probabilities.
df_train_sales.shape

  and should_run_async(code)


(8358, 21)

In [28]:
# Share of (row, region) cells where thresholding the predicted probability
# at 0.55 reproduces the boolean flag stored in df_train.
df_tmp = pd.concat(
    [df_train[c] == (df_train_sales[c] > 0.55) for c in has_sales],
    axis=1,
)
df_tmp.values.mean()

  and should_run_async(code)


0.9469370662837999

In [29]:
# Hard region flags for the test set: probability > threshold -> True.
# The threshold is the same 0.55 used for the training-set agreement check.
prob_threshold = 0.55
df_test_det = df_test_sales.copy()
for c in has_sales:
    # Plain column assignment swaps the float column for a boolean one.
    # Writing booleans through `.loc[:, c]` into a float column relies on
    # silent dtype changes, which newer pandas deprecates.
    df_test_det[c] = df_test_sales[c] > prob_threshold
df_test_det.to_csv(datadir / 'region_test.csv', index=False)
df_test_det.tail()

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id,has_jp_sales,has_na_sales,has_eu_sales,has_other_sales
8355,SD Gundam G Generation Genesis,PSV,2016.0,Strategy,Namco Bandai Games,,,,,,,16714,True,False,False,False
8356,SD Gundam G Generation Genesis,PS4,2016.0,Strategy,Namco Bandai Games,,,,,,,16715,True,False,False,False
8357,Battle Worlds: Kronos,PC,2016.0,Strategy,Nordic Games,71.0,15.0,7.1,71.0,KING Art,,16716,False,False,True,False
8358,Codename: Panzers Complete Collection,PC,2016.0,Strategy,Nordic Games,,,,,,,16717,False,False,True,False
8359,Imagine: Makeup Artist,DS,2020.0,Simulation,Ubisoft,,,tbd,,Ubisoft,E,16718,False,True,False,False


In [None]:
# Predicted probabilities for the first training rows.
df_train_sales.loc[:, ['has_jp_sales', 'has_na_sales', 'has_eu_sales', 'has_other_sales']].head()

In [None]:
# Boolean flags derived from the actual sales, for the same rows — for comparison.
df_train.loc[:, ['has_jp_sales', 'has_na_sales', 'has_eu_sales', 'has_other_sales']].head()

# Name

In [None]:
import texthero
from texthero import preprocessing

# Text-normalisation steps applied, in this order, to every game title.
custom_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_whitespace
]

# Store the normalised title next to the raw Name on the combined train+test frame.
whole.loc[:, 'clean_name'] = texthero.clean(whole.loc[:, 'Name'], pipeline=custom_pipeline)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_text_ngrams(corpus, n, g, s):
    """Return the most frequent ``g``-grams of ``corpus`` with their counts.

    Args:
        corpus: Collection of text documents accepted by ``CountVectorizer``.
        n: Maximum number of ``(ngram, count)`` pairs to return.
        g: N-gram size; only n-grams of exactly ``g`` tokens are counted.
        s: Exclusive lower bound on the corpus-wide count; n-grams that
            occur ``s`` times or fewer are dropped.

    Returns:
        A list of at most ``n`` ``(ngram, count)`` tuples sorted by count in
        descending order. Ties keep the vectorizer's vocabulary order.
    """
    vectorizer = CountVectorizer(ngram_range=(g, g))
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass, instead of tokenising the corpus once in fit() and again
    # in transform().
    doc_term = vectorizer.fit_transform(corpus)
    # Collapse the (1, vocabulary) column-sum into a flat array of totals.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    frequent = [
        (ngram, totals[col])
        for ngram, col in vectorizer.vocabulary_.items()
        if totals[col] > s
    ]
    # list.sort is stable, so equal counts stay in vocabulary order.
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    return frequent[:n]

In [None]:
# Up to 10000 bigrams of the cleaned titles that occur more than 5 times,
# as (bigram, count) pairs ordered from most to least frequent.
most_common_bi = get_top_text_ngrams(whole.loc[:, 'clean_name'], 10000, 2, 5)

In [None]:
# Inspect the bigram list (most frequent first at this point).
most_common_bi

In [None]:
# Reverse the list so it runs from least to most frequent bigram.
# NOTE(review): not idempotent — running this cell twice restores the original order.
most_common_bi = most_common_bi[::-1]

In [None]:
# Drop the last two entries; after the reversal above these are the two most
# frequent bigrams.
# NOTE(review): why exactly two are dropped is not stated — presumably they
# are too generic to indicate a series; confirm. Not idempotent: every re-run
# removes two more entries.
most_common_bi = most_common_bi[:-2]

In [None]:
# Inspect the reversed and trimmed bigram list.
most_common_bi

In [None]:
# num_word_series: corpus-wide count of a frequent bigram contained in the
# cleaned title, 0 when the title contains none. Bigrams are visited in list
# order, so when a title matches several the last one visited wins (the most
# frequent one, provided the list above was reversed exactly once).
whole["num_word_series"] = 0
most_common_bi = dict(most_common_bi)
clean_names = whole['clean_name']
for bigram, count in most_common_bi.items():
    if bigram == 'Final Fantasy'.lower():
        print(bigram)
    # Literal substring match; na=False treats missing names as "no match".
    hit = clean_names.str.contains(bigram, regex=False, na=False).to_numpy()
    # Select rows by boolean mask and the column by name. Feeding index
    # *labels* to .iloc targets the wrong rows whenever the index is not a
    # unique 0..N-1 range (as after concatenating train and test), and
    # column position -1 is only correct while this is the last column.
    whole.loc[hit, 'num_word_series'] = count

In [None]:
# Number of distinct platforms each title appears on in the training set,
# returned as a two-column frame (Name, Platform).
name_platform = train.groupby('Name')['Platform'].nunique().reset_index()
name_platform

In [None]:
# Instantiate the project's NameSeriesCount feature.
# NOTE(review): the meaning of the positional `True` is defined in mykaggle and
# not visible here.
from mykaggle.feature.name_series_count import NameSeriesCount
nsc = NameSeriesCount(True)

In [None]:
# Run the feature on the training frame, passing the auxiliary train/test
# copies, and display the result.
# NOTE(review): the result is indexed by column name in the next cell, so it
# is presumably a DataFrame — confirm in mykaggle.
a = nsc(train, train_others)
a

In [None]:
# Show the title next to the three num_word_series_* columns of the feature output.
a[['Name', 'num_word_series_1', 'num_word_series_2', 'num_word_series_3']]