In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.lib.pandas_util import change_column_name
from mykaggle.util.routine import fix_seed

sns.set_style('darkgrid')

In [2]:
settings = yaml.safe_load('''
name: '200_eda'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
''')

  and should_run_async(code)


In [3]:
fix_seed(settings['seed'])
pd.set_option('display.max_rows', 500)

In [4]:
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
if not ckptdir.exists():
    ckptdir.mkdir()

In [5]:
df_train = pd.read_csv(datadir / 'id_train.csv')
df_test = pd.read_csv(datadir / 'id_test.csv')
df_submission = pd.read_csv(datadir / 'atmaCup8_sample-submission.csv')
df_train.shape, df_test.shape

((8359, 17), (8360, 12))

In [6]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
Name,LEGO Batman: The Videogame,LEGO Indiana Jones: The Original Adventures,LEGO Batman: The Videogame,Combat,LEGO Harry Potter: Years 5-7
Platform,Wii,Wii,PSP,2600,Wii
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Warner Bros. Interactive Entertainment,LucasArts,Warner Bros. Interactive Entertainment,Atari,Warner Bros. Interactive Entertainment
NA_Sales,180,151,56,117,69
EU_Sales,97,61,44,7,42
JP_Sales,0,0,0,0,0
Other_Sales,28,21,27,1,12
Global_Sales,306,234,128,125,124


In [7]:
df_test.head().T

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4
Name,Hitman 2: Silent Assassin,Legacy of Kain: Soul Reaver,Metal Gear Solid 2: Substance,Silent Hill: Homecoming,Silent Hill: Homecoming
Platform,XB,PS,XB,X360,PS3
Year_of_Release,,,,,
Genre,Action,Action,Action,Action,Action
Publisher,Eidos Interactive,Eidos Interactive,Konami Digital Entertainment,Konami Digital Entertainment,Konami Digital Entertainment
Critic_Score,84,91,87,70,71
Critic_Count,23,17,28,54,41
User_Score,8,9,8.5,6.9,6.9
User_Count,19,132,39,180,143
Developer,Io Interactive,Crystal Dynamics,KCEJ,Double Helix Games,Double Helix Games


In [8]:
df_submission.head()

  and should_run_async(code)


Unnamed: 0,Global_Sales
0,63.371815
1,63.371815
2,63.371815
3,63.371815
4,63.371815


In [9]:
df_train.loc[:, 'has_na_sales'] = df_train.loc[:, 'NA_Sales']  > 0
df_train.loc[:, 'has_eu_sales'] = df_train.loc[:, 'EU_Sales']  > 0
df_train.loc[:, 'has_jp_sales'] = df_train.loc[:, 'JP_Sales']  > 0
df_train.loc[:, 'has_other_sales'] = df_train.loc[:, 'Other_Sales']  > 0
# df_train.to_csv(datadir / 'country_train.csv', index=False)

  and should_run_async(code)


In [10]:
# 要検討
df_best_pred = pd.read_csv('../ckpt/187_name_count/187_name_count.csv')
df_test.loc[:, 'Global_Sales'] = df_best_pred.loc[:, 'Global_Sales']

  and should_run_async(code)


In [11]:
train = df_train.copy()
test = df_test.copy()
train['is_test'] = False
test['is_test'] = True
df_whole = pd.concat([train, test])

In [12]:
train_others = {
    'main': df_train.copy(),
    'another': df_test.copy()
}
test_others = {
    'main': df_test.copy(),
    'another': df_train.copy(),
}

# Publisher の特徴作る

In [68]:
from typing import Optional, Dict
import pandas as pd
from sklearn.decomposition import PCA

from mykaggle.feature.base import Feature
from mykaggle.transform.pivot import PivotTransform

COLUMNS = [
    'Genre',
    'Platform',
    'Year_of_Release',
    'Developer'
]


class PubToCategoriyPivotPCAAll:
    '''
    Publisher から見た各カテゴリ pivot and pca
    '''

    def __init__(self, train: bool = True, n_components: int = 10) -> None:
        self.train = train
        self.name = ''
        self.n_components = n_components

    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        if self.train:
            df_whole = pd.concat([df_main, df_another])
        else:
            df_whole = pd.concat([df_another, df_main])
#         df_whole = df_main.copy()
        df_pivot = None
        for i, c in enumerate(COLUMNS):
            transform = PivotTransform(indices=['Publisher'], column=c, target='id', aggs=['count'], fillna=0)
            pub_to_c = transform(df_whole)
            if df_pivot is None:
                df_pivot = pub_to_c
            else:
                df_pivot = pd.merge(df_pivot, pub_to_c, how='left', on='Publisher')

        df_pivot = df_pivot.fillna(0)
        return df_pivot
#         df_pca = pd.DataFrame(self._pca_transform(df_pivot, self.n_components))

#         df_pivot = pd.concat([df_pivot, df_pca], axis=1)
#         df_pivot = df_pivot.iloc[:, [0] + list(range(-1, -self.n_components - 1, -1))]
#         pca_columns = ['_'.join(['pca', str(n), 'count_id_pivotby_Publisher_for_all'])
#                        for n in range(self.n_components)]
#         df_pivot.columns = ['Publisher'] + pca_columns
#         df_main = pd.merge(df_main, df_pivot, how='left', on='Publisher')
#         return df_main.loc[:, pca_columns]

    def _pca_transform(self, df: pd.DataFrame, n_components: int):
        pca = PCA(n_components)
        return pca.fit_transform(df.drop('Publisher', axis=1).values)

    
ppp_train = PubToCategoriyPivotPCAAll(train=True)
ppp_test = PubToCategoriyPivotPCAAll(train=False)

  and should_run_async(code)


In [80]:
df_pivot = ppp_train.create(df_train.copy(), train_others)
# df_pivot_test = ppp_test.create(df_test.copy(), test_others)

In [73]:
df_train.groupby('Publisher')['Name'].count().shape, df_test.groupby('Publisher')['Name'].count().shape

((295,), (286,))

In [70]:
df_pivot_train.shape, df_pivot_test.shape

  and should_run_async(code)


((581, 1778), (581, 1778))

In [79]:
train_publisher = df_train['Publisher'].unique()
test_publisher = df_test['Publisher'].unique()

In [84]:
def is_train_pub(x: str) -> bool:
    return x in train_publisher

In [85]:
df_pivot['is_train'] = df_pivot['Publisher'].apply(is_train_pub)

In [90]:
df_pivot_train = df_pivot[df_pivot['is_train']]
df_pivot_test = df_pivot[~df_pivot['is_train']]
df_pivot_train.shape, df_pivot_test.shape

((295, 1779), (286, 1779))

In [91]:
n_component = 10
pca = PCA(n_component)
pca_train = pca.fit_transform(df_pivot_train.drop(['Publisher', 'is_train'], axis=1).values)
pca_test = pca.transform(df_pivot_test.drop(['Publisher', 'is_train'], axis=1).values)

In [92]:
pca_train.shape, pca_test.shape

  and should_run_async(code)


((295, 10), (286, 10))

In [95]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity

In [98]:
eucl_dist = euclidean_distances(pca_train, pca_test)
manh_dist = manhattan_distances(pca_train, pca_test)
cos_dist = cosine_similarity(pca_train, pca_test)

  and should_run_async(code)


In [123]:
publisher_map = df_pivot_train[['Publisher']].copy()
publisher_map = publisher_map.reset_index()[['Publisher']]
publisher_map = {k: v for k, v in zip(publisher_map.index, publisher_map['Publisher'])}

In [127]:
def make_new_publisher(dist, pub_map):
    df = df_pivot_test[['Publisher']].copy()
    df['new_publisher'] = np.argmin(dist, 0)
    df['new_publisher'] = df['new_publisher'].apply(lambda x: pub_map[x])
    return df

In [128]:
df_euc = make_new_publisher(eucl_dist, publisher_map)
df_man = make_new_publisher(manh_dist, publisher_map)
df_cos = make_new_publisher(cos_dist, publisher_map)

In [129]:
df_euc.head()

Unnamed: 0,Publisher,new_publisher
0,10TACLE Studios,Funsta
1,1C Company,Iceberg Interactive
2,20th Century Fox Video Games,Coleco
4,3DO,Play It
5,49Games,Detn8 Games


In [133]:
df_man.head()

  and should_run_async(code)


Unnamed: 0,Publisher,new_publisher
0,10TACLE Studios,Funsta
1,1C Company,Iceberg Interactive
2,20th Century Fox Video Games,Coleco
4,3DO,989 Studios
5,49Games,Detn8 Games


In [132]:
df_cos.head()

  and should_run_async(code)


Unnamed: 0,Publisher,new_publisher
0,10TACLE Studios,Sega
1,1C Company,Sega
2,20th Century Fox Video Games,Sega
4,3DO,Nobilis
5,49Games,THQ


In [134]:
df_euc.to_csv(datadir / 'pub_map_euc.csv', index=False)
df_man.to_csv(datadir / 'pub_map_man.csv', index=False)
df_cos.to_csv(datadir / 'pub_map_cos.csv', index=False)

  and should_run_async(code)


In [136]:
def save_new_test(df_map, name: str):
    test = pd.read_csv(datadir / 'country_prob_test.csv')
    test = pd.merge(test, df_map, how='left', on='Publisher')
    test['Publisher'] = test['new_publisher']
    test = test.drop('new_publisher', axis=1)
    test.to_csv(datadir / f'pubmap_test_{name}.csv', index=False)

  and should_run_async(code)


In [137]:
save_new_test(df_euc, 'euc')
save_new_test(df_man, 'man')
save_new_test(df_cos, 'cos')