In [1]:
import os
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # used to build the profile report
from matplotlib_venn import venn2 # used to draw Venn diagrams
from tqdm import tqdm
from contextlib import contextmanager
from time import time

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Project-local helpers (metric, logging, LightGBM utilities, seeding).
from mykaggle.metric.mse import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.util.routine import fix_seed

# Use seaborn's 'darkgrid' style for plots drawn after this point.
sns.set_style('darkgrid')

In [2]:
# Experiment configuration, kept inline as YAML so the notebook carries its own settings.
# `metric` is the quoted string "None" (not a YAML null).
_settings_yaml = '''
name: '003_group_kfold'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
'''
settings = yaml.safe_load(_settings_yaml)

  and should_run_async(code)


In [3]:
# Pass the configured seed (settings['seed']) to the project's fix_seed helper.
# NOTE(review): presumably this seeds the random number generators used later — confirm in mykaggle.util.routine.
fix_seed(settings['seed'])

In [4]:
# Input data directory and per-experiment checkpoint directory (named after settings['name']),
# both relative to the notebook's working directory.
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates ../ckpt/ when it is missing (fresh checkout);
# exist_ok=True makes re-running the cell a no-op instead of an error.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Read the train table, the test table and the sample submission from datadir, in that order.
df_train, df_test, df_submission = (
    pd.read_csv(datadir / csv_name)
    for csv_name in ('id_train.csv', 'id_test.csv', 'atmaCup8_sample-submission.csv')
)
# Last expression of the cell: display the (rows, columns) shape of train and test.
df_train.shape, df_test.shape

((8359, 17), (8360, 12))

In [6]:
# Preview the first rows of the training frame as loaded from id_train.csv.
df_train.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id
0,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,180,97,0,28,306,74.0,17.0,7.9,22.0,Traveller's Tales,E10+,0
1,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,151,61,0,21,234,78.0,22.0,6.6,28.0,Traveller's Tales,E10+,1
2,LEGO Batman: The Videogame,PSP,,Action,Warner Bros. Interactive Entertainment,56,44,0,27,128,73.0,5.0,7.4,10.0,Traveller's Tales,E10+,2
3,Combat,2600,,Action,Atari,117,7,0,1,125,,,,,,,3
4,LEGO Harry Potter: Years 5-7,Wii,,Action,Warner Bros. Interactive Entertainment,69,42,0,12,124,76.0,8.0,7.8,13.0,Traveller's Tales,E10+,4


In [7]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

  and should_run_async(code)


In [8]:
# Group-aware K-fold splitter. The fold count is read from the shared settings
# (training.num_folds) so it cannot drift from the rest of the experiment configuration.
splitter = GroupKFold(n_splits=settings['training']['num_folds'])

In [9]:
# Integer-encode Publisher; the encoded column is used as the group key for the splitter.
# NOTE: this overwrites the publisher names in df_train; keep `le` around
# (le.inverse_transform) if the original names are needed again.
le = LabelEncoder()
# Assign with df[col] rather than df.loc[:, col]: with pandas >= 2.0 the .loc form writes
# into the existing object-dtype column in place, leaving the codes stored as dtype object.
# Plain column assignment replaces the column, so it gets the encoder's integer dtype.
df_train['Publisher'] = le.fit_transform(df_train['Publisher'])

In [10]:
# Preview again after label encoding: Publisher now holds the integer codes written by the previous cell.
df_train.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id
0,LEGO Batman: The Videogame,Wii,,Action,282,180,97,0,28,306,74.0,17.0,7.9,22.0,Traveller's Tales,E10+,0
1,LEGO Indiana Jones: The Original Adventures,Wii,,Action,145,151,61,0,21,234,78.0,22.0,6.6,28.0,Traveller's Tales,E10+,1
2,LEGO Batman: The Videogame,PSP,,Action,282,56,44,0,27,128,73.0,5.0,7.4,10.0,Traveller's Tales,E10+,2
3,Combat,2600,,Action,26,117,7,0,1,125,,,,,,,3
4,LEGO Harry Potter: Years 5-7,Wii,,Action,282,69,42,0,12,124,76.0,8.0,7.8,13.0,Traveller's Tales,E10+,4


In [11]:
# Build the folds grouped by (encoded) Publisher and materialise them into a list of
# (train_idx, valid_idx) pairs so they can be iterated more than once.
splits = list(splitter.split(df_train, groups=df_train['Publisher']))

  and should_run_async(code)


In [12]:
# For every fold, print the mean and then the median of Global_Sales
# for the training part and the validation part (train value first).
for train_idx, valid_idx in splits:
    train = df_train.iloc[train_idx]
    valid = df_train.iloc[valid_idx]
    sales_train, sales_valid = train['Global_Sales'], valid['Global_Sales']
    print(sales_train.mean(), sales_valid.mean())
    print(sales_train.median(), sales_valid.median())

65.7209510991476 53.97667464114833
17.0 21.0
47.93315388066397 125.11722488038278
16.0 26.0
67.22730671452071 47.952153110047846
19.0 13.0
70.42889187976671 35.14772727272727
19.0 13.0
65.54844497607655 54.66008378216637
17.0 20.0


  and should_run_async(code)


In [None]:
# import pickle
# pickle.dump(splits, open(datadir / 'group_10fold.pkl', 'wb'))

# 検証

In [None]:
import pickle

# Replace the in-memory `splits` with the folds stored in datadir / 'group_5fold.pkl'.
# NOTE(review): pickle.load can execute arbitrary code while loading; only load
# fold files that were produced by this project.
# The context manager closes the file handle, which the bare open() call never did.
with open(datadir / 'group_5fold.pkl', 'rb') as f:
    splits = pickle.load(f)

In [None]:
# For every loaded fold, print the mean Global_Sales of the training part and the validation part.
for train_idx, valid_idx in splits:
    train = df_train.iloc[train_idx]
    valid = df_train.iloc[valid_idx]
    print(train['Global_Sales'].mean(), valid['Global_Sales'].mean())