In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from typing import Any, Dict, List, Tuple
from pathlib import Path
import yaml

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用
from tqdm import tqdm
from contextlib import contextmanager
from time import time

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

from mykaggle.metric.rmsle import rmsle
from mykaggle.util.ml_logger import MLLogger
from mykaggle.lib.lgbm_util import compute_importances, save_importances
from mykaggle.util.routine import fix_seed

# Use seaborn's 'darkgrid' style for the plots drawn in this notebook.
sns.set_style('darkgrid')

In [2]:
# Experiment configuration, kept inline as YAML and parsed into a plain dict.
#   name / seed   -> read below for the checkpoint directory name and seeding.
#   training      -> fold count, boosting rounds, early stopping, log interval
#                    (not consumed in the cells visible here).
#   lgbm_params   -> LightGBM parameters (not consumed in the cells visible here).
# NOTE(review): metric is the string "None"; presumably this disables LightGBM's
# built-in metric in favour of a custom one -- confirm where training happens.
settings = yaml.safe_load('''
name: '002_index'
competition: atmacup8
seed: 1019
training:
    num_folds: 5
    num_rounds: 1000
    early_stopping_rounds: 100
    verbose_eval: 20
lgbm_params:
    objective: binary
    learning_rate: 0.05
    max_depth: -1
    num_leaves: 31
    colsample_bytree: .7
    metric: "None"
''')

  and should_run_async(code)


In [3]:
# Pass the configured seed to the project helper.
# NOTE(review): presumably this seeds the random number generators for
# reproducibility -- confirm against mykaggle.util.routine.fix_seed.
fix_seed(settings['seed'])

In [4]:
# Input data directory and a per-experiment checkpoint directory named after
# settings['name'] (both relative to the notebook's working directory).
datadir = Path('../data/')
ckptdir = Path('../ckpt/') / settings['name']
# parents=True also creates a missing '../ckpt'; exist_ok=True keeps the cell
# safe to re-run and avoids the check-then-create race of an explicit
# exists() test.
ckptdir.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the train / test tables and the sample submission from the data
# directory, in that order, then show the train and test shapes.
df_train, df_test, df_submission = (
    pd.read_csv(datadir / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'atmaCup8_sample-submission.csv')
)
df_train.shape, df_test.shape

((8359, 16), (8360, 11))

In [9]:
# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,180,97,0,28,306,74.0,17.0,7.9,22.0,Traveller's Tales,E10+
1,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,151,61,0,21,234,78.0,22.0,6.6,28.0,Traveller's Tales,E10+
2,LEGO Batman: The Videogame,PSP,,Action,Warner Bros. Interactive Entertainment,56,44,0,27,128,73.0,5.0,7.4,10.0,Traveller's Tales,E10+
3,Combat,2600,,Action,Atari,117,7,0,1,125,,,,,,
4,LEGO Harry Potter: Years 5-7,Wii,,Action,Warner Bros. Interactive Entertainment,69,42,0,12,124,76.0,8.0,7.8,13.0,Traveller's Tales,E10+


In [10]:
# Give every training row a sequential integer id starting at 0.
df_train['id'] = range(len(df_train))

  and should_run_async(code)


In [17]:
# Inspect the last rows of the training table, now including the 'id' column.
df_train.tail()

  and should_run_async(code)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id
8354,Stellaris,PC,2016.0,Strategy,Paradox Interactive,0,4,0,0,4,78.0,57.0,8.0,569.0,Paradox Development Studio,,8354
8355,Total War Attila: Tyrants & Kings,PC,2016.0,Strategy,Koch Media,0,1,0,0,1,,,,,,,8355
8356,Brothers Conflict: Precious Baby,PSV,2017.0,Action,Idea Factory,0,0,1,0,1,,,,,,,8356
8357,Phantasy Star Online 2 Episode 4: Deluxe Package,PS4,2017.0,Role-Playing,Sega,0,0,4,0,4,,,,,,,8357
8358,Phantasy Star Online 2 Episode 4: Deluxe Package,PSV,2017.0,Role-Playing,Sega,0,0,1,0,1,,,,,,,8358


In [15]:
# Continue the id sequence after the training rows so that train and test ids
# never overlap: the first test id equals the number of training rows.
start = len(df_train)
df_test['id'] = list(range(start, start + len(df_test)))

In [16]:
df_test.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,id
0,Hitman 2: Silent Assassin,XB,,Action,Eidos Interactive,84.0,23.0,8.0,19.0,Io Interactive,M,8359
1,Legacy of Kain: Soul Reaver,PS,,Action,Eidos Interactive,91.0,17.0,9.0,132.0,Crystal Dynamics,T,8360
2,Metal Gear Solid 2: Substance,XB,,Action,Konami Digital Entertainment,87.0,28.0,8.5,39.0,KCEJ,M,8361
3,Silent Hill: Homecoming,X360,,Action,Konami Digital Entertainment,70.0,54.0,6.9,180.0,Double Helix Games,M,8362
4,Silent Hill: Homecoming,PS3,,Action,Konami Digital Entertainment,71.0,41.0,6.9,143.0,Double Helix Games,M,8363


In [18]:
# Write the id-augmented train and test tables to the data directory.
# index=False keeps the DataFrame index out of the files; the 'id' column
# carries the row identifier instead.
df_train.to_csv(datadir / 'id_train.csv', index=False)
df_test.to_csv(datadir / 'id_test.csv', index=False)

  and should_run_async(code)
