# Submitする用のテンプレート

特徴量を読み込んで `LightGBM` を使用したモデルを使用して判別を行います。

## 使用方法

1. 特徴となるデータ(csv)をあらかじめ用意しておく。  
2. 用意したデータのパスを[2]の `train_data_list` と `test_data_list` に追加する。
3. このノートを実行する。

## 読み込むデータの形式について

以下のように `id` を主キーとしたCSVデータを用意してください。

| id | feature1 | feature2 |
|:-----------|------------:|:------------:|
| value | value | value |

In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(4590)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
train_data_list = [
    "exam→" '../input/sample_train.csv'
]
test_data_list = [
    "exam→" '../input/sample_test.csv'
]

In [3]:
display(pd.read_csv( '../input/sample_train').shape)


display(pd.read_csv( '../input/sample_test').shape)

FileNotFoundError: File b'../input/sample_train' does not exist

In [4]:
df_train = pd.DataFrame()
for path in train_data_list: 
    if len(df_train) == 0:
        df_train = pd.read_csv(path)
    else:
        other = pd.read_csv(path) 
        df_train = pd.merge(df_train, other, on='id', how='left')  

df_test = pd.DataFrame()
for path in test_data_list: 
    if len(df_test) == 0: 
        df_test = pd.read_csv(path)
    else: 
        other = pd.read_csv(path) 
        df_test = pd.merge(df_test, other, on='id', how='left')

FileNotFoundError: File b'exam\xe2\x86\x92../input/sample_train.csv' does not exist

In [5]:
display(df_train.head())
display(df_test.head())

NameError: name 'df_test' is not defined

In [6]:
df_train_columns = [c for c in df_train.columns if c not in ['id','revenue', 'outliers']]

In [7]:
df_target = pd.DataFrame()
df_target['value'] = df_train['revenue']
df_train.drop('revenue', axis=1,inplace=False)
df_target['outliers'] = 0
display(df_target)
df_target.loc[df_target['value'] < -30, 'outliers'] = 1
df_target['outliers'].value_counts()

KeyError: 'revenue'

In [8]:
param ={
        'task': 'train',
        'boosting': 'goss',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'subsample': 0.9855232997390695,
        'max_depth': 7,
        'top_rate': 0.9064148448434349,
        'num_leaves': 63,
        'min_child_weight': 41.9612869171337,
        'other_rate': 0.0721768246018207,
        'reg_alpha': 9.677537745007898,
        'colsample_bytree': 0.5665320670155495,
        'min_split_gain': 9.820197773625843,
        'reg_lambda': 8.2532317400459,
        'min_data_in_leaf': 21,
        'verbose': -1
        }

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_target['outliers'])):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=df_target['value'].iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=df_target['value'].iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

np.sqrt(mean_squared_error(oof, df_target['value']))

NameError: name 'df_test' is not defined

In [9]:
date = datetime.datetime.now().strftime('%Y%m%d%H%M')
sub_df = pd.DataFrame({"id":df_test["id"].values})
sub_df["revenue"] = predictions
sub_df.to_csv(f"submission-{date}.csv", index=False)

NameError: name 'df_test' is not defined

In [10]:
date = datetime.datetime.now().strftime('%Y%m%d%H%M')
plt.figure(figsize=(15,100))
feature_importance_df.sort_values('importance', ascending=False, inplace=True)
sns.barplot(x="importance", y="Feature", data=feature_importance_df)
plt.savefig(f"lgbm_importances_{date}.png")

NameError: name 'feature_importance_df' is not defined

<Figure size 1080x7200 with 0 Axes>

In [11]:
#  モデルの保存
date = datetime.datetime.now().strftime('%Y%m%d%H%M')
with open(f"submit_clf_{date}.binaryfile", 'wb') as f:
    pickle.dump(clf , f)

NameError: name 'clf' is not defined