## Prediction

In [None]:
import http
import os
import traceback
from io import BytesIO
import json
from pathlib import Path

import pandas as pd
import numpy as np
import lightgbm as lgb

In [None]:
# set training job id
# Identifier of the training job whose artifacts are loaded in the next cell;
# it is interpolated into the '/mnt/training_jobs/<id>' result directory path.
# NOTE(review): 'XXXXXX' appears to be a placeholder - replace before running.
TRAINING_JOB = 'XXXXXX'

In [None]:
# Directory holding the artifacts written by the selected training job.
ABEJA_TRAINING_RESULT_DIR = '/mnt/training_jobs/{}'.format(TRAINING_JOB)

# Load the settings saved next to the models; only the JSON parsing needs the
# file handle, so the individual settings are pulled out after it is closed.
env_path = os.path.join(ABEJA_TRAINING_RESULT_DIR, 'lgb_env.json')
with open(env_path) as env_file:
    lgb_env = json.load(env_file)

NFOLD = lgb_env.get('NFOLD')                # number of per-fold model files
cols_train = lgb_env.get('cols_train')      # feature columns, in training order
OBJECTIVE = lgb_env.get('OBJECTIVE')        # LightGBM objective name
IS_MULTI = OBJECTIVE.startswith("multi")    # True for multi* objectives
NUM_CLASS = lgb_env.get('NUM_CLASS', 1)     # falls back to 1 when absent

# One booster per fold, restored from model_0.txt ... model_{NFOLD-1}.txt.
models = [
    lgb.Booster(
        model_file=os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{fold}.txt')
    )
    for fold in range(NFOLD)
]

In [None]:
# read test data
# Path of the test CSV consumed by the prediction cell below. This cell only
# records the path; the file itself is read in the prediction cell.
csvfile = 'data_test.csv'

In [None]:
# prediction
# Parse the CSV once and slice both the features and the target from the same
# frame (the file was previously read twice). The explicit copy keeps X_test
# independent of test_df, so adding the 'pred' column below is a plain
# assignment on its own frame.
test_df = pd.read_csv(csvfile)
X_test = test_df[cols_train].copy()
Y_test = test_df['num_orders_log']

# Accumulator for the fold-averaged prediction: one column per class for
# multi* objectives, a flat vector otherwise.
if IS_MULTI:
    pred = np.zeros((len(X_test), NUM_CLASS))
else:
    pred = np.zeros(len(X_test))
for model in models:
    pred += model.predict(X_test)
pred /= len(models)

# Turn averaged scores into labels for classification objectives;
# other objectives keep the averaged value as-is.
if OBJECTIVE == 'binary':
    # Threshold the averaged score at 0.5.
    pred[pred >= 0.5] = 1
    pred[pred < 0.5] = 0
elif IS_MULTI:
    # Pick the class with the highest averaged score.
    pred = np.argmax(pred, axis=1)

print(pred)
# Stored on X_test because the following cells read X_test['pred'].
X_test['pred'] = pred

In [None]:
# Preview the first rows of the predictions stored in X_test['pred'].
X_test['pred'].head()

In [None]:
# Preview the first rows of the observed target for comparison with the predictions.
Y_test.head()

In [None]:
from sklearn.metrics import mean_squared_error

# Root of the mean squared error between the observed target and predictions.
# NOTE(review): Y_test is the 'num_orders_log' column, so this is reported as
# RMSLE on the assumption that the column is log-transformed - confirm.
# Arguments follow the documented (y_true, y_pred) order.
rmsle = np.sqrt(mean_squared_error(Y_test, X_test['pred']))
print('rmsle is {}'.format(rmsle))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

yvalues = np.concatenate([Y_test, X_test['pred']])
ymin, ymax, yrange = np.amin(yvalues), np.amax(yvalues), np.ptp(yvalues)
ymin, ymax, yrange = np.amin(yvalues), np.amax(yvalues), np.ptp(yvalues)
fig = plt.figure(figsize=(8, 8))
sns.scatterplot(Y_test, X_test['pred'])
plt.plot([ymin - yrange * 0.01, ymax + yrange * 0.01], [ymin - yrange * 0.01, ymax + yrange * 0.01])
plt.xlim(ymin - yrange * 0.01, ymax + yrange * 0.01)
plt.ylim(ymin - yrange * 0.01, ymax + yrange * 0.01)
plt.xlabel('y_observed', fontsize=24)
plt.ylabel('y_predicted', fontsize=24)
plt.title('Observed-Predicted Plot', fontsize=24)
plt.tick_params(labelsize=16)