In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

warnings.filterwarnings("ignore")

import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import datetime
from sklearn.model_selection import TimeSeriesSplit
from category_encoders import *
import lightgbm as lgb
import gc

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

<IPython.core.display.Javascript object>

## Read train file

In [3]:
train = pd.read_csv(
    "../" + "train.csv",
    sep=",",
    names=[
        "order_date",
        "requester",
        "service",
        "cpgu_user",
        "service_title",
        "receipt_mfc",
        "order_number",
        "mfc",
        "internal_status",
        "external_status",
        "sub_department",
        "creation_mfc",
        "order_type",
        "department_id",
        "deleted",
        "deleter_fk",
        "custom_service_id",
        "close_date",
        "service_level",
        "issue_date",
        "change_timestamp",
    ],
)
# train = train[train.index != 2713723]  ##delete last non valid row

train["change_timestamp"] = pd.to_datetime(train["change_timestamp"], errors="coerce")
train = train[~train.change_timestamp.isna()]
train["order_date"] = pd.to_datetime(train["order_date"])
train["issue_date"] = pd.to_datetime(train["issue_date"])
train["close_date"] = pd.to_datetime(train["close_date"])
train["order_number"] = pd.to_numeric(train["order_number"], errors="coerce")
train = train[~train.order_number.isna()]
train["order_number"] = train["order_number"].astype(int)

<IPython.core.display.Javascript object>

## Read test file

In [4]:
test_id = pd.read_csv("sample_submission.csv")
test_id.head()

Unnamed: 0,requester,service_title
0,83029018,1
1,123998836,1
2,184301121,1
3,54344611,1
4,184299902,1


<IPython.core.display.Javascript object>

In [5]:
train["split"] = train.requester.isin(test_id.requester) * 1
train.groupby("split").size()

split
0    2593723
1     119723
dtype: int64

<IPython.core.display.Javascript object>

In [6]:
train.split.value_counts(normalize=True)

0    0.955878
1    0.044122
Name: split, dtype: float64

<IPython.core.display.Javascript object>

## Пробуем настроить валидацию
### Первый вариант - берем в валидацию последнюю дату по каждому requester из теста
#### Второй вариант - берем в валидацию все requester из теста

### Далее берем по 10% от офисов, пользователей и времени и проверяем где получится лучше скор.

#### Третий вариант (валидация по юзерам) - берем в валидацию 30% всех данных. Сохраняем долю requester в валидации такой же как в train
##### Четвертый вариант (валидация по времени) - берем в валидацию последние пару месяцев

###### Пятый вариант (валидация по офисам) - берем в валидацию отдельные офисы

In [7]:
# Сначала делаем простые признаки. Прошлое заявление, позапрошлое и так далее
# Отделяем LABEL

<IPython.core.display.Javascript object>

In [8]:
train["service_title_one_month_lag"] = train.groupby(["requester"])[
    "service_title"
].shift(1)
train["service_title_two_months_lag"] = train.groupby(["requester"])[
    "service_title"
].shift(2)
train["service_title_three_months_lag"] = train.groupby(["requester"])[
    "service_title"
].shift(3)
train["LABEL"] = train.groupby(["requester"])["service_title"].shift(-1)
train = train[~train.LABEL.isna()]
train[train.requester == 7184765][
    [
        "order_date",
        "LABEL",
        "service_title",
        "service_title_one_month_lag",
        "service_title_two_months_lag",
        "service_title_three_months_lag",
    ]
]

Unnamed: 0,order_date,LABEL,service_title,service_title_one_month_lag,service_title_two_months_lag,service_title_three_months_lag
0,2019-01-06 15:55:33.527,491,134,,,
1,2019-01-06 16:42:30.203,134,491,134.0,,
3,2019-01-08 14:24:48.943,491,134,491.0,134.0,
4,2019-01-08 15:19:51.693,1169,491,134.0,491.0,134.0
10025,2019-01-10 11:25:30.907,443,1169,491.0,134.0,491.0
17084,2019-01-10 22:50:49.407,527,443,1169.0,491.0,134.0
36572,2019-01-15 11:05:34.320,711,527,443.0,1169.0,491.0
36697,2019-01-15 11:12:01.950,743,711,527.0,443.0,1169.0
36762,2019-01-15 11:15:27.187,491,743,711.0,527.0,443.0
71127,2019-01-19 12:22:40.677,134,491,743.0,711.0,527.0


<IPython.core.display.Javascript object>

In [9]:
max_request_date = (
    train[train.split == 1]
    .groupby("requester")["order_date"]
    .max()
    .rename("order_date")
    .reset_index()
)
max_request_date["train_test_split"] = 1
train = train.merge(max_request_date, how="left",)
train.train_test_split = train.train_test_split.fillna(0)
train["train_test_split"] = train["train_test_split"].astype(int)
train[train.requester == 83029018][
    [
        "order_date",
        "LABEL",
        "train_test_split",
        "service_title",
        "service_title_one_month_lag",
        "service_title_two_months_lag",
        "service_title_three_months_lag",
    ]
]

Unnamed: 0,order_date,LABEL,train_test_split,service_title,service_title_one_month_lag,service_title_two_months_lag,service_title_three_months_lag
10,2019-01-09 09:03:37.870,1169,0,222,,,
430,2019-01-09 10:03:53.827,603,0,1169,222,,
722,2019-01-09 10:28:41.577,901,0,603,1169,222,
1093,2019-01-09 11:02:34.580,412,0,901,603,1169,222
1132,2019-01-09 11:05:08.187,82,0,412,901,603,1169
...,...,...,...,...,...,...,...
1791048,2020-04-24 16:53:09.670,1220,0,637,98,496,1220
1792468,2020-04-27 14:11:15.800,416,0,1220,637,98,496
1793656,2020-04-28 14:41:15.257,98,0,416,1220,637,98
1793987,2020-04-28 16:27:47.410,1275,0,98,416,1220,637


<IPython.core.display.Javascript object>

In [10]:
## fill_na + convert_cat_features

<IPython.core.display.Javascript object>

In [11]:
train = train.fillna("-999")

<IPython.core.display.Javascript object>

In [12]:
train["LABEL"] = train["LABEL"].astype("category").cat.codes

<IPython.core.display.Javascript object>

In [13]:
# how to find cat_features
cat_features = train.dtypes[train.dtypes == object].index.to_list()
# cat_features.pop()
cat_features

['requester',
 'service',
 'cpgu_user',
 'service_title',
 'receipt_mfc',
 'mfc',
 'internal_status',
 'external_status',
 'sub_department',
 'creation_mfc',
 'order_type',
 'department_id',
 'deleted',
 'deleter_fk',
 'custom_service_id',
 'close_date',
 'service_level',
 'issue_date',
 'service_title_one_month_lag',
 'service_title_two_months_lag',
 'service_title_three_months_lag']

<IPython.core.display.Javascript object>

In [14]:
def frequency_encoding(train_df, test_df, columns, self_encoding=False):
    for col in columns:
        temp_df = pd.concat([train_df[[col]], test_df[[col]]])
        fq_encode = temp_df[col].value_counts(dropna=False).to_dict()
        if self_encoding:
            train_df[col] = train_df[col].map(fq_encode)
            test_df[col] = test_df[col].map(fq_encode)
        else:
            train_df[col + "_fq_enc"] = train_df[col].map(fq_encode)
            test_df[col + "_fq_enc"] = test_df[col].map(fq_encode)
    return train_df, test_df

<IPython.core.display.Javascript object>

In [15]:
train_df = train[train.train_test_split == 0].drop(["train_test_split"], axis=1)
test_df = train[train.train_test_split == 1].drop(["train_test_split"], axis=1)

<IPython.core.display.Javascript object>

In [16]:
tr_df, tt_df = frequency_encoding(
    train_df, test_df, columns=cat_features, self_encoding=True,
)

<IPython.core.display.Javascript object>

In [17]:
del train
del train_df
del test_df
del max_request_date
del test_id
gc.collect()

233

<IPython.core.display.Javascript object>

In [18]:
time_split = tr_df.order_date

<IPython.core.display.Javascript object>

In [19]:
X, y = tr_df[cat_features], tr_df.LABEL
del tr_df
gc.collect()

0

<IPython.core.display.Javascript object>

In [None]:
%%time
NFOLDS = 5
tscv = TimeSeriesSplit(n_splits=NFOLDS)


lgb_params = {}
lgb_params["learning_rate"] = 0.1
lgb_params["boosting_type"] = "gbdt"  # GradientBoostingDecisionTree
lgb_params["objective"] = "multiclass"  # Multi-class target feature
lgb_params["metric"] = "multi_error"  # metric for multi-class
lgb_params["max_depth"] = 3
lgb_params["num_class"] = len(y.unique())

predictions = []  # np.zeros(len(tt_df[cat_features]))
oof = []

for fold, (train_index, val_index) in enumerate(tscv.split(time_split)):
    print("Fold:", fold)

    tr_x, tr_y = X.iloc[train_index, :], y[train_index]
    vl_x, vl_y = X.iloc[val_index, :], y[val_index]

    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)

    estimator = lgb.train(
        lgb_params,
        tr_data,
        valid_sets=[tr_data, vl_data],
        verbose_eval=200,
        early_stopping_rounds=10,
    )
    pp_p = estimator.predict(tt_df[cat_features])
    predictions.append([fold, pp_p])
    oof_preds = estimator.predict(vl_x)
    oof.append([fold,oof_preds])
    gc.collect()

Fold: 0
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2]	training's multi_error: 0.780161	valid_1's multi_error: 0.781623
Fold: 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2]	training's multi_error: 0.771505	valid_1's multi_error: 0.803628
Fold: 2
