In [1]:
import pandas as pd
import numpy as np
import feather

In [222]:
# Example prediction file, read without a header row (header=None).
# NOTE(review): `df_example` is not referenced by any later cell visible in this
# notebook — presumably kept only as a reference for the submission format.
df_example = pd.read_csv('data/prediction_example.csv', header=None)

In [3]:
# Payment records loaded from a feather file. Later cells read its `shop_id`,
# `day` and `pays_count` columns (presumably one aggregated row per shop per
# day — TODO confirm against the script that produced the file).
df_pays = feather.read_dataframe('data/df_pays_agg.feather')

In [5]:
# Most recent day with a payment record, per shop.
last_day = df_pays.groupby('shop_id')['day'].max()
# Display only the shops whose records stop before 2016-10-31.
last_day.loc[last_day < pd.to_datetime('2016-10-31')]

shop_id
987    2016-10-30
1707   2016-10-24
1824   2016-10-10
Name: day, dtype: datetime64[ns]

Evaluation metric:

![](https://img.alicdn.com/tps/TB1PDLzOVXXXXXgaXXXXXXXXXXX-814-184.png)

Data: available up to and including 2016-10-31.
Need to predict: 2016-11-01 through 2016-11-14 (14 days).

In [245]:
def evaluation_metric(y_true, y_pred):
    """Mean of |y_pred - y_true| / (y_pred + y_true) over all entries.

    Entries where ``y_true < 1`` contribute 0 to the mean regardless of the
    prediction; this also covers the 0/0 case where both values are zero.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape.
        Converted to float arrays, so lists are accepted and values are
        compared position by position.

    Returns
    -------
    float
        The averaged score; 0 means every counted entry was predicted exactly.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # 0/0 (both values zero) yields NaN plus a RuntimeWarning; those entries
    # are overwritten with 0 right below, so the warning is only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = (y_pred - y_true) / (y_pred + y_true)
    scores[y_true < 1] = 0.0

    return np.abs(scores).mean()

In [246]:
# Hold out every day after 2016-10-17 for validation; everything up to and
# including that day is training data.
in_train = df_pays['day'] <= pd.to_datetime('2016-10-17')
in_val = df_pays['day'] > pd.to_datetime('2016-10-17')

df_pays_train = df_pays.loc[in_train]
df_pays_val = df_pays.loc[in_val]

# Number of distinct days in the validation part.
df_pays_val['day'].nunique()

14

In [247]:
def fill_holes(df, day_min=None, day_max=None):
    """Return `df` with exactly one row per calendar day in [day_min, day_max].

    Days that have no row in `df` are inserted with every other column set
    to 0. Rows whose `day` falls outside the range are dropped by the reindex.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a datetime `day` column with at most one row per day.
    day_min, day_max : datetime-like, optional
        Inclusive bounds of the range. Default to the smallest / largest
        value of ``df.day``.

    Returns
    -------
    pandas.DataFrame
        `df` itself when its `day` column already equals the full range in
        order; otherwise a new frame with a fresh 0..n-1 index and `day` as
        the first column.
    """
    if day_min is None:
        day_min = df.day.min()
    if day_max is None:
        day_max = df.day.max()

    idx = pd.date_range(day_min, day_max)

    # Fast path: nothing to fill. The check is made against the requested
    # range itself, not against notebook-level globals such as the validation
    # window, so the function works for any range and on a fresh kernel.
    if np.array_equal(df.day.values, idx.values):
        return df

    df = df.reset_index(drop=True)
    df = df.set_index('day').reindex(index=idx, fill_value=0)
    # Name the new index explicitly so reset_index() always yields a `day`
    # column, whatever name the reindexed axis ended up with.
    df = df.rename_axis('day').reset_index()

    return df

In [268]:
# Date boundaries used by the baseline. The validation baseline repeats the
# week baseline_day..max_train_day; the submission baseline repeats the week
# baseline_sub_day..max_val_day.
(
    baseline_day,      # first day of the last training week
    max_train_day,     # last training day
    min_val_day,       # first validation day
    max_val_day,       # last validation day
    baseline_sub_day,  # first day of the week repeated for the submission
) = pd.to_datetime([
    '2016-10-11',
    '2016-10-17',
    '2016-10-18',
    '2016-10-31',
    '2016-10-25',
])

In [269]:
def create_baseline_pred(shop_id, val=True):
    """Predict 14 days for one shop by repeating its most recent week twice.

    Shops 987, 1707 and 1824 always get an all-zero prediction.

    Parameters
    ----------
    shop_id : shop identifier, matched against the `shop_id` column.
    val : bool, default True
        True  -> use `df_pays_train`, week baseline_day..max_train_day.
        False -> use `df_pays`, week baseline_sub_day..max_val_day.

    Returns
    -------
    numpy array: the filled week of `pays_count` values concatenated with itself.
    """
    if shop_id in {987, 1707, 1824}:
        return np.zeros(14, dtype='int')

    # Pick the data source and the week to repeat; the rest is shared.
    if val:
        history, first_day, final_day = df_pays_train, baseline_day, max_train_day
    else:
        history, first_day, final_day = df_pays, baseline_sub_day, max_val_day

    recent = history[history.shop_id == shop_id]
    recent = recent[recent.day >= first_day]
    recent = fill_holes(recent, day_min=first_day, day_max=final_day)

    last_week = recent.pays_count.values
    return np.concatenate([last_week, last_week])

In [256]:
def create_true_values(shop_id):
    """Return the `pays_count` values of one shop over the validation window.

    Rows are taken from `df_pays_val` and passed through `fill_holes` with the
    bounds min_val_day..max_val_day, so days without a record count as 0.
    """
    is_shop = df_pays_val.shop_id == shop_id
    observed = fill_holes(df_pays_val[is_shop], day_min=min_val_day, day_max=max_val_day)
    return observed.pays_count.values

In [113]:
from tqdm import tqdm

In [257]:
# Every distinct shop id, as an ascending list.
shops = sorted(df_pays.shop_id.unique())

In [258]:
# Collect baseline predictions and observed values for every shop over the
# validation window, then flatten both into one long vector each.
preds, trues = [], []

for shop_id in tqdm(shops):
    # Shop 1824 is special-cased: its observed values are taken as all zeros
    # instead of being looked up in the validation data.
    y_true = np.zeros(14, dtype='int') if shop_id == 1824 else create_true_values(shop_id)
    y_pred = create_baseline_pred(shop_id)

    preds.append(y_pred)
    trues.append(y_true)

all_pred = np.concatenate(preds)
all_true = np.concatenate(trues)

100%|██████████| 2000/2000 [00:07<00:00, 253.72it/s]


In [259]:
# Score the repeat-last-week baseline on the held-out window
# (0 would mean every counted entry was predicted exactly).
evaluation_metric(all_true, all_pred)

0.095394389645392236

In [270]:
# Submission predictions: one 14-value array per shop, in the order of `shops`.
# Note that this rebinds `preds`, replacing the validation predictions.
preds = [create_baseline_pred(shop_id, val=False) for shop_id in tqdm(shops)]

100%|██████████| 2000/2000 [00:06<00:00, 297.69it/s]


In [271]:
# Write one CSV row per shop: the shop id followed by its daily predictions.
# `preds` must hold the submission predictions (val=False), built in the same
# order as `shops`.
assert len(preds) == len(shops), 'preds and shops must have the same length'

with open('sub.csv', 'w') as f:
    # Pair each shop id with its prediction by position. Indexing with
    # `preds[i - 1]` is only right when the ids are exactly 1..N with no gaps.
    for pos, shop_id in enumerate(tqdm(shops)):
        f.write(str(shop_id) + ',')
        f.write(','.join(str(d) for d in preds[pos]))
        f.write('\n')

100%|██████████| 2000/2000 [00:00<00:00, 14158.68it/s]


In [272]:
!head sub.csv

1,255,172,226,257,251,281,180,255,172,226,257,251,281,180
2,82,97,83,96,96,115,80,82,97,83,96,96,115,80
3,80,61,68,103,102,104,67,80,61,68,103,102,104,67
4,90,104,55,110,215,196,115,90,104,55,110,215,196,115
5,302,1,0,215,203,291,239,302,1,0,215,203,291,239
6,97,100,117,133,109,118,86,97,100,117,133,109,118,86
7,128,115,111,128,165,162,81,128,115,111,128,165,162,81
8,70,68,72,49,70,71,35,70,68,72,49,70,71,35
9,210,197,245,205,291,283,212,210,197,245,205,291,283,212
10,77,114,85,84,99,93,121,77,114,85,84,99,93,121


In [273]:
!wc -l sub.csv

2000 sub.txt
