In [2]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics.regression import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, Imputer, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from tqdm import tqdm_notebook

In [3]:
# Load the weekly train and test tables. Paths are relative to the notebook's
# working directory.
train_csv = "files-ascott_group/train_set_weeks.csv"
test_csv = "files-ascott_group/test_set_weeks.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### Добавляем номер недели как столбец 

In [4]:
def _week_suffix(wk_code):
    """Return everything after the first four characters of `wk_code` as an int.

    NOTE(review): presumably `wk` is encoded as YYYYWW, so this yields the week
    of the year -- confirm against the data.
    """
    return int(str(wk_code)[4:])


# Same derivation for both frames, so the feature is defined identically.
for frame in (train, test):
    frame['wk_num'] = frame['wk'].apply(_week_suffix)

### Парные категориальные признаки

In [5]:
# Pairwise interaction features: the string forms of two categorical columns
# joined with "_". Train is processed first, then test, and the pairs are added
# in the same order for both so the resulting column layout matches.
category_pairs = [
    ('idFilial', 'KanalDB'),
    ('idFilial', 'idSubGrp'),
    ('KanalDB', 'idSubGrp'),
]

for frame in (train, test):
    for left, right in category_pairs:
        pair_name = '{}_{}'.format(left, right)
        frame[pair_name] = frame[left].astype(str) + "_" + frame[right].astype(str)

### Сглаженные средние

In [6]:
# Smoothed target means, computed window by window.
#
# Weeks are walked backwards from the last test week in steps of 11. Each step
# labels the rows of the current 11-week window [end - 10, end] with
# `wk_window = end` and stores, per categorical column, a smoothed mean of
# `value` computed on the preceding 11-week window [end - 21, end - 11] only.
cat_columns = ['idFilial', 'KanalDB', 'idSubGrp',
               'idFilial_KanalDB', 'idFilial_idSubGrp', 'KanalDB_idSubGrp']
info = {}
ALPHA = 15  # weight of the global mean, expressed in pseudo-observations

last_test_week = test['N wk'].max()
last_train_week = train['N wk'].max()

for window_end in np.arange(last_test_week, 11 + last_test_week % 11, -11):
    if window_end > last_train_week:
        # A window ending after the last train week is labelled on the whole
        # test frame.
        # NOTE(review): assumes test covers a single window -- confirm.
        test['wk_window'] = window_end
    else:
        in_window = train['N wk'].between(window_end - 10, window_end)
        train.loc[in_window, 'wk_window'] = window_end

    # Statistics always come from the window preceding the labelled one.
    history = train[train['N wk'].between(window_end - 21, window_end - 11)]
    prior = history['value'].mean()

    info[window_end] = {}
    for col in cat_columns:
        by_category = history.groupby(col)['value']
        count = by_category.size()
        # Shrink each category mean towards the global mean; small categories
        # are pulled harder.
        smoothed = (ALPHA * prior + count * by_category.mean()) / (ALPHA + count)
        info[window_end][col] = {'mean': smoothed}

In [7]:
# Attach the per-window smoothed means stored in `info` as `value_<col>_<stat>`
# columns. The window ending after the last train week is written to `test`,
# every other window to `train`. Categories missing from a window's statistics
# map to NaN.
newest_test_week = test['N wk'].max()
newest_train_week = train['N wk'].max()

for window_end in np.arange(newest_test_week, 11 + newest_test_week % 11, -11):
    target = test if window_end > newest_train_week else train
    in_window = target['wk_window'] == window_end
    for col in cat_columns:
        for name, stat in info[window_end][col].items():
            encoded = target.loc[in_window, col].map(stat)
            target.loc[in_window, 'value_{}_{}'.format(col, name)] = encoded

### Метапризнаки

In [8]:
# Stacking meta-features.
#
# For every 11-week window (walking backwards from the last test week), each
# base model is fitted on the preceding 11-week window of `train` and its
# predictions for the current window are stored in a `stack_<model>` column.
# Windows inside train also print the RMSE of every model, in `names` order.
mms = MinMaxScaler()
# handle_unknown='ignore': the encoder is fitted on the history window only, so
# a category that first appears in the predicted window must encode as all
# zeros instead of raising inside enc.transform().
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_columns = ['idFilial', 'KanalDB', 'idSubGrp', 'idFilial_KanalDB', 
               'idFilial_idSubGrp', 'KanalDB_idSubGrp']
num_columns = ['N wk']

xgb = XGBRegressor(n_estimators=100, max_depth=3, n_jobs=-1)
# Fixed seed so the stack_rf feature is reproducible across kernel restarts.
rf = RandomForestRegressor(n_jobs=-1, n_estimators=100, max_depth=3, random_state=0)
knn5 = KNeighborsRegressor(n_neighbors=5, n_jobs=-1)
knn11 = KNeighborsRegressor(n_neighbors=11, n_jobs=-1)
knn21 = KNeighborsRegressor(n_neighbors=21, n_jobs=-1)
models = [xgb, rf, knn5, knn11, knn21]
names = ['xgb', 'rf', 'knn5', 'knn11', 'knn21']

last_test_week = test['N wk'].max()
last_train_week = train['N wk'].max()

for i in tqdm_notebook(np.arange(last_test_week, 11 + last_test_week % 11, -11)):
    is_test_window = i > last_train_week
    # Rows of train inside the current window; reused for slicing and for
    # writing predictions back so both always address the same rows.
    in_window = (train['N wk'] <= i) & (train['N wk'] >= i - 10)
    if is_test_window:
        # NOTE(review): the whole test frame is treated as one window -- confirm
        # that test never spans more than 11 weeks.
        tmp_test = test
    else:
        tmp_test = train.loc[in_window]
    tmp_train = train[(train['N wk'] <= i - 11) & (train['N wk'] >= i - 21)]

    # Encoder and scaler are refitted per window, on the history rows only.
    Xtrain = mms.fit_transform(np.concatenate([
        enc.fit_transform(tmp_train[cat_columns]),
        tmp_train[num_columns]
    ], axis=1))

    Xtest = mms.transform(np.concatenate([
        enc.transform(tmp_test[cat_columns]),
        tmp_test[num_columns]
    ], axis=1))
    ytrain = tmp_train['value']

    for name, model in zip(names, models):
        model.fit(Xtrain, ytrain)
        p = model.predict(Xtest)
        if is_test_window:
            test['stack_{}'.format(name)] = p
        else:
            train.loc[in_window, 'stack_{}'.format(name)] = p
            # Out-of-window RMSE of this base model on the current window.
            print(np.sqrt(mean_squared_error(tmp_test['value'], p)))

76265.7266386
94737.3320493
72116.1951095
73976.8353654
87343.9161673
78753.870621
91195.3110567
79119.4649447
75215.3633694
85515.4303242
78240.5441752
91102.1803809
79770.3334801
80830.1417002
83909.7881713
45749.6500659
53626.1132589
45774.284577
46339.7423043
44793.7300878
54544.3430048
65496.6203002
54150.1756765
50952.2454204
58272.9305783
42777.3876423
48603.6488351
44075.2594053
42911.6622649
44508.5815053
65804.0990901
68861.2200076
63336.1095858
59818.097624
62746.6355429
61248.4548315
69735.2563164
61668.0011992
59681.2998797
64894.0955696
44623.6742057
53414.630746
49307.0254312
42509.1396794
46494.7118296
50363.7498168
63194.1605879
51158.8480243
49663.5660694
56062.4741083
39274.7361557
52718.9326095
37747.5891799
37725.6563628
42101.513901
114229.88449
86564.5870803
96674.9021895
77411.5009515
86302.8718367
70982.3056428
82134.9069599
72361.2235319
64191.6149976
69340.3882944
70728.1288852
84774.3282578
68425.2251281
68590.1642978
76057.72263
49354.2269224
63980.0723255


In [9]:
# Keep only the train rows that were assigned to a window; rows with a missing
# `wk_window` are dropped.
train = train.dropna(subset=['wk_window'])

In [12]:
# Feature columns fed to the final model, assembled from named groups.
# The order matters: columns are later addressed by position.
_single_cats = ['idFilial', 'KanalDB', 'idSubGrp']
_paired_cats = ['idFilial_KanalDB', 'idFilial_idSubGrp', 'KanalDB_idSubGrp']
_base_models = ['xgb', 'rf', 'knn5', 'knn11', 'knn21']

fcolumns = (
    ['N wk']
    + _single_cats
    + ['wk_num']
    + _paired_cats
    + ['wk_window']
    + ['value_{}_mean'.format(col) for col in _single_cats + _paired_cats]
    + ['stack_{}'.format(model_name) for model_name in _base_models]
    + ['wk']
)

### Обучаем 10 CatBoost-моделей и усредняем их предсказания

In [25]:
# Train ten CatBoost regressors that differ slightly in seed, iteration count
# and regularisation, and collect their test predictions in `p` for averaging.
p = []
for run in range(10):
    print(run)
    booster = CatBoostRegressor(
        iterations=2000 + run,
        learning_rate=0.01,
        random_seed=run * 77,
        thread_count=8,
        depth=4,
        eval_metric='RMSE',
        l2_leaf_reg=2 + run / 100,
        rsm=0.8 + run / 1000
    )
    # Positions 1..7 of `fcolumns` are passed as categorical features.
    booster.fit(
        train[fcolumns].values,
        train['value'].values,
        cat_features=list(range(1, 8))
    )
    preds = booster.predict(test[fcolumns].values)
    p.append(preds)
    # Mean prediction per run, printed as a quick sanity check.
    print(np.mean(preds))

0
132558.994769
1
133886.285686
2
135144.160055
3
129773.075253
4
138025.153402
5
131281.995573
6
133425.455389
7
136716.457372
8
132765.083218
9
133307.114896


In [27]:
# Sample submission file shipped with the data.
# NOTE(review): `sub` is not referenced by any other visible cell.
sample_submission_csv = "files-ascott_group/sample_submission.csv"
sub = pd.read_csv(sample_submission_csv)

In [28]:
# Average the per-run predictions element-wise and write them out, indexed by
# the test `id` column, as the submission file.
blended = np.asarray(p).mean(axis=0)
s = pd.DataFrame(data=blended, index=test['id'], columns=['value'])
s.to_csv('new2.csv')