In [1]:

import itertools

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm as tn

import optuna
from joblib import Parallel, delayed
from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import StandardScaler
seed = 57

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
INPUT = '../input/final-hack4retail'

cheques = pd.read_csv(f'{INPUT}/cheques_public.csv', sep=';')
distances = np.array(pd.read_csv(f'{INPUT}/darkstore.csv')) * 2
darkstore_map = pd.read_csv(f'{INPUT}/darkstore_map.csv', sep=';')

## Training

In [3]:
def get_cheque_stats(cheques):
    cheques['CHEQUE_LEN'] = cheques.groupby('CHEQUEID')['KOLVO'].transform(len)
    stats = cheques.groupby('LAGERID').agg({'KOLVO': ['sum', 'count'], 'CHEQUE_LEN': 'mean'})
    stats[('KOLVO', 'mean')] = stats[('KOLVO','sum')] / stats[('KOLVO', 'count')]
    stats.columns = ['_'.join(c) for c in stats.columns.to_flat_index()]
    
    cols = ['KOLVO_count', 'CHEQUE_LEN_mean', 'KOLVO_mean']
    for col1, col2 in itertools.combinations(cols, 2):
        new_col = f"mult_{col1}_{col2}"
        cols.append(new_col)
        stats[new_col] = stats[col1] * stats[col2]

    stats = stats[cols].copy()
    return stats

In [4]:
get_cheque_stats(cheques).head(10)

Unnamed: 0_level_0,KOLVO_count,CHEQUE_LEN_mean,KOLVO_mean,mult_KOLVO_count_CHEQUE_LEN_mean,mult_KOLVO_count_KOLVO_mean,mult_CHEQUE_LEN_mean_KOLVO_mean
LAGERID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,144,3.888889,1.479167,560.0,213.0,5.752315
2,424,4.346698,4.382075,1843.0,1858.0,19.047559
3,636,7.125786,1.163522,4532.0,740.0,8.291009
4,162,8.290123,1.024691,1343.0,166.0,8.494818
5,167,5.179641,3.077844,865.0,514.0,15.942128
6,119,8.571429,1.907563,1020.0,227.0,16.35054
7,608,6.651316,1.404605,4044.0,854.0,9.342473
8,83,6.301205,1.120482,523.0,93.0,7.060386
9,138,5.268116,1.282609,727.0,177.0,6.756931
10,125,6.584,1.216,823.0,152.0,8.006144


In [5]:
picking_time = {1: 1, 2: 2, 3: 3}

def validator(cheques, distances, darkstore_map):
    
    darkstore_map = darkstore_map.sort_values(by=['SECTION', 'LEVEL'])
    darkstore_map_dict = darkstore_map.set_index(['SECTION', 'LEVEL']).to_dict('index')
    darkstore_map_dict_inverse = dict()
    for k, v in darkstore_map_dict.items():
        darkstore_map_dict[k] = v['LAGERID']
        darkstore_map_dict_inverse[v['LAGERID']] = k
    
    cheques = cheques.copy()
    cheques['LOCATION'] = cheques['LAGERID'].apply(lambda x: darkstore_map_dict_inverse[x])
    cheques[['SECTION', 'LEVEL']] = pd.DataFrame(cheques['LOCATION'].tolist(), index=cheques.index)
    
    all_times = []
    
    for i, temp_cheque in cheques.groupby('CHEQUEID'):

        sum_time = 0
        current_location = 0
        sum_time += (temp_cheque['KOLVO'] * temp_cheque['LEVEL'].map(picking_time)).sum()
        
        est_locatsii = True
        set_locations = set(temp_cheque['SECTION'])
        while est_locatsii:
            dists = sorted([(x, distances[current_location, x]) for x in set_locations], key=lambda x: x[1], reverse=False)
            current_location = dists[0][0]
            travel_time = dists[0][1]
            sum_time += travel_time

            set_locations.remove(current_location)
            if not len(set_locations):
                est_locatsii = False

        dist_to_final = distances[0, current_location]
        sum_time += dist_to_final * 1
        
        all_times.append(sum_time)
        
    return np.mean(all_times), all_times

In [6]:
def inner_func(i, tdx, vdx, coefs_level, coefs_section):
    train_cheques = cheques.iloc[tdx]
    valid_cheques = cheques.iloc[vdx]

    train_stats = get_cheque_stats(cheques.iloc[tdx])
    scaled_cols = [c for c in train_stats.columns if 'cluster' not in c]
    train_stats[scaled_cols] = StandardScaler().fit_transform(train_stats[scaled_cols])
    train_stats = train_stats.values
    
    level_ranging = (train_stats * coefs_level).sum(axis=1)
    level_ranging = level_ranging.argsort()[::-1]
    level2items = {
        1: level_ranging[:44],
        2: level_ranging[44:88],
        3: level_ranging[88:],
    }
    section_ranging = (train_stats * coefs_section).sum(axis=1)

    darkstore_map_fold = darkstore_map.copy()
    darkstore_map_fold['apiori'] = pd.Categorical(darkstore_map_fold['SECTION'], distances[0].argsort())
    darkstore_map_fold = darkstore_map_fold.sort_values('apiori')

    for level in range(1, 4):
        item_ids = level2items[level]
        ranging = section_ranging[item_ids]
        item_ids = item_ids[ranging.argsort()]
        darkstore_map_fold.loc[darkstore_map_fold['LEVEL'] == level, 'LAGERID'] = item_ids + 1

    train_mean, _  = validator(train_cheques, distances, darkstore_map_fold)
    valid_mean, _ = validator(valid_cheques, distances, darkstore_map_fold)

    return valid_mean

def objective(trial):
    data = get_cheque_stats(cheques)
    
    coefs_level = [trial.suggest_float(f"a{i}", -20, 20) for i in range(data.shape[1])]
    coefs_section = [trial.suggest_float(f"b{i}", -20, 20) for i in range(data.shape[1])]
    group_kfold = GroupKFold(n_splits=4)
    folds = darkstore_map.copy().drop(columns=['LAGERID'])
    valid_scores = Parallel(-1)(delayed(inner_func)(i, tdx, vdx, coefs_level, coefs_section) for i, (tdx, vdx) in enumerate(group_kfold.split(cheques, cheques['KOLVO'], cheques['CHEQUEID'])))
        
    return np.mean(valid_scores) 

In [7]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=1, show_progress_bar=True)

[32m[I 2021-11-14 21:53:41,618][0m A new study created in memory with name: no-name-e0841f94-61b0-45fb-8fb7-6c62fb72349e[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2021-11-14 21:53:54,235][0m Trial 0 finished with value: 76.87084005985423 and parameters: {'a0': -6.461952487233997, 'a1': 12.52459802809716, 'a2': -9.43432416082489, 'a3': -16.95405855026777, 'a4': -18.74422317255148, 'a5': -2.020697415886268, 'b0': -18.87173362097753, 'b1': -18.76217377513862, 'b2': 12.554283376287962, 'b3': 3.383296744081065, 'b4': 14.656009082565774, 'b5': -5.546634447889804}. Best is trial 0 with value: 76.87084005985423.[0m
[32m[I 2021-11-14 21:54:04,299][0m Trial 1 finished with value: 69.80797257939655 and parameters: {'a0': 9.889619645285737, 'a1': -0.11851074896124203, 'a2': -5.029774657281916, 'a3': 1.6093437458848037, 'a4': -9.279004312778628, 'a5': 2.114256199409283, 'b0': -10.03314190341535, 'b1': 7.667838909553993, 'b2': -13.266747887245828, 'b3': -18.081443434271172, 'b4': 15.937042951331804, 'b5': 14.873519505160544}. Best is trial 1 with value: 69.80797257939655.[0m
[32m[I 2021-11-14 21:54:14,711][0m Trial 2 finished with value: 79.379

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

[32m[I 2021-11-14 21:57:47,845][0m Trial 23 finished with value: 68.72285341376836 and parameters: {'a0': 2.1705035596280933, 'a1': 8.188384528759958, 'a2': 10.959475062368885, 'a3': 3.588615402861107, 'a4': 2.752223053320641, 'a5': -15.380176822825444, 'b0': 1.8996433251747673, 'b1': 19.221525035086096, 'b2': 3.5551706600760866, 'b3': 1.4608329067038779, 'b4': -14.616440863540072, 'b5': 19.952220265819637}. Best is trial 21 with value: 66.01102476775802.[0m
[32m[I 2021-11-14 21:57:57,918][0m Trial 24 finished with value: 69.84734396831692 and parameters: {'a0': -6.034548404284948, 'a1': 14.363591177390035, 'a2': 12.085509143833821, 'a3': 8.865644139463676, 'a4': -5.28575567869599, 'a5': -11.036189188341668, 'b0': -6.422980927377745, 'b1': 19.93505369910506, 'b2': -2.9138462881387115, 'b3': -0.39784237462258126, 'b4': -15.450407050366156, 'b5': 9.935988787344446}. Best is trial 21 with value: 66.01102476775802.[0m
[32m[I 2021-11-14 21:58:08,469][0m Trial 25 finished with value:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

[32m[I 2021-11-14 22:01:55,178][0m Trial 47 finished with value: 65.43091827631133 and parameters: {'a0': -2.727301959304469, 'a1': 18.01473882366019, 'a2': 15.278576489428717, 'a3': 17.419207449497975, 'a4': 19.868551014821445, 'a5': 17.238814600053782, 'b0': -17.642092579039435, 'b1': 17.222075887004188, 'b2': 1.1237542287060953, 'b3': -14.399338724522948, 'b4': -15.048848900434574, 'b5': 18.36304030977637}. Best is trial 44 with value: 65.21979379083116.[0m
[32m[I 2021-11-14 22:02:05,199][0m Trial 48 finished with value: 66.19959310449002 and parameters: {'a0': -4.142344690404708, 'a1': 18.463883027311656, 'a2': 8.168800900208092, 'a3': 17.607917441989937, 'a4': 18.606220715543234, 'a5': 14.835457098022383, 'b0': -19.86980992914097, 'b1': 17.895730016580693, 'b2': 0.04404446434903675, 'b3': -10.896010590360243, 'b4': 1.9597807516628158, 'b5': 18.359468335116393}. Best is trial 44 with value: 65.21979379083116.[0m
[32m[I 2021-11-14 22:02:15,264][0m Trial 49 finished with valu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

[32m[I 2021-11-14 22:05:59,291][0m Trial 71 finished with value: 65.3874978758355 and parameters: {'a0': 2.6048785122841682, 'a1': 17.299628888032185, 'a2': 12.019192609605032, 'a3': 9.029201729127275, 'a4': 10.901510780008618, 'a5': 10.432681243467064, 'b0': -13.60843909300609, 'b1': 6.919428729139642, 'b2': -3.0870114318720274, 'b3': -19.911962767111653, 'b4': -18.95503691259612, 'b5': -0.10481271285380793}. Best is trial 67 with value: 65.10306670348552.[0m
[32m[I 2021-11-14 22:06:09,346][0m Trial 72 finished with value: 65.23244625863828 and parameters: {'a0': 5.933690701926532, 'a1': 18.700220228995185, 'a2': 13.764134243757411, 'a3': 7.561243961252529, 'a4': 15.427883018460564, 'a5': 9.767467608232367, 'b0': -13.567059454867875, 'b1': 6.436783663538809, 'b2': -4.015381754288714, 'b3': -18.20096474868311, 'b4': -16.808904742602696, 'b5': 0.08570251950561847}. Best is trial 67 with value: 65.10306670348552.[0m
[32m[I 2021-11-14 22:06:19,836][0m Trial 73 finished with value:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

[32m[I 2021-11-14 22:10:03,971][0m Trial 95 finished with value: 65.21767057040897 and parameters: {'a0': -5.474299919525359, 'a1': 11.498366779636008, 'a2': 18.797636045463154, 'a3': 11.843563580671718, 'a4': 14.039139782342481, 'a5': 12.955226991994934, 'b0': -18.511557606318494, 'b1': 9.244403405492108, 'b2': -4.725843645924599, 'b3': -11.770076492292326, 'b4': -16.539027089028885, 'b5': 8.37643400799725}. Best is trial 88 with value: 65.0408828499679.[0m
[32m[I 2021-11-14 22:10:14,425][0m Trial 96 finished with value: 65.2135662125675 and parameters: {'a0': -5.673849630108572, 'a1': 11.458891635584362, 'a2': 19.079955312228346, 'a3': 12.09097985121999, 'a4': 13.822608442101536, 'a5': 12.815366454420317, 'b0': -18.19531306768323, 'b1': 9.5517458775754, 'b2': -5.0707606249870505, 'b3': -11.544758173006013, 'b4': -15.700918271230105, 'b5': 7.623808325830435}. Best is trial 88 with value: 65.0408828499679.[0m
[32m[I 2021-11-14 22:10:24,529][0m Trial 97 finished with value: 65.1

## Submission

In [8]:
trial = study.best_trial.params

coefs_level = [trial[f"a{i}"] for i in range(6)]
coefs_section = [trial[f"b{i}"] for i in range(6)]


In [9]:
train_stats = get_cheque_stats(cheques)
train_stats = StandardScaler().fit_transform(train_stats)

level_ranging = (train_stats * coefs_level).sum(axis=1)
level_ranging = level_ranging.argsort()[::-1]
level2items = {
    1: level_ranging[:44],
    2: level_ranging[44:88],
    3: level_ranging[88:],
}
section_ranging = (train_stats * coefs_section).sum(axis=1)

darkstore_map_fold = darkstore_map.copy()
darkstore_map_fold['apiori'] = pd.Categorical(darkstore_map_fold['SECTION'], distances[0].argsort())
darkstore_map_fold = darkstore_map_fold.sort_values('apiori')
for level in range(1, 4):
    item_ids = level2items[level]
    ranging = section_ranging[item_ids]
    item_ids = item_ids[ranging.argsort()]
    darkstore_map_fold.loc[darkstore_map_fold['LEVEL'] == level, 'LAGERID'] = item_ids + 1
train_mean, _ = validator(cheques, distances, darkstore_map_fold)
# valid_mean, valid_sum,  _ = validator(valid_cheques, distances, darkstore_map_fold)

print(f'train_mean: {train_mean}')


train_mean: 64.88594984541395


In [10]:
darkstore_map_fold.drop(columns=['apiori']).to_csv('optuna_darkstore.csv', index=False)