# Загрузка и предобработка данных

In [1]:
import pandas as pd
# Raw input tables. Point each path at the dataset you want to use.

# Checks (purchase receipts).
df_checks = pd.read_csv('./lentahack/20210518_checks.csv')

# Target values.
df_uplift = pd.read_csv('./lentahack/20210518_uplift.csv')

# Sample submission that is filled in with predictions.
df_sample_submission = pd.read_csv('./lentahack/20210521_sample_submission.csv')

# Product hierarchy.
df_hierarchy = pd.read_csv('./lentahack/20210518_hierarchy.csv')

# Promo offer descriptions.
df_offers = pd.read_csv('./lentahack/20210521_offers.csv')

## Кодирование иерархии с помощью OrdinalEncoder

In [2]:
from sklearn.preprocessing import OrdinalEncoder
# Replace the four hierarchy-level columns with their ordinal codes.
# The fitted encoder stays available as `enc`.
hierarchy_columns = ['hierarchy_level1', 'hierarchy_level2', 'hierarchy_level3', 'hierarchy_level4']
enc = OrdinalEncoder()
enc.fit(df_hierarchy[hierarchy_columns])
df_hierarchy[hierarchy_columns] = enc.transform(df_hierarchy[hierarchy_columns])

## Объединение чеков с описанием промок и таргетовыми значениями

In [3]:
# Left-join the offers with the checks on `sku`, then left-join the target
# values on `Offer_ID`.
df_data = (
    df_offers
    .merge(df_checks, how='left', on=['sku'])
    .merge(df_uplift, how='left', on=['Offer_ID'])
)

In [4]:
# Columns considered unnecessary for the rest of the notebook.
drop_columns = [
    'client_id',
    'shop_id',
    'check_id',
    'time',
    'check_pos',
    'region_name',
]
df_data_dropped = df_data.drop(drop_columns, axis=1)

In [5]:
# Convert the date columns (stored in YYYYMMDD form) to pandas datetimes.
dates = ['day', 'start_date', 'end_date']
df_data_dropped[dates] = df_data_dropped[dates].apply(pd.to_datetime, format='%Y%m%d')

In [6]:
# Discard check rows that fall inside the promo period: every row whose `day`
# is strictly later than the offer's `start_date` is removed.
# NOTE(review): rows with day == start_date are kept by the strict `>`;
# confirm whether the first promo day should be excluded as well.
after_start = df_data_dropped['day'] > df_data_dropped['start_date']
df_data = df_data_dropped.drop(index=df_data_dropped.index[after_start])

In [7]:
# функции для обработки групп промок

import math
from collections import Counter
def calc_entropy(count):
    """Return the base-2 Shannon entropy of a mapping of item -> occurrence count.

    Used in this notebook as a measure of how similar the products of an offer are.

    Args:
        count: Mapping (e.g. ``collections.Counter``) from item to its count.

    Returns:
        The entropy of the count distribution; ``0`` for an empty mapping.
    """
    total = sum(count.values())
    entropy = 0
    for occurrences in count.values():
        share = occurrences / total
        entropy -= share * math.log2(share)
    return entropy

def get_product_list(offer_id):
    """Return the distinct `sku` values found in the rows of one offer.

    The rows are looked up in the module-level `groups` GroupBy object.
    """
    return groups.get_group(offer_id)['sku'].unique()

def offer_diversity(offer_id):
    """Compute, per hierarchy column, the entropy of the offer's products.

    For every distinct product of the offer, the first matching row of
    `df_hierarchy` (without its `sku` column) is collected; the entropy of the
    values is then computed separately for each remaining column.

    Args:
        offer_id: Key of the offer inside the module-level `groups`.

    Returns:
        dict mapping "level0", "level1", ... (one key per non-`sku` column of
        `df_hierarchy`, in column order) to the entropy of that column.

    Raises:
        IndexError: if a product has no row in `df_hierarchy`.
    """
    level_rows = [
        df_hierarchy[df_hierarchy['sku'] == sku].drop(['sku'], axis=1).values[0]
        for sku in get_product_list(offer_id)
    ]
    # Transpose: one tuple per hierarchy column, holding that column's value
    # for every product of the offer.
    per_level_values = zip(*level_rows)
    return {
        f"level{position}": calc_entropy(Counter(values))
        for position, values in enumerate(per_level_values)
    }

def most_freq_hierarchy_levels(offer_id):
    """Return the hierarchy values of the offer's most frequent product.

    The product is the first value returned by ``mode()`` over the offer's
    `sku` column; its first matching row in `df_hierarchy` is returned without
    the `sku` column, as an array.
    """
    sku_modes = groups.get_group(offer_id)['sku'].mode()
    top_sku = sku_modes.iloc[0]
    matching_rows = df_hierarchy[df_hierarchy['sku'] == top_sku]
    return matching_rows.drop('sku', axis=1).values[0]

def process_offer(offer_id):
    """Build the feature dictionary for a single offer.

    The offer's rows are read from the module-level `groups` GroupBy object
    and combined with the diversity and hierarchy helpers defined in this cell.

    Args:
        offer_id: Key of the offer inside `groups`.

    Returns:
        dict with the keys ``number_of_products``, ``diversity_1`` ..
        ``diversity_4``, ``total_cost``, ``total_sells``, ``duration``,
        ``promotype``, ``uplift``, ``most_freq_1`` .. ``most_freq_4`` and
        ``train_test_group``.
    """
    df = groups.get_group(offer_id)
    product_list = get_product_list(offer_id)
    diversity = offer_diversity(offer_id)
    number_of_products = len(product_list)
    # Row-wise price * num_sales, summed over the offer's rows. The built-in
    # sum() is kept on purpose: a missing value in any row makes the total NaN.
    total_supply_cost = sum(df['supplier_price'] * df['num_sales'])
    total_selling_price = sum(df['selling_price'] * df['num_sales'])
    # Offer-level fields are read from the first row of the group
    # (assumes they are constant within an offer -- TODO confirm).
    duration = df['end_date'].iloc[0] - df['start_date'].iloc[0]
    promotype = df['Promo_type'].iloc[0]
    uplift = df['UpLift'].iloc[0]
    hierarchy_levels = most_freq_hierarchy_levels(offer_id)
    train_test = df['train_test_group'].iloc[0]
    features = {'number_of_products': number_of_products, 'diversity_1': diversity['level0'],
                'diversity_2': diversity['level1'], 'diversity_3': diversity['level2'],
                'diversity_4': diversity['level3'], 'total_cost': total_supply_cost,
                'total_sells': total_selling_price, 'duration': duration, 'promotype': promotype,
                'uplift': uplift, 'most_freq_1': hierarchy_levels[0], 'most_freq_2': hierarchy_levels[1],
                'most_freq_3': hierarchy_levels[2], 'most_freq_4': hierarchy_levels[3],
                'train_test_group': train_test}
    return features

In [9]:
# Group by the scalar key 'Offer_ID' rather than the one-element list
# ['Offer_ID']: with a length-1 list, pandas >= 2.0 yields 1-tuples such as
# (10,) as group keys, which would end up in the Offer_ID column of final_df.
# A scalar key gives plain Offer_ID values on every pandas version.
groups = df_data.groupby('Offer_ID')
series_list = []
for group, _ in groups:
    # process_offer() fetches the group's rows itself via `groups`.
    features = process_offer(group)
    features['Offer_ID'] = group
    series_list.append(features)
    print(f"Finished group {group}")
# One row of features per offer.
final_df = pd.DataFrame(series_list)

Finished group 10
Finished group 100
Finished group 101
Finished group 102
Finished group 103
Finished group 104
Finished group 105
Finished group 106
Finished group 107
Finished group 108
Finished group 109
Finished group 11
Finished group 112
Finished group 114
Finished group 115
Finished group 116
Finished group 117
Finished group 118
Finished group 119
Finished group 12
Finished group 120
Finished group 121
Finished group 122
Finished group 124
Finished group 125
Finished group 126
Finished group 127
Finished group 128
Finished group 129
Finished group 13
Finished group 130
Finished group 131
Finished group 133
Finished group 134
Finished group 136
Finished group 137
Finished group 138
Finished group 139
Finished group 14
Finished group 140
Finished group 141
Finished group 142
Finished group 143
Finished group 144
Finished group 145
Finished group 146
Finished group 147
Finished group 148
Finished group 149
Finished group 15
Finished group 150
Finished group 151
Finished group 152

In [10]:
# Turn total revenue and total cost into a relative margin:
#   pct_change = (total_sells - total_cost) / total_cost
# and express the promo duration as a whole number of days.
from sklearn.preprocessing import OrdinalEncoder
# The former bare `final_df.set_index('Offer_ID')` call was removed here: its
# result was discarded, so it had no effect.
final_df['pct_change'] = (final_df['total_sells'] - final_df['total_cost']) / final_df['total_cost']
# `duration` holds end_date - start_date differences; take the day component
# directly instead of parsing the string form of each value.
final_df['duration'] = final_df['duration'].dt.days
# Encode the promo type labels as ordinal codes.
enc_final = OrdinalEncoder()
enc_final.fit(final_df[['promotype']])
final_df[['promotype']] = enc_final.transform(final_df[['promotype']])

In [11]:
# Drop the two raw total columns and index the feature table by Offer_ID.
final_df = final_df.drop(columns=['total_cost', 'total_sells']).set_index('Offer_ID')
final_df.head()

Unnamed: 0_level_0,number_of_products,diversity_1,diversity_2,diversity_3,diversity_4,duration,promotype,uplift,most_freq_1,most_freq_2,most_freq_3,most_freq_4,train_test_group,pct_change
Offer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10,8,0.0,0.0,0.0,0.0,13,1.0,15.941591,1.0,21.0,191.0,515.0,train,0.531065
100,12,0.0,0.0,0.0,1.418296,13,3.0,,0.0,56.0,45.0,1103.0,test,-0.040186
101,1,0.0,0.0,0.0,0.0,13,3.0,1.761594,1.0,27.0,107.0,951.0,train,0.46002
102,11,0.0,0.0,0.0,0.0,13,3.0,0.63047,1.0,26.0,31.0,1790.0,train,1.115112
103,1,0.0,0.0,0.0,0.0,13,3.0,2.505152,1.0,33.0,136.0,1766.0,train,0.243666


# Тренировка и тестинг LightGBM

In [33]:
!pip install lightgbm



In [42]:
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

In [36]:
from sklearn.model_selection import train_test_split

# Feature columns fed to the model, in the order used for the feature matrix.
train_features = [
    'number_of_products',
    'diversity_1',
    'diversity_2',
    'diversity_3',
    'diversity_4',
    'duration',
    'promotype',
    'most_freq_1',
    'most_freq_2',
    'most_freq_3',
    'most_freq_4',
    'pct_change',
]

# Only the offers marked 'train' are used for fitting and validation.
train_df = final_df.loc[final_df['train_test_group'] == 'train']
X = train_df[train_features].to_numpy()
y = train_df['uplift'].to_numpy()

# Hold out 20% of the training offers; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Wrap the split arrays in LightGBM datasets: one to fit on, one held out.
train_data = lgb.Dataset(data=X_train, label=y_train)
validation_data = lgb.Dataset(data=X_test, label=y_test)

In [54]:
# LightGBM training parameters: L1 regression objective evaluated with MAE.
# max_depth, num_leaves and min_data_in_leaf are not set here.
params = dict(
    objective='regression_l1',
    metric='mae',
    learning_rate=0.01,
)

In [56]:
# Upper bound on boosting rounds; early stopping can end training sooner.
num_round = 10000
lgb_regressor = lgb.train(
    params,
    train_data,
    num_round,
    valid_sets=[validation_data],
    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of lgb.train() was removed in LightGBM 4.0, while the callback
    # form works on both older and newer releases.
    callbacks=[lgb.early_stopping(stopping_rounds=300)],
)

[1]	valid_0's l1: 4.08667
Training until validation scores don't improve for 300 rounds
[2]	valid_0's l1: 4.073
[3]	valid_0's l1: 4.06002
[4]	valid_0's l1: 4.04717
[5]	valid_0's l1: 4.03467
[6]	valid_0's l1: 4.02228
[7]	valid_0's l1: 4.01052
[8]	valid_0's l1: 3.999
[9]	valid_0's l1: 3.98881
[10]	valid_0's l1: 3.97873
[11]	valid_0's l1: 3.96859
[12]	valid_0's l1: 3.95902
[13]	valid_0's l1: 3.94928
[14]	valid_0's l1: 3.94086
[15]	valid_0's l1: 3.9323
[16]	valid_0's l1: 3.92414
[17]	valid_0's l1: 3.91586
[18]	valid_0's l1: 3.9081
[19]	valid_0's l1: 3.89825
[20]	valid_0's l1: 3.88837
[21]	valid_0's l1: 3.8808
[22]	valid_0's l1: 3.8733
[23]	valid_0's l1: 3.86653
[24]	valid_0's l1: 3.85978
[25]	valid_0's l1: 3.85264
[26]	valid_0's l1: 3.84353
[27]	valid_0's l1: 3.83675
[28]	valid_0's l1: 3.82938
[29]	valid_0's l1: 3.82233
[30]	valid_0's l1: 3.81513
[31]	valid_0's l1: 3.80801
[32]	valid_0's l1: 3.80096
[33]	valid_0's l1: 3.79469
[34]	valid_0's l1: 3.78597
[35]	valid_0's l1: 3.77944
[36]	valid

In [57]:
# Score the trained model on the held-out split.
# NOTE(review): this same split is passed as the validation set during
# training, so the reported MAE may be optimistic -- confirm that is acceptable.
y_pred = lgb_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Точность модели по метрике MAE: {mae}")

Точность модели по метрике MAE: 3.6297313428546345


In [58]:
# Fill the sample submission (the test offers) with model predictions.
# All offers are predicted in one call and written with one column assignment.
# The previous per-row `df_sample_submission['UpLift'].iloc[idx] = ...` was a
# chained assignment, which raises SettingWithCopyWarning and is not guaranteed
# to modify df_sample_submission. Selecting the feature columns by name also
# removes the hard-coded reshape to 12 features.
submission_offer_ids = df_sample_submission['Offer_ID'].values
# Rows come back in submission order; a missing Offer_ID raises KeyError.
x_submission = final_df.loc[submission_offer_ids, train_features].values
df_sample_submission['UpLift'] = lgb_regressor.predict(x_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [59]:
# Preview the first rows of the filled submission table.
df_sample_submission.head(n=5)

Unnamed: 0,Offer_ID,UpLift
0,386,12.838752
1,182,2.96923
2,173,7.983641
3,375,2.694541
4,379,4.931205


In [60]:
# Write the submission to disk without the row index.
df_sample_submission.to_csv(path_or_buf='final_submission.csv', index=False)