<a href="https://colab.research.google.com/github/aromanenko/ATSF/blob/wip/hw3_solution_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
%matplotlib inline

In [27]:
#Загружаем данные, содержащие информацию по всем товарам во всех магазинах за все периоды времени (тренировочные + для которых необходимо предсказать спрос)
all_data = pd.read_csv('train.csv', delimiter=',')
all_data['period_start_dt'] = pd.to_datetime(all_data['period_start_dt'], format= "%Y-%m-%d")
all_data.head()

Unnamed: 0.1,Unnamed: 0,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,,,
1,1,40370,309,2016-12-19,64.0,,,,,,
2,2,40372,309,2016-12-19,32.0,,,,,,
3,3,40373,309,2016-12-19,10.0,,,,,,
4,4,46272,309,2016-12-19,15.0,,,,,,


In [28]:
#Сразу переименуем столбец "Unnamed: 0" в id
all_data.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,,,
1,1,40370,309,2016-12-19,64.0,,,,,,
2,2,40372,309,2016-12-19,32.0,,,,,,
3,3,40373,309,2016-12-19,10.0,,,,,,
4,4,46272,309,2016-12-19,15.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,0.0,1000.00,1000.0,0.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,0.0,2000.00,2000.0,0.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,0.0,3000.00,3000.0,0.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,0.0,284.29,199.0,0.0,1.0


In [29]:
#Посмотрим на уникальные значения категориальных объясняющих переменных и удалим те,
#у которых уникальное значение (не пустое) только одно, т.к. они не дают никакой пользы при обучении модели
print(all_data['PROMO1_FLAG'].unique())
print(all_data['PROMO2_FLAG'].unique()) #не используем данную переменную, т.к. значения только 0 и NaN
print(all_data['NUM_CONSULTANT'].unique()) #не используем данную переменную, т.к. значения только 0 и NaN
print(all_data['AUTORIZATION_FLAG'].unique())

[nan  1.  0.  2.]
[nan  0.]
[nan  0.]
[nan  1.  0.]


In [30]:
del all_data['PROMO2_FLAG']
del all_data['NUM_CONSULTANT']
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,
1,1,40370,309,2016-12-19,64.0,,,,
2,2,40372,309,2016-12-19,32.0,,,,
3,3,40373,309,2016-12-19,10.0,,,,
4,4,46272,309,2016-12-19,15.0,,,,
...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0


In [31]:
#Поработаем с пропущенными значениями
#Пропуски в категориальных переменных заполним соответствующими модами:
all_data['PROMO1_FLAG'] = all_data['PROMO1_FLAG'].fillna(all_data['PROMO1_FLAG'].mode()[0])
all_data['AUTORIZATION_FLAG'] = all_data['AUTORIZATION_FLAG'].fillna(all_data['AUTORIZATION_FLAG'].mode()[0])
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,0.0,,,1.0
1,1,40370,309,2016-12-19,64.0,0.0,,,1.0
2,2,40372,309,2016-12-19,32.0,0.0,,,1.0
3,3,40373,309,2016-12-19,10.0,0.0,,,1.0
4,4,46272,309,2016-12-19,15.0,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0


In [32]:
#Пропуски в числовых переменных будем заполнять по каждой паре "товар + магазин". Для каждой пары в каждый временной период с NaN в переменных
#PRICE_REGULAR и PRICE_AFTER_DISC будем всавлять медианную цену среди цен на этот товар на данную дату в других магазинах, в которых цена известна.
#При этом удалим предварительно из выборки 309 магазин, т.к. ни по одному товару ни в одну из дат в этом магазине
#неизвестны цены (переменные PRICE_REGULAR и PRICE_AFTER_DISC)
all_data[all_data['store_location_rk'] == 309].sort_values(by='product_rk')

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,0.0,,,1.0
5,5,40369,309,2016-12-26,127.0,0.0,,,1.0
10,10,40369,309,2017-01-02,50.0,0.0,,,1.0
1,1,40370,309,2016-12-19,64.0,0.0,,,1.0
6,6,40370,309,2016-12-26,181.0,0.0,,,1.0
11,11,40370,309,2017-01-02,70.0,0.0,,,1.0
2,2,40372,309,2016-12-19,32.0,0.0,,,1.0
7,7,40372,309,2016-12-26,88.0,0.0,,,1.0
12,12,40372,309,2017-01-02,30.0,0.0,,,1.0
3,3,40373,309,2016-12-19,10.0,0.0,,,1.0


In [33]:
all_data = all_data[all_data['store_location_rk'] != 309]
all_data #удалилось 15 наблюдений

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
15,15,40369,317,2016-12-19,50.0,0.0,,,1.0
16,16,40370,317,2016-12-19,44.0,0.0,,,1.0
17,17,40372,317,2016-12-19,13.0,0.0,,,1.0
18,18,40373,317,2016-12-19,6.0,0.0,,,1.0
19,19,46272,317,2016-12-19,34.0,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0


In [34]:
#Удалим также такие пары "товар+день", у которых вообще не известна цена ни в одном магазине. Выведем для начала такие пары (их пять)
n = 0
listt = []
for i in all_data['product_rk'].unique():
  for d in all_data[all_data['product_rk'] == i]['period_start_dt'].unique():
    if len(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]) == all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'].isnull().sum():
      n += 1
      listt.append([i, d])

print(n)
print(listt)

5
[[40369, Timestamp('2016-12-19 00:00:00')], [40370, Timestamp('2016-12-19 00:00:00')], [40372, Timestamp('2016-12-19 00:00:00')], [40373, Timestamp('2016-12-19 00:00:00')], [46272, Timestamp('2016-12-19 00:00:00')]]


In [35]:
all_data = all_data[(all_data['product_rk'] != 40369) | (all_data['period_start_dt'] != '2016-12-19')]
all_data = all_data[(all_data['product_rk'] != 40370) | (all_data['period_start_dt'] != '2016-12-19')]
all_data = all_data[(all_data['product_rk'] != 40372) | (all_data['period_start_dt'] != '2016-12-19')]
all_data = all_data[(all_data['product_rk'] != 40373) | (all_data['period_start_dt'] != '2016-12-19')]
all_data = all_data[(all_data['product_rk'] != 46272) | (all_data['period_start_dt'] != '2016-12-19')]

In [36]:
#Проверка
n = 0
listt = []
for i in all_data['product_rk'].unique():
  for d in all_data[all_data['product_rk'] == i]['period_start_dt'].unique():
    if len(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]) == all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'].isnull().sum():
      n += 1
      listt.append([i, d])

print(n)
print(listt)
#Таким образом, на 2016-12-19 остался только один товар №96212

0
[]


In [37]:
#Проверка:
a = 0
for i in all_data['product_rk'].unique():
  for d in all_data[all_data['product_rk'] == i]['period_start_dt'].unique():
    a += 1

a #919 пар вместо 924

919

In [38]:
#Выведем для проверки, что были NaN в переменных PRICE_REGULAR и PRICE_AFTER_DISC, а после следующих двух ячеек проверим, что они исчезли
all_data[(all_data['product_rk'] == 96212) & (all_data['period_start_dt'] == '2019-11-04')]

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
889,889,96212,317,2019-11-04,3.966667,1.0,70.0,49.0,1.0
1812,1818,96212,355,2019-11-04,7.0,1.0,70.0,49.0,1.0
2735,2747,96212,380,2019-11-04,3.0,1.0,70.0,49.0,1.0
3659,3677,96212,425,2019-11-04,5.366667,1.0,70.0,49.0,1.0
4582,4606,96212,453,2019-11-04,3.733333,1.0,70.0,49.0,1.0
5501,5531,96212,504,2019-11-04,9.0,1.0,70.0,49.0,1.0
6425,6460,96212,517,2019-11-04,0.0,1.0,70.0,49.0,1.0
7348,7388,96212,525,2019-11-04,4.2,1.0,70.0,49.0,1.0
8272,8318,96212,533,2019-11-04,0.0,0.0,,,0.0
9194,9240,96212,535,2019-11-04,4.0,1.0,70.0,49.0,1.0


In [39]:
#Заполняем пропуски в цене товара i (PRICE_REGULAR) на дату d медианной ценой этого товара по всем магазинам, в которых он продается, на данную дату d
for i in all_data['product_rk'].unique():
  # print(i)
  for d in all_data[all_data['product_rk'] == i]['period_start_dt'].unique():
    # print(d)
    # print(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'].median())
    all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d), ['PRICE_REGULAR']] = all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'].fillna(all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'].median())
    # print(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_REGULAR'])

In [40]:
#Заполняем аналогично пропуски в переменной PRICE_AFTER_DISC
for i in all_data['product_rk'].unique():
  # print(i)
  for d in all_data[all_data['product_rk'] == i]['period_start_dt'].unique():
    # print(d)
    # print(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_AFTER_DISC'].median())
    all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d), ['PRICE_AFTER_DISC']] = all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_AFTER_DISC'].fillna(all_data.loc[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_AFTER_DISC'].median())
    # print(all_data[(all_data['product_rk'] == i) & (all_data['period_start_dt'] == d)]['PRICE_AFTER_DISC'])

In [41]:
#Выведем для проверки, что все NaN в переменных PRICE_REGULAR и PRICE_AFTER_DISC заменились на соответствующие медианы
all_data[(all_data['product_rk'] == 96212) & (all_data['period_start_dt'] == '2019-11-04')]

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
889,889,96212,317,2019-11-04,3.966667,1.0,70.0,49.0,1.0
1812,1818,96212,355,2019-11-04,7.0,1.0,70.0,49.0,1.0
2735,2747,96212,380,2019-11-04,3.0,1.0,70.0,49.0,1.0
3659,3677,96212,425,2019-11-04,5.366667,1.0,70.0,49.0,1.0
4582,4606,96212,453,2019-11-04,3.733333,1.0,70.0,49.0,1.0
5501,5531,96212,504,2019-11-04,9.0,1.0,70.0,49.0,1.0
6425,6460,96212,517,2019-11-04,0.0,1.0,70.0,49.0,1.0
7348,7388,96212,525,2019-11-04,4.2,1.0,70.0,49.0,1.0
8272,8318,96212,533,2019-11-04,0.0,0.0,70.0,49.0,0.0
9194,9240,96212,535,2019-11-04,4.0,1.0,70.0,49.0,1.0


In [42]:
#Проверка:
all_data['PRICE_REGULAR'].isnull().any()

False

In [50]:
all_data.iloc[0,:].period_start_dt.day

26

In [51]:
#перекодируем даты. Создадим три переменных ind_of_year, ind_of_month и ind_of_day, которые будут отражать год, месяц и день наблюдения соответственно

# all_data['period_start_dt'] = pd.to_datetime(all_data['period_start_dt'], dayfirst =False)
all_data["ind_of_year"] = [dt.year for dt in all_data.period_start_dt]
all_data["ind_of_month"] = [dt.month for dt in all_data.period_start_dt]
all_data["ind_of_day"] = [dt.day for dt in all_data.period_start_dt]
all_data.head()

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
20,20,40369,317,2016-12-26,65.0,1.0,500.0,500.0,1.0,2016,12,26
21,21,40370,317,2016-12-26,83.0,1.0,1000.0,1000.0,1.0,2016,12,26
22,22,40372,317,2016-12-26,30.0,1.0,2000.0,2000.0,1.0,2016,12,26
23,23,40373,317,2016-12-26,7.0,1.0,3000.0,3000.0,1.0,2016,12,26
24,24,46272,317,2016-12-26,35.0,1.0,157.0,157.0,1.0,2016,12,26


In [52]:
#пример
all_data.iloc[300:600]

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
320,320,40369,317,2018-01-15,12.000000,0.0,500.0,500.0,1.0,2018,1,15
321,321,40370,317,2018-01-15,8.000000,0.0,1000.0,1000.0,1.0,2018,1,15
322,322,40372,317,2018-01-15,4.000000,0.0,2000.0,2000.0,1.0,2018,1,15
323,323,40373,317,2018-01-15,2.000000,0.0,3000.0,3000.0,1.0,2018,1,15
324,324,46272,317,2018-01-15,3.000000,0.0,239.0,239.0,1.0,2018,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...
615,615,40370,317,2018-12-24,69.000000,0.0,1000.0,1000.0,1.0,2018,12,24
616,616,40372,317,2018-12-24,12.000000,0.0,2000.0,2000.0,1.0,2018,12,24
617,617,40373,317,2018-12-24,16.000000,0.0,3000.0,3000.0,1.0,2018,12,24
618,618,46272,317,2018-12-24,18.266667,1.0,329.0,98.7,1.0,2018,12,24


In [53]:
#и теперь удалим переменную period_start_dt за ненадобностью
del all_data['period_start_dt']
all_data

Unnamed: 0,id,product_rk,store_location_rk,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
20,20,40369,317,65.0,1.0,500.00,500.0,1.0,2016,12,26
21,21,40370,317,83.0,1.0,1000.00,1000.0,1.0,2016,12,26
22,22,40372,317,30.0,1.0,2000.00,2000.0,1.0,2016,12,26
23,23,40373,317,7.0,1.0,3000.00,3000.0,1.0,2016,12,26
24,24,46272,317,35.0,1.0,157.00,157.0,1.0,2016,12,26
...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,,0.0,1000.00,1000.0,1.0,2019,12,30
35340,35538,40372,1380,,0.0,2000.00,2000.0,1.0,2019,12,30
35341,35539,40373,1380,,0.0,3000.00,3000.0,1.0,2019,12,30
35342,35540,46272,1380,,1.0,284.29,199.0,1.0,2019,12,30


In [54]:
#Подготовим данные для обучения модели
#Поделим данные на трейн и тест
data_train = all_data[all_data['demand'].isnull() == False]
data_train #получаем 33959 наблюдений

Unnamed: 0,id,product_rk,store_location_rk,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
20,20,40369,317,65.0,1.0,500.000000,500.000000,1.0,2016,12,26
21,21,40370,317,83.0,1.0,1000.000000,1000.000000,1.0,2016,12,26
22,22,40372,317,30.0,1.0,2000.000000,2000.000000,1.0,2016,12,26
23,23,40373,317,7.0,1.0,3000.000000,3000.000000,1.0,2016,12,26
24,24,46272,317,35.0,1.0,157.000000,157.000000,1.0,2016,12,26
...,...,...,...,...,...,...,...,...,...,...,...
35309,35507,40370,1380,24.0,0.0,1000.000000,1000.000000,1.0,2019,11,25
35310,35508,40372,1380,11.0,0.0,2000.000000,2000.000000,1.0,2019,11,25
35311,35509,40373,1380,3.0,0.0,3000.000000,3000.000000,1.0,2019,11,25
35312,35510,46272,1380,0.0,1.0,284.290000,199.000000,1.0,2019,11,25


In [55]:
data_test = all_data[all_data['demand'].isnull()]
#Сразу переименуем столбец "demand" в тестовой выборке на "predicted"
data_test.rename(columns={'demand': 'predicted'}, inplace=True)
data_test #получаем 1200 наблюдений

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test.rename(columns={'demand': 'predicted'}, inplace=True)


Unnamed: 0,id,product_rk,store_location_rk,predicted,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
908,908,40369,317,,0.0,500.00,500.0,1.0,2019,12,2
909,909,40370,317,,0.0,1000.00,1000.0,1.0,2019,12,2
910,910,40372,317,,0.0,2000.00,2000.0,1.0,2019,12,2
911,911,40373,317,,0.0,3000.00,3000.0,1.0,2019,12,2
912,912,46272,317,,1.0,284.29,199.0,1.0,2019,12,2
...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,,0.0,1000.00,1000.0,1.0,2019,12,30
35340,35538,40372,1380,,0.0,2000.00,2000.0,1.0,2019,12,30
35341,35539,40373,1380,,0.0,3000.00,3000.0,1.0,2019,12,30
35342,35540,46272,1380,,1.0,284.29,199.0,1.0,2019,12,30


In [56]:
X = data_train.drop(['id', 'demand'], axis=1)
y = data_train['demand']

In [57]:
X

Unnamed: 0,product_rk,store_location_rk,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
20,40369,317,1.0,500.000000,500.000000,1.0,2016,12,26
21,40370,317,1.0,1000.000000,1000.000000,1.0,2016,12,26
22,40372,317,1.0,2000.000000,2000.000000,1.0,2016,12,26
23,40373,317,1.0,3000.000000,3000.000000,1.0,2016,12,26
24,46272,317,1.0,157.000000,157.000000,1.0,2016,12,26
...,...,...,...,...,...,...,...,...,...
35309,40370,1380,0.0,1000.000000,1000.000000,1.0,2019,11,25
35310,40372,1380,0.0,2000.000000,2000.000000,1.0,2019,11,25
35311,40373,1380,0.0,3000.000000,3000.000000,1.0,2019,11,25
35312,46272,1380,1.0,284.290000,199.000000,1.0,2019,11,25


In [58]:
y

Unnamed: 0,demand
20,65.0
21,83.0
22,30.0
23,7.0
24,35.0
...,...
35309,24.0
35310,11.0
35311,3.0
35312,0.0


In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [63]:
#оптимальные параметры модели были подобраны вручную
#обучим модель и проверим ее качество
regressor = GradientBoostingRegressor(
    max_depth=11,
    n_estimators=420,
    learning_rate=0.2,
    random_state=1,
    min_samples_leaf=11,
    min_samples_split=2,
    loss='absolute_error'
)
regressor.fit(X_train, y_train)

In [61]:
y_pred = regressor.predict(X_test)
mean_absolute_error(y_test, y_pred)

5.062950326564885

In [64]:
#предскажем искомые значения спроса для нашей тестовой выборки (с декабря 2019)
X_test = data_test.drop(['id', 'predicted'], axis=1)
X_test

Unnamed: 0,product_rk,store_location_rk,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
908,40369,317,0.0,500.00,500.0,1.0,2019,12,2
909,40370,317,0.0,1000.00,1000.0,1.0,2019,12,2
910,40372,317,0.0,2000.00,2000.0,1.0,2019,12,2
911,40373,317,0.0,3000.00,3000.0,1.0,2019,12,2
912,46272,317,1.0,284.29,199.0,1.0,2019,12,2
...,...,...,...,...,...,...,...,...,...
35339,40370,1380,0.0,1000.00,1000.0,1.0,2019,12,30
35340,40372,1380,0.0,2000.00,2000.0,1.0,2019,12,30
35341,40373,1380,0.0,3000.00,3000.0,1.0,2019,12,30
35342,46272,1380,1.0,284.29,199.0,1.0,2019,12,30


In [None]:
#предскажем значения спроса для тестовой выборки
y_pred_res = regressor.predict(X_test)
y_pred_res

array([ 5.82334136,  8.14585683,  5.69535062, ..., 24.95657769,
        5.98159285,  7.92182532])

In [None]:
#создадим датафрейм y_results, в который поместим результаты
y_results = data_test[['id', 'predicted']]
y_results['predicted'] = y_pred_res
y_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_results['predicted'] = y_pred_res


Unnamed: 0,id,predicted
908,908,5.823341
909,909,8.145857
910,910,5.695351
911,911,4.157089
912,912,4.604878
...,...,...
35339,35537,81.654309
35340,35538,39.476182
35341,35539,24.956578
35342,35540,5.981593


In [None]:
# удалим отрицательыне прогнозы из выборки
y_results.loc[y_results['predicted'] < 0, ['predicted']]

Unnamed: 0,predicted
5520,-1.477845
5526,-0.143918
6473,-0.693504
24914,-0.096025
24915,-0.345976
24917,-0.425856
24923,-0.811523
31373,-0.563163
31379,-1.254942
31385,-1.987016


In [None]:
#заменим все отрицательные предсказанные значения спроса на ноль
y_results.loc[y_results['predicted'] < 0, ['predicted']] = 0
y_results.loc[y_results['predicted'] < 0, ['predicted']]

Unnamed: 0,predicted


In [None]:
#Записываем полученный датафрейм в csv файл:
y_results.to_csv('submission_example.csv',sep=',', encoding='utf-8', index=False)