In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim

import torchvision
import torchvision.transforms as transforms

from tqdm import tqdm_notebook

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [6]:
train = pd.read_csv('./data/train.csv')
train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


### Preprocessing

In [7]:
train['activation_date'] = pd.to_datetime(train.activation_date)
train['day_of_month'] = train.activation_date.apply(lambda x: x.day)
train['day_of_week'] = train.activation_date.apply(lambda x: x.weekday())


cols = ['region', 'category_name','city', 'user_type']

for col in tqdm_notebook(cols):
    gp = train.groupby(col)['deal_probability']
    mean = gp.mean()
    train[col + '_deal_probability_avg'] = train[col].map(mean)
    


train = train.drop(['city', 'category_name', 'user_id', 'description',
                    'image', 'parent_category_name', 'region',
                    'item_id', 'param_1', 'param_2', 'param_3',
                    'title', 'user_type', 'activation_date'], axis=1)

for col in train.columns:
    if train[col].isna().sum() > 0:
        train[col].fillna(train[col].median(), inplace=True)


train.info()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for col in tqdm_notebook(cols):


  0%|          | 0/4 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 10 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   price                               1503424 non-null  float64
 1   item_seq_number                     1503424 non-null  int64  
 2   image_top_1                         1503424 non-null  float64
 3   deal_probability                    1503424 non-null  float64
 4   day_of_month                        1503424 non-null  int64  
 5   day_of_week                         1503424 non-null  int64  
 6   region_deal_probability_avg         1503424 non-null  float64
 7   category_name_deal_probability_avg  1503424 non-null  float64
 8   city_deal_probability_avg           1503424 non-null  float64
 9   user_type_deal_probability_avg      1503424 non-null  float64
dtypes: float64(7), int64(3)
memory usage: 114.7 MB


In [8]:
train.head()

Unnamed: 0,price,item_seq_number,image_top_1,deal_probability,day_of_month,day_of_week,region_deal_probability_avg,category_name_deal_probability_avg,city_deal_probability_avg,user_type_deal_probability_avg
0,400.0,2,1008.0,0.12789,28,1,0.122004,0.198445,0.123397,0.149557
1,3000.0,19,692.0,0.0,26,6,0.136721,0.191848,0.1394,0.149557
2,4000.0,9,3032.0,0.43177,20,0,0.135944,0.171572,0.124881,0.149557
3,2200.0,286,796.0,0.80323,25,5,0.142602,0.198445,0.135031,0.124513
4,40000.0,3,2264.0,0.20797,16,3,0.145908,0.278427,0.137275,0.149557


In [9]:
train, test = train_test_split(train, test_size=0.25, random_state=42)

### Создать Dataset для загрузки данных

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, df, normalize=False, fit_scaler=False):
        self.df = df.copy()
        self.normalize = normalize
        self.scaler = MinMaxScaler()
        self.fit_scaler = fit_scaler
        self.sc_fl = 0

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        label = self.df.iloc[idx, -1:]
        if self.fit_scaler:
            self.scaler.fit(df.iloc[:, :-1])
            self.sc_fl = 1

        if self.normalize and sc_fl:
            df = scaler.transform(df)

        tensor = torch.FloatTensor(self.df.iloc[idx, 1:-1].values)
        label = torch.FloatTensor(label.values)

        return tensor, label

In [12]:
df_train, df_test = MyDataset(train), MyDataset(test)

### Обернуть его в Dataloader

In [17]:
BATCH_SIZE = 1024

In [14]:
train_loader = torch.utils.data.DataLoader(df_train,
                                           batch_size=BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=3)

test_loader = torch.utils.data.DataLoader(df_test,
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          num_workers=3)

### Написать архитектуру сети, которая предсказывает число показов на основании числовых данных. Сеть должна включать BatchNorm слои и Dropout

In [27]:
class FFNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FFNetwork, self).__init__()
        
        self.bn1 = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, 5*hidden_dim)
        self.dp1 = nn.Dropout(0.25)

        self.bn2 = nn.BatchNorm1d(5*hidden_dim)
        self.fc2 = nn.Linear(5*hidden_dim, 2*hidden_dim)
        self.dp2 = nn.Dropout(0.15)

        self.bn4 = nn.BatchNorm1d(2*hidden_dim)
        self.fc4 = nn.Linear(2*hidden_dim, 1)

    def forward(self, x):
        x = self.bn1(x)
        x = self.fc1(x)
        x = F.tanh(x)
        x = self.dp1(x)

        x = self.bn2(x)
        x = self.fc2(x)
        x = F.tanh(x)
        x = self.dp2(x)

        x = self.bn4(x)
        x = self.fc4(x)
        x = F.sigmoid(x)

        return x

### Учить будем на функцию потерь с кагла (log RMSE)

In [None]:
def log_rmse(y_pred, y_true):
    loss = torch.sqrt(torch.mean(
        (torch.log(y_pred+1)-torch.log(y_true+1))**2)
    )
    return loss

In [37]:
def trin_eval_loss(ev_dataloader, model):

    size = len(ev_dataloader.dataset)
    num_batches = len(ev_dataloader)
    test_loss = 0

    with torch.no_grad():
        for X, y in ev_dataloader:
            pred = model(X)
            test_loss += log_rmse(pred, y)

    test_loss /= num_batches

    return test_loss

In [38]:
def train_proc(tr_dataloader, ev_dataloader, model, optimizer):

    size = len(tr_dataloader.dataset)

    for batch, (X, y) in enumerate(tr_dataloader):
        y_pred = model(X)
        loss = log_rmse(y_pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return trin_eval_loss(ev_dataloader, model)

## Сравните сходимость Adam, RMSProp и SGD, сделайте вывод по качеству работы модели

In [41]:
EPOCHES = 3
LR = 0.01

### SGD

In [42]:
model = FFNetwork(8, 5)

In [43]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [44]:
%%time
for epoch in tqdm_notebook(range(EPOCHES)):
    print(f'Epoch {epoch+1}\n-------------------------------')

    result = train_proc(train_loader,
                             test_loader,
                             model,
                             optimizer
                             )
    print(f'Test loss: {result}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1
-------------------------------
Test loss: 0.013198159635066986
Epoch 2
-------------------------------
Test loss: 0.013446185737848282
Epoch 3
-------------------------------
Test loss: 0.013210585340857506
CPU times: user 1h 2min 40s, sys: 27.7 s, total: 1h 3min 8s
Wall time: 26min 49s


### RMSProp

In [45]:
model = FFNetwork(8, 5)

In [46]:
optimizer = torch.optim.RMSprop(model.parameters(), lr=LR)

In [47]:
%%time
for epoch in tqdm_notebook(range(EPOCHES)):
    print(f'Epoch {epoch+1}\n-------------------------------')

    result = train_proc(train_loader,
                             test_loader,
                             model,
                             optimizer
                             )
    print(f'Test loss: {result}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1
-------------------------------
Test loss: 0.013501441106200218
Epoch 2
-------------------------------
Test loss: 0.014313501305878162
Epoch 3
-------------------------------
Test loss: 0.014660804532468319
CPU times: user 1h 3min 29s, sys: 28.1 s, total: 1h 3min 58s
Wall time: 26min 53s


### Adam

In [48]:
model = FFNetwork(8, 5)

In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [50]:
%%time
for epoch in tqdm_notebook(range(EPOCHES)):
    print(f'Epoch {epoch+1}\n-------------------------------')

    result = train_proc(train_loader,
                             test_loader,
                             model,
                             optimizer
                             )
    print(f'Test loss: {result}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1
-------------------------------
Test loss: 0.013111072592437267
Epoch 2
-------------------------------
Test loss: 0.01341033075004816
Epoch 3
-------------------------------
Test loss: 0.013045982457697392
CPU times: user 1h 56s, sys: 27.4 s, total: 1h 1min 24s
Wall time: 26min 19s


##### по полученным данным (не факт, что они реливантны, слишком маленький эксперимент) лучшим оказался метод Adam, как по сходимости, так и по качеству