# <center> 2.9 А нейронки будут? </center>

In [1]:
import numpy as np
import pandas as pd

In [2]:
car_info = pd.read_csv('../data/car_info.csv')   # car_info - информация про машины с таргетом
 
rides_info = pd.read_csv('../data/rides_info.csv') # rides_info - информация про поездки

driver_info = pd.read_csv('../data/driver_info.csv')  # driver_info - информация про водителей

fix_info = pd.read_csv('../data/fix_info.csv')  # fix_info - информация про ремонт машин

In [3]:
rides_info = rides_info.merge(car_info, on = 'car_id', how = 'left')

In [4]:
drop_cols = ['user_id', 'car_id', 'ride_id', 'ride_date']
cat_cols = ['car_type', 'fuel_type', 'model']

In [5]:
# закодируем категориальные фичи в one hot encoding вектора
rides_info = pd.get_dummies(rides_info, columns=cat_cols)

In [6]:
# заполним пропущенные значения медианным значением по столбцу
rides_info.fillna(rides_info.median(), inplace=True)

In [130]:
rides_info.target_1

0         2.638329e+06
1         2.638329e+06
2         2.638329e+06
3         2.638329e+06
4         2.638329e+06
              ...     
739495    1.691214e+06
739496    1.691214e+06
739497    1.691214e+06
739498    1.691214e+06
739499    1.691214e+06
Name: target_1, Length: 739500, dtype: float64

<h1><center>Для начала возьмем готовую реализацию нейронной сети MLP из sklearn</center></h1>

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [8]:
X = rides_info.drop(drop_cols + ['target_1','target_2'], axis=1)
y = rides_info['target_2']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Классификация**

In [None]:
clf = MLPClassifier(random_state=42, max_iter=100, verbose=True).fit(X_train, y_train)

Iteration 1, loss = 24.40812620
Iteration 2, loss = 22.26955516
Iteration 3, loss = 21.41796161
Iteration 4, loss = 20.71495361
Iteration 5, loss = 19.51847941
Iteration 6, loss = 18.27446599
Iteration 7, loss = 17.13075069
Iteration 8, loss = 15.53814696
Iteration 9, loss = 14.23789187
Iteration 10, loss = 13.33057990
Iteration 11, loss = 10.25473171
Iteration 12, loss = 8.78438723
Iteration 13, loss = 5.96714658
Iteration 14, loss = 2.42213394
Iteration 15, loss = 2.26213950
Iteration 16, loss = 2.13040114
Iteration 17, loss = 2.04077838
Iteration 18, loss = 2.01382844
Iteration 19, loss = 1.99364812
Iteration 20, loss = 1.99656262
Iteration 21, loss = 1.98729682
Iteration 22, loss = 1.97327101
Iteration 23, loss = 1.98245117
Iteration 24, loss = 1.97997040
Iteration 25, loss = 1.98438371
Iteration 26, loss = 1.98031743
Iteration 27, loss = 1.97283010
Iteration 28, loss = 1.97540593
Iteration 29, loss = 1.98236160
Iteration 30, loss = 1.99105657
Iteration 31, loss = 1.98454188
Iterat

In [19]:
preds = clf.predict(X_test)

In [20]:
preds[:5]

array(['gear_stick', 'engine_check', 'engine_overheat', 'gear_stick',
       'gear_stick'], dtype='<U15')

In [21]:
f1_score(y_test, preds, average='macro')

0.14255321691485337

## **Регрессия**

In [22]:
y = rides_info['target_1']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
regr = MLPRegressor(random_state=42).fit(X_train, y_train)



In [26]:
preds = regr.predict(X_test)
preds[:5]

array([2733966.98966003, 2460727.84467931, 2186622.38347925,
       3195041.05513761, 2097579.6097364 ])

In [27]:
mean_squared_error(y_test, preds)

7136535359497.65

## Теперь попробуем решить задачу с помощью самописного MLP с двумя головами. На фрэймворке PyTorch 

In [62]:
rides_df = rides_info.drop(columns=drop_cols)

In [63]:
# переведем строковые значения категориального таргета в целочисленные
le = LabelEncoder()
rides_df['target_2'] = le.fit_transform(rides_df['target_2'])

### GaussRankScaler - SOTA scaler для нейросетей, показывает лучший скор, чем скейлеры из sklearn

In [39]:
# для нормализации данных используем ранк гаусс трансформацию. 
# импортнем библеотеку и добавим путь к ней 
#!git clone https://github.com/aldente0630/gauss-rank-scaler
import sys
sys.path.append('./gauss-rank-scaler')

In [64]:
# числовые переменные, которые подвергнем трансформации
num_cols = [col for col in list(rides_df.columns)
            if col not in ['target_1', 'target_2']]

In [75]:
from gauss_rank_scaler import GaussRankScaler

scaler = GaussRankScaler()
df = scaler.fit_transform(rides_df[num_cols])

In [76]:
df = pd.DataFrame(df, columns=num_cols)

In [77]:
target_scaler = GaussRankScaler()
target = target_scaler.fit_transform(rides_df['target_1'].values.reshape(-1, 1))

In [78]:
df['target_1'] = target
df['target_2'] = rides_df['target_2']

In [79]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [80]:
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import os
import copy

In [97]:
# для наших данных и размера нейросети подойдет запуск на cpu
device = torch.device('cpu')

In [82]:
# ВАЖНО! - фиксируем воспроизводимость
def seed_everything(seed=42):
    
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [102]:
# можно попробовать поэкспериментировать с параметрами
class CFG:
    hidden_size=128
    dropout=0.1
    lr=1e-3
    batch_size=128
    num_workers=4
    epochs=20
    num_features=train.shape[1]-2 # кол-во фичей подаваемое на вход
    num_tar_2=train.target_2.nunique() # количество выходов равно кол-ву предсказываемых классов

In [88]:
# датасет выдает фичи и значения целевых переменных
class Rides(Dataset):
    
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx,:]
        
        data = row.drop(labels=['target_1', 'target_2'])
        
        data = torch.FloatTensor(data.values.astype('float'))
        tar_1 = torch.tensor(row['target_1']).float()
        tar_2 = row['target_2'].astype('int')
        
        return data, tar_1, tar_2

In [89]:
# проверим корректность работы методов нашего класса
next(iter(Rides(train)))

(tensor([-0.3788, -2.3073, -1.9011, -0.2533, -0.4003, -2.7511, -0.0485, -2.7511,
          1.0343,  0.0570, -0.7821,  2.7511,  1.0915,  0.2255, -2.7511,  2.7511,
         -2.7511, -2.7511, -2.7511,  2.7511, -2.7511, -2.7511, -2.7511, -2.7511,
         -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511,
         -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511,
          2.7511, -2.7511, -2.7511, -2.7511, -2.7511, -2.7511]),
 tensor(-1.8217),
 4)

In [90]:
train_datasets = {'train': Rides(train),
                  'val': Rides(test)}

In [103]:
dataloaders_dict = {x: torch.utils.data.DataLoader(train_datasets[x], 
                                                   batch_size=CFG.batch_size, 
                                                   shuffle=True, 
                                                   num_workers=CFG.num_workers)
                    for x in ['train', 'val']}

In [92]:
# Построим архитектуру mlp с двумя головами для регрессии и классифкации

class TabularNN(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.mlp = nn.Sequential(
                          nn.Linear(cfg.num_features, cfg.hidden_size),
                          #nn.BatchNorm1d(cfg.hidden_size),
                          nn.Dropout(cfg.dropout),
                          nn.ReLU(),
                          nn.Linear(cfg.hidden_size, cfg.hidden_size),
                          #nn.BatchNorm1d(cfg.hidden_size),
                          nn.Dropout(cfg.dropout),
                          nn.ReLU(),
                          nn.Linear(cfg.hidden_size, cfg.hidden_size//2),
                          )
        
        self.regressor = nn.Sequential(
            nn.Linear(cfg.hidden_size//2, 1)
        )
        self.classifier = nn.Sequential(
            nn.Linear(cfg.hidden_size//2, cfg.num_tar_2)
        )

    def forward(self, data):
        x = self.mlp(data)
        tar1 = self.regressor(x)
        tar2 = self.classifier(x)
        return tar1.view(-1), tar2

In [98]:
model = TabularNN(CFG).to(device)

In [99]:
# оптимайзер и лоссы для регрессии и классификации
optimizer = torch.optim.Adam(model.parameters(), lr = CFG.lr)
regression_criterion = nn.MSELoss().to(device)
classification_criterion = nn.CrossEntropyLoss().to(device)

In [104]:
# функция для тренировки, на выходе - обученная модель
def train_model(model, dataloaders, regression_criterion,
                classification_criterion, optimizer, num_epochs=25):

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100000.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0

            # Iterate over data.
            for inputs, labels_1, labels_2 in dataloaders[phase]:
                inputs = inputs.to(device)
                labels_1 = labels_1.to(device)
                labels_2 = labels_2.to(device)
                
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss

                    outputs_1, outputs_2 = model(inputs)
                    loss_1 = regression_criterion(outputs_1, labels_1)
                    loss_2 = classification_criterion(outputs_2, labels_2)

                    loss = loss_1 + loss_2

                    _, preds_2 = torch.max(outputs_2, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                val_acc_history.append(running_loss)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))

            # deep copy the model
            if phase == 'val' and epoch_loss < best_loss:
                best_model_wts = copy.deepcopy(model.state_dict())


    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [105]:
model_ft = train_model(model, dataloaders_dict, regression_criterion,
                classification_criterion, optimizer, num_epochs=22)

Epoch 0/21
----------
train Loss: 2.1124
val Loss: 1.9295
Epoch 1/21
----------
train Loss: 1.9634
val Loss: 1.7706
Epoch 2/21
----------
train Loss: 1.8606
val Loss: 1.6315
Epoch 3/21
----------
train Loss: 1.7908
val Loss: 1.5434
Epoch 4/21
----------
train Loss: 1.7332
val Loss: 1.4882
Epoch 5/21
----------
train Loss: 1.6922
val Loss: 1.4323
Epoch 6/21
----------
train Loss: 1.6565
val Loss: 1.4066
Epoch 7/21
----------
train Loss: 1.6273
val Loss: 1.3430
Epoch 8/21
----------
train Loss: 1.6056
val Loss: 1.3230
Epoch 9/21
----------
train Loss: 1.5823
val Loss: 1.2987
Epoch 10/21
----------
train Loss: 1.5639
val Loss: 1.2844
Epoch 11/21
----------
train Loss: 1.5445
val Loss: 1.2513
Epoch 12/21
----------
train Loss: 1.5311
val Loss: 1.2361
Epoch 13/21
----------
train Loss: 1.5163
val Loss: 1.1967
Epoch 14/21
----------
train Loss: 1.5027
val Loss: 1.1806
Epoch 15/21
----------
train Loss: 1.4920
val Loss: 1.1587
Epoch 16/21
----------
train Loss: 1.4808
val Loss: 1.1446
Epoch 1

In [106]:
# p1, p2 - предсказания; l1, l2  - истинные значения

p1 = []
p2 = []
l1 = []
l2 = []

with torch.set_grad_enabled(False):
    # Get model outputs and calculate loss
    for inputs, labels_1, labels_2 in dataloaders_dict['val']:
        inputs = inputs.to(device)
        labels_1 = labels_1.to(device)
        labels_2 = labels_2.to(device)
        l1.extend(labels_1.detach().cpu().numpy())
        l2.extend(labels_2.detach().cpu().numpy())
        
        outputs_1, outputs_2 = model_ft(inputs)
        _, outputs_2 = torch.max(outputs_2, 1)

        p1.extend(outputs_1.detach().cpu().numpy())
        p2.extend(outputs_2.detach().cpu().numpy())

**посмотрим на результаты по каждой из задач**

In [107]:
# классификация
from sklearn.metrics import f1_score
f1_score(l2, p2, average='macro')

0.7252206590869403

In [108]:
p2 = target_scaler.inverse_transform(np.array(p2).reshape(-1, 1))
l2 = target_scaler.inverse_transform(np.array(l2).reshape(-1, 1))

In [109]:
# регрессия
mean_squared_error(l2, p2)

52101958057491.28

чтобы улучшить скор можно поэкспериментировать с: 
- нормализацией данных, заполнением пропусков
- архитектурой сети
- попробовать lr sheduler

# А теперь решим задачу с помощью TABNET

https://arxiv.org/abs/1908.07442

In [110]:
X = rides_info.drop(drop_cols + ['target_1', 'target_2'], axis=1)
y = rides_info['target_2']

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m660.1 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.0
Note: you may need to restart the kernel to use updated packages.


In [114]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

## **Классификация**

In [116]:
clf = TabNetClassifier(device_name='cpu')
clf.fit(
  X_train.values, y_train,
  eval_set=[(X_test.values, y_test)]
)



epoch 0  | loss: 1.96338 | val_0_accuracy: 0.25959 |  0:00:22s
epoch 1  | loss: 1.86576 | val_0_accuracy: 0.27368 |  0:00:43s
epoch 2  | loss: 1.847   | val_0_accuracy: 0.28581 |  0:01:05s
epoch 3  | loss: 1.82419 | val_0_accuracy: 0.30055 |  0:01:27s
epoch 4  | loss: 1.79818 | val_0_accuracy: 0.31464 |  0:01:49s
epoch 5  | loss: 1.76458 | val_0_accuracy: 0.33977 |  0:02:11s
epoch 6  | loss: 1.72492 | val_0_accuracy: 0.36527 |  0:02:32s
epoch 7  | loss: 1.69302 | val_0_accuracy: 0.38609 |  0:02:54s
epoch 8  | loss: 1.66424 | val_0_accuracy: 0.39302 |  0:03:16s
epoch 9  | loss: 1.64177 | val_0_accuracy: 0.41574 |  0:03:37s
epoch 10 | loss: 1.61889 | val_0_accuracy: 0.42381 |  0:03:59s
epoch 11 | loss: 1.60064 | val_0_accuracy: 0.44385 |  0:04:20s
epoch 12 | loss: 1.58663 | val_0_accuracy: 0.44942 |  0:04:42s
epoch 13 | loss: 1.57366 | val_0_accuracy: 0.45852 |  0:05:04s
epoch 14 | loss: 1.56333 | val_0_accuracy: 0.46045 |  0:05:26s
epoch 15 | loss: 1.55365 | val_0_accuracy: 0.46949 |  0



In [117]:
preds = clf.predict(X_test.values)

In [41]:
preds[:5]

array(['engine_fuel', 'another_bug', 'another_bug', 'another_bug',
       'engine_check'], dtype='<U15')

In [118]:
f1_score(y_test, preds, average='macro')

0.5659570480334537

## **Регрессия**

In [119]:
y = rides_info['target_1']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
reg = TabNetRegressor(device_name='cpu')
reg.fit(
  X_train.values, y_train.values.reshape(-1,1),
  eval_set=[(X_test.values, y_test.values.reshape(-1,1))]
)



epoch 0  | loss: 13071166116457.59| val_0_mse: 13114763854235.57|  0:00:22s
epoch 1  | loss: 13036448048594.746| val_0_mse: 13082107736203.969|  0:00:44s
epoch 2  | loss: 12993154211911.0| val_0_mse: 13012934553076.578|  0:01:06s
epoch 3  | loss: 12938276803404.754| val_0_mse: 12971329225959.338|  0:01:28s
epoch 4  | loss: 12864850954122.873| val_0_mse: 12913032110380.582|  0:01:50s
epoch 5  | loss: 12774357662267.453| val_0_mse: 12762433003363.947|  0:02:12s
epoch 6  | loss: 12660569340475.453| val_0_mse: 12629622799360.06|  0:02:34s
epoch 7  | loss: 12509594837346.947| val_0_mse: 12383024418846.844|  0:02:56s
epoch 8  | loss: 12299034565051.672| val_0_mse: 12068202789007.113|  0:03:19s
epoch 9  | loss: 12039692082078.393| val_0_mse: 11826439971454.65|  0:03:41s
epoch 10 | loss: 11749502732085.686| val_0_mse: 11670502449324.525|  0:04:03s
epoch 11 | loss: 11406910236767.844| val_0_mse: 11287346087265.936|  0:04:25s
epoch 12 | loss: 11041688805333.412| val_0_mse: 10348520529247.09|  0:



In [125]:
preds = reg.predict(X_test.values)

In [126]:
preds[:5]

array([[5160755.5],
       [1466021.5],
       [1387069.1],
       [4700907. ],
       [1508347.6]], dtype=float32)

In [127]:
mean_squared_error(y_test.values.reshape(-1,1), preds)

2727736816756.805

# <center> Трансформеры </center>

![sheet](./images/tr.png) ![sheet](./images/hf.png)

## Библиотека transformers и платформа Hugging Face

Отличный туториал для начинающих: <a> https://habr.com/ru/post/704592/ </a>