In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.functional as F
from tqdm import tqdm
import gensim.downloader as api

In [2]:
CPU = torch.device('cpu')
GPU = torch.device('cuda')

Utils

In [3]:
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    if isinstance(data, dict):
        return dict((k, to_device(v, device)) for k, v in data.items())
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

# Preprocessing data

In [4]:
DATA_DIR = 'Train_rev1/Train_rev1.csv'

In [5]:
class SalaryDataset(Dataset):
    # token threshold
    MIN_COUNT = 10

    # special tokens for unknown and empty words
    PAD, UNK = 'PAD', 'UNK'
    PAD_IX, UNK_IX = 0, 1

    TEXT_COLS = ['Title', 'FullDescription']
    CATEGORIAL_COLS = ['Category', 'Company', 'LocationNormalized', 'ContractType', 'ContractTime']
    TARGET_COL = 'Log1pSalary'

    MAX_TITLE_LENGHT = 20
    MAX_DESC_LENGHT = 500

    def _process_data(self):
        # use logarithm to get rid of bad distribution
        self._data[self.TARGET_COL] = np.log1p(self._data['SalaryNormalized']).astype('float32')

        # cast missing values to string "NaN"
        self._data[self.CATEGORIAL_COLS] = self._data[self.CATEGORIAL_COLS].fillna('NaN')
        self._data[self.TEXT_COLS] = self._data[self.TEXT_COLS].fillna('NaN')

        # convert text fields to space-separated string of tokens
        tokenizer = nltk.tokenize.WordPunctTokenizer()
        self._data[self.TEXT_COLS] = self._data[self.TEXT_COLS].applymap(lambda x: " ".join(tokenizer.tokenize(x.lower())))

        # count how many times does each token occur in both "Title" and "FullDescription" in total
        for col in self._data[self.TEXT_COLS]:
            for line in self._data[col].values:
                self._tok_cntr.update(line.split(" "))

        # get a list of all tokens that occur at least MIN_COUNT times
        self._tokens = sorted(t for t, c in self._tok_cntr.items() if c >= self.MIN_COUNT)

        # add a special tokens for unknown and empty words
        self._tokens = [self.PAD, self.UNK] + self._tokens

        # build an inverse token index: a dictionary from token(string) to it's index in tokens(int)
        self._token_to_id = {t: i for i, t in enumerate(self._tokens)}

        # we only consider top-1k most frequent companies to minimize memory usage
        top_companies, top_counts = zip(*Counter(self._data['Company']).most_common(1000))
        recognized_companies = set(top_companies)
        self._data["Company"] = self._data["Company"].apply(lambda comp: comp if comp in recognized_companies else "Other")

        # encode the categorical data we have
        self._categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False).fit(self._data[self.CATEGORIAL_COLS].apply(dict, axis=1))

    def __init__(self, path: str):
        self._data = pd.read_csv(path)
        self._tok_cntr = Counter()
        self._process_data()

    def __getitem__(self, i):
        row = self._data[i:i+1]

        title = row['Title'].values[0]
        desc = row['FullDescription'].values[0]

        title_vals_encoded = [self._token_to_id.get(tok, self.UNK_IX) for tok in str.split(title, ' ')]
        desc_vals_encoded = [self._token_to_id.get(tok, self.UNK_IX) for tok in str.split(desc, ' ')]

        return {
            'Title': title_vals_encoded,
            'FullDescription': desc_vals_encoded,
            self.TARGET_COL: row[self.TARGET_COL].values[0],
            'Categorical': self._categorical_vectorizer.transform(row[self.CATEGORIAL_COLS].apply(dict, axis=1)).flatten().tolist()
        }

    def __len__(self):
        return len(self._data)

Create dataset

In [6]:
dataset = SalaryDataset(DATA_DIR)
NUM_TOKENS = len(dataset._tokens)
NUM_CAT_FEATURES = len(dataset._categorical_vectorizer.vocabulary_)

Split to train / test / validation subsets

In [7]:
# the sizes is 0.7, 0.1, 0.2 of the original data size, respectively
train_size = round(len(dataset)*0.7)
val_size = round((len(dataset) - train_size) * (1/3))
test_size = (len(dataset) - train_size) - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], torch.Generator().manual_seed(12))

In [8]:
def collate_fn(data):
    """ Converts batch values to tensors and adds padding """
    collated = dict(zip(data[0].keys(), [[], [], [], []]))

    for d in data:
        for k, v in d.items():
            if k == 'Title':
                v.extend([SalaryDataset.PAD_IX] * (SalaryDataset.MAX_TITLE_LENGHT - len(v))) # padding
                v = v[:SalaryDataset.MAX_TITLE_LENGHT]
                collated[k].append(v)
            elif k == 'FullDescription':
                v.extend([SalaryDataset.PAD_IX] * (SalaryDataset.MAX_DESC_LENGHT - len(v)))  # padding
                v = v[:SalaryDataset.MAX_DESC_LENGHT]
                collated[k].append(v)
            else:
                collated[k].append(v)

    for k, v in collated.items():
        t = torch.float32 if k in [SalaryDataset.TARGET_COL, 'Categorical'] else torch.int32
        collated[k] = torch.as_tensor(v, dtype=t)
        
    return collated

In [9]:
train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=128, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)

# Deep learning, finally

In [10]:
class SalaryPredictor(nn.Module):
    def __init__(self, n_tokens=NUM_TOKENS, n_cat_features=NUM_CAT_FEATURES, hid_size=8):
        super().__init__()
        self.n_tokens = n_tokens
        self.n_cat_features = n_cat_features
        self.hid_size = hid_size
        self.embedder = nn.Embedding(n_tokens, hid_size)
        self.title_encoder = nn.Sequential(
            nn.Conv1d(hid_size, hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(hid_size, hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(
            nn.Linear(n_cat_features, hid_size * 2),
            nn.ReLU(),
            nn.Linear(hid_size * 2, hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(hid_size * 4, hid_size),
            nn.ReLU(),
            nn.Linear(hid_size, 1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze()

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze()

        categorical_features = self.categorical_encoder(batch['Categorical'])

        features = torch.cat([title_features, description_features, categorical_features], dim=1)
        return self.final_predictor(features).squeeze()

In [11]:
def evaluate(model, device=None):
    squared_error = abs_error = num_samples = 0.0

    loader = val_loader if not device else DeviceDataLoader(val_loader, device)

    model.eval()
    with torch.no_grad():
        for batch in loader:
            pred = model(batch)
            squared_error += torch.mean(torch.square(pred - batch[SalaryDataset.TARGET_COL]))
            abs_error += torch.mean(torch.abs(pred - batch[SalaryDataset.TARGET_COL]))
            num_samples += len(batch)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples

    return mse, mae

In [12]:
def train(model, optimizer, epoches=5, device=None, criterion=nn.MSELoss(reduction='mean')):
    loader = train_loader

    if device:
        model.to(device)
        loader = DeviceDataLoader(train_loader, device)

    for epoch in range(epoches):
        model.train()
        for batch in tqdm(loader):
            optimizer.zero_grad(set_to_none=True)
            pred = model(batch)
            loss = criterion(pred, batch[SalaryDataset.TARGET_COL])
            loss.backward()
            optimizer.step()

        mse, mae = evaluate(model, device)
        print(f'Epoch: {epoch+1} | Loss: {loss.item()} | Validation: MSE={mse}/MAE={mae}')

    if device:
        model.cpu()

In [13]:
def test(model, device=None):
    squared_error = abs_error = num_samples = 0.0
    loader = test_loader

    if device:
        model.to(device)
        loader = DeviceDataLoader(test_loader, device)

    model.eval()
    with torch.no_grad():
        for x in loader:
            pred = model(x)
            squared_error += torch.mean(torch.square(pred - x[SalaryDataset.TARGET_COL]))
            abs_error += torch.mean(torch.abs(pred - x[SalaryDataset.TARGET_COL]))
            num_samples += len(x)

    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples

    if device:
        model.cpu()

    return mse, mae

# 1. Развейте СNN архитектуру

Начальные показатели:

In [14]:
model = SalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), epoches=3)

100%|██████████| 1339/1339 [03:02<00:00,  7.35it/s]


Epoch: 1 | Loss: 0.18208062648773193 | Validation: MSE=0.7155646483103434/MAE=0.4108586311340332


100%|██████████| 1339/1339 [02:58<00:00,  7.50it/s]


Epoch: 2 | Loss: 0.13678191602230072 | Validation: MSE=0.6288680632909139/MAE=0.38589104016621906


100%|██████████| 1339/1339 [03:00<00:00,  7.43it/s]


Epoch: 3 | Loss: 0.1166573241353035 | Validation: MSE=0.557366689046224/MAE=0.3631752332051595


With BatchNorm and LayerNorm:

In [17]:
class BatchLayerNormSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.LayerNorm(SalaryDataset.MAX_TITLE_LENGHT-1),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.LayerNorm(SalaryDataset.MAX_DESC_LENGHT-1),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(
            nn.Linear(self.n_cat_features, self.hid_size * 2),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 2),
            nn.Linear(self.hid_size * 2, self.hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(self.hid_size * 4, self.hid_size),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size),
            nn.Linear(self.hid_size, 1)
        )

С добавлением BatchNorm и LayerNorm исходная сеть начала показывать результаты намного лучше, особенно это заметно по значениям MSE/MAE

In [18]:
model = BatchLayerNormSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), epoches=3)

100%|██████████| 1339/1339 [03:04<00:00,  7.26it/s]


Epoch: 1 | Loss: 0.15004849433898926 | Validation: MSE=0.04239973425865173/MAE=0.08052261173725128


100%|██████████| 1339/1339 [03:08<00:00,  7.12it/s]


Epoch: 2 | Loss: 0.11695276945829391 | Validation: MSE=0.0338154137134552/MAE=0.07059418161710103


100%|██████████| 1339/1339 [03:07<00:00,  7.13it/s]


Epoch: 3 | Loss: 0.09838034957647324 | Validation: MSE=0.028148611386617024/MAE=0.06330659985542297


With parrallel conv layers

In [19]:
class ParrallelConvSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.title_encoder2 = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder2 = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(self.hid_size * 6, self.hid_size * 3),
            nn.ReLU(),
            nn.Linear(self.hid_size * 3, int(self.hid_size * 1.5)),
            nn.ReLU(),
            nn.Linear(int(self.hid_size * 1.5), 1)
        )

    def forward(self, batch):
            title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)

            title_features = self.title_encoder(title_embeddings).squeeze()
            title_features2 = self.title_encoder2(title_embeddings).squeeze()

            description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)

            description_features = self.description_encoder(description_embeddings).squeeze()
            description_features2 = self.description_encoder2(description_embeddings).squeeze()

            categorical_features = self.categorical_encoder(batch['Categorical'])

            title_features = torch.cat((title_features, title_features2), dim=1)
            description_features = torch.cat((description_features, description_features2), dim=1)
            features = torch.cat([title_features, description_features, categorical_features], dim=1)
            
            return self.final_predictor(features).squeeze()

При добавлении в исходную сеть параллельных сверточных слоев заметно лишь небольшое улучшение MSE и MAE

In [21]:
model = ParrallelConvSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:25<00:00,  6.51it/s]


Epoch: 1 | Loss: 0.15557654201984406 | Validation: MSE=0.8001741568247477/MAE=0.436512549718221


100%|██████████| 1339/1339 [03:23<00:00,  6.58it/s]


Epoch: 2 | Loss: 0.13028569519519806 | Validation: MSE=0.5427242517471313/MAE=0.35794345537821454


100%|██████████| 1339/1339 [03:20<00:00,  6.67it/s]


Epoch: 3 | Loss: 0.12404994666576385 | Validation: MSE=0.4783267180124919/MAE=0.3356606165568034


Mixed

In [22]:
class NewSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.embedder = nn.Embedding(self.n_tokens, self.hid_size * 2)  # change output size
        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),                              # add BatchNorm
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),     # add second convolution
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.title_encoder2 = nn.Sequential(                            # parrallel encoder for title
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),                              # add BatchNorm
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),     # add second convolution
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder2 = nn.Sequential(                      # parrallel encoder for description
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(                       # change sizes of Linear layers
            nn.Linear(self.n_cat_features, self.hid_size * 4),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 4),                            # add LayerNorm
            nn.Linear(self.hid_size * 4, self.hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(                           # add more linear layers
            nn.Linear(self.hid_size * 6, self.hid_size * 3),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 3),                            # add LayerNorm
            nn.Linear(self.hid_size * 3, int(self.hid_size * 1.5)),
            nn.ReLU(),
            nn.BatchNorm1d(int(self.hid_size * 1.5)),                   # add BatchNorm
            nn.Linear(int(self.hid_size * 1.5), 1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)

        title_features = self.title_encoder(title_embeddings).squeeze()
        title_features2 = self.title_encoder2(title_embeddings).squeeze()

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)

        description_features = self.description_encoder(description_embeddings).squeeze()
        description_features2 = self.description_encoder2(description_embeddings).squeeze()

        categorical_features = self.categorical_encoder(batch['Categorical'])

        title_features = torch.cat((title_features, title_features2), dim=1)
        description_features = torch.cat((description_features, description_features2), dim=1)
        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        
        return self.final_predictor(features).squeeze()

Используя и нормализацию и параллельные энкодеры получилось добиться результатов еще лучше

In [24]:
model = NewSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:40<00:00,  6.07it/s]


Epoch: 1 | Loss: 0.11877128481864929 | Validation: MSE=0.05135465164979299/MAE=0.0930001934369405


100%|██████████| 1339/1339 [03:42<00:00,  6.03it/s]


Epoch: 2 | Loss: 0.10364072024822235 | Validation: MSE=0.023742082218329113/MAE=0.057655890782674156


100%|██████████| 1339/1339 [03:37<00:00,  6.15it/s]


Epoch: 3 | Loss: 0.10311577469110489 | Validation: MSE=0.0232709397872289/MAE=0.0569325586160024


# Раняя остановка

In [25]:
def avg(lst: list):
    return sum(lst) / len(lst)

def delta(x):
    def delta_impl(y):
        return x - y
    return delta_impl

In [26]:
def early_stop_train(model, optimizer, epoches=5, device=None, criterion=nn.MSELoss(reduction='mean')):
    EXIT_CRITERION = 0.01
    loader = train_loader
    outputs: dict[str, list] = {}

    if device:
        model.to(device)
        loader = DeviceDataLoader(train_loader, device)

    for epoch in range(epoches):
        model.train()
        for batch in tqdm(loader):
            optimizer.zero_grad(set_to_none=True)
            pred = model(batch)
            loss = criterion(pred, batch[SalaryDataset.TARGET_COL])
            loss.backward()
            optimizer.step()

        mse, mae = evaluate(model, device)
        print(f'Epoch: {epoch+1} | Loss: {loss.item()} | Validation: MSE={mse}/MAE={mae}')

        # mean loss, mse, mae from last 5 iterations
        means = {k: avg(v[-5:]) for k,v in outputs.items()}
        deltas = [means['Loss'] - loss.item(), means['mse'] - mse, means['mae'] - mae]

        # if any of the deltas smaller than EXIT_CRITERION
        if any(x < EXIT_CRITERION for x in deltas):
            print('Stop criterion achieved')
            return

        outputs['Losses'].append(loss.item())
        outputs['mse'].append(mse)
        outputs['mae'].append(mae)

    if device:
        model.cpu()

# 2. Pooling слои стандартные

### Как работает pooling layer

Pooling слой это, фактически, фильтр заданного размера (kernel_size) который "скользит" по входной матрице и выдает число, в зависимости от алгоритма. Например, MaxPooling запишет в выходную матрицу только максимальное число из попавших в окно фильтра. Далее окно сдвигается на заданную величину (stride) и алгоритм повторяется, пока не будет покрыта вся входная матрица. AvgPooling вместо максимального числа выдает среднее среди тех, что попали в окно.

Максимум по временной компоненте, для каждой фичи

In [27]:
class MaxPoolSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU()
            # nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU()
            # nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(534, self.hid_size),
            nn.ReLU(),
            nn.Linear(self.hid_size, 1)
        )

    def forward(self, batch):
        max_pool = nn.AdaptiveMaxPool1d(output_size=1)

        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze().permute(0, 2, 1)
        title_features = max_pool(title_features).squeeze() # Max pooling

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze().permute(0, 2, 1)
        description_features = max_pool(description_features).squeeze() # Max pooling

        categorical_features = self.categorical_encoder(batch['Categorical'])

        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        return self.final_predictor(features).squeeze()

Использование MaxPool дало лишь немного меньшее значение loss по сравнению с исходной сетью, остальные параметры ухудшились, особенно MSE

In [28]:
model = MaxPoolSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [02:59<00:00,  7.46it/s]


Epoch: 1 | Loss: 0.13812217116355896 | Validation: MSE=0.7932665348052979/MAE=0.434911052385966


100%|██████████| 1339/1339 [03:02<00:00,  7.33it/s]


Epoch: 2 | Loss: 0.10082586109638214 | Validation: MSE=0.8566711743672689/MAE=0.45446570714314777


100%|██████████| 1339/1339 [03:02<00:00,  7.33it/s]


Epoch: 3 | Loss: 0.10044009983539581 | Validation: MSE=0.8534278869628906/MAE=0.4546033938725789


Среднее по временной компоненте (исключая PAD символы)

In [29]:
class AvgPoolSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            # nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.title_avgPooler = nn.AvgPool1d(19, stride=19, count_include_pad=False)

        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            # nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.desc_avgPooler = nn.AvgPool1d(499, stride=499, count_include_pad=False)

    def forward(self, batch):
        # Works like AdaptiveAvgPool1d(output_size=1), but have count_include_pad parameter, which should ignore PAD indices (maybe...)
        # nn.AvgPool1d(kernel_size=input_size-(output_size-1)*stride, stride=input_size/output_size, count_include_pad=False)

        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze()
        title_features = self.title_avgPooler(title_features).squeeze() # Average pooling

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze()
        description_features = self.desc_avgPooler(description_features).squeeze() # Average pooling

        categorical_features = self.categorical_encoder(batch['Categorical'])
        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        return self.final_predictor(features).squeeze()

Другое дело с Average Pooling'ом. С ним сеть показывает отличные результаты

In [31]:
model = AvgPoolSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:01<00:00,  7.39it/s]


Epoch: 1 | Loss: 0.15778055787086487 | Validation: MSE=0.03812142958243688/MAE=0.07580889264742534


100%|██████████| 1339/1339 [02:56<00:00,  7.58it/s]


Epoch: 2 | Loss: 0.12668080627918243 | Validation: MSE=0.028624010582764942/MAE=0.06433808306852977


100%|██████████| 1339/1339 [02:57<00:00,  7.56it/s]


Epoch: 3 | Loss: 0.1180286556482315 | Validation: MSE=0.02499747524658839/MAE=0.05994426210721334


# 3. Используйте предобученные эмбеддинги

Загрузите предобученные эмбеддинги с помощью gensim.downloader.load

Предобученный эмбеддинг с замороженными весами

In [32]:
class PretrainedFrozenSalaryPredictor(SalaryPredictor):
    def __init__(self, wordvec: torch.FloatTensor, hid_size=100):
        super().__init__(hid_size=100)
        self.embedder = nn.Embedding.from_pretrained(wordvec, freeze=True) # Freezed weights

In [33]:
kv = api.load('glove-wiki-gigaword-100')

Показывает убывающие низкие значения для loss, но ошибка все еще остается очень высокой

In [34]:
model = PretrainedFrozenSalaryPredictor(torch.FloatTensor(kv.vectors))
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [06:05<00:00,  3.66it/s]


Epoch: 1 | Loss: 0.09762098640203476 | Validation: MSE=1.3745265007019043/MAE=0.580855925877889


100%|██████████| 1339/1339 [06:07<00:00,  3.64it/s]


Epoch: 2 | Loss: 0.08756406605243683 | Validation: MSE=1.3878563245137532/MAE=0.5844357013702393


100%|██████████| 1339/1339 [06:09<00:00,  3.62it/s]


Epoch: 3 | Loss: 0.0801367312669754 | Validation: MSE=1.358284632364909/MAE=0.5782964626948038


Предобученный эмбеддинг с обучаемыми весами

In [37]:
class PretrainedEmbedSalaryPredictor(SalaryPredictor):
    def __init__(self, wordvec: torch.FloatTensor, hid_size=100):
        super().__init__(hid_size=hid_size)
        self.embedder = nn.Embedding.from_pretrained(wordvec, freeze=False) # Not freezed weights

С незамороженными весами ситуцаия таже

In [38]:
model = PretrainedEmbedSalaryPredictor(torch.FloatTensor(kv.vectors))
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [12:00<00:00,  1.86it/s]


Epoch: 1 | Loss: 0.10760591924190521 | Validation: MSE=1.3251994450887044/MAE=0.5707526604334513


100%|██████████| 1339/1339 [12:06<00:00,  1.84it/s]


Epoch: 2 | Loss: 0.09182409197092056 | Validation: MSE=1.3764427502950032/MAE=0.5826658805211385


100%|██████████| 1339/1339 [12:02<00:00,  1.85it/s]


Epoch: 3 | Loss: 0.06739814579486847 | Validation: MSE=1.4146699905395508/MAE=0.5909947554270426


# 4. Замените сверточные слои на рекуррентные

With LSTM

In [39]:
class LSTMSalaryPredictor(SalaryPredictor):
    def __init__(self, bidirectional=False):
        super().__init__()

        self.title_lstm = nn.LSTM(20, self.hid_size, bidirectional=bidirectional)
        self.title_encoder = nn.Sequential(
            # nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.desc_lstm = nn.LSTM(500, self.hid_size, bidirectional=bidirectional)
        self.description_encoder = nn.Sequential(
            # nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        lstm_out, (hn, cn) = self.title_lstm(title_embeddings) # LSTM

        title_features = self.title_encoder(lstm_out).squeeze()

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        lstm_out, (hn, cn) = self.desc_lstm(description_embeddings) # LSTM

        description_features = self.description_encoder(lstm_out).squeeze()

        categorical_features = self.categorical_encoder(batch['Categorical'])

        features = torch.cat(
            [title_features, description_features, categorical_features], dim=1)
        
        return self.final_predictor(features).squeeze()

bidirectional = False

In [40]:
model = LSTMSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:20<00:00,  6.69it/s]


Epoch: 1 | Loss: 0.18705296516418457 | Validation: MSE=0.29569631814956665/MAE=0.25262222687403363


100%|██████████| 1339/1339 [03:15<00:00,  6.84it/s]


Epoch: 2 | Loss: 0.15957172214984894 | Validation: MSE=0.252300759156545/MAE=0.232072651386261


100%|██████████| 1339/1339 [03:15<00:00,  6.86it/s]


Epoch: 3 | Loss: 0.13809098303318024 | Validation: MSE=0.15904400746027628/MAE=0.1786778966585795


bidirectional = True

In [41]:
model = LSTMSalaryPredictor(bidirectional=True)
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:57<00:00,  5.63it/s]


Epoch: 1 | Loss: 0.15448284149169922 | Validation: MSE=0.780951976776123/MAE=0.4307365417480469


100%|██████████| 1339/1339 [03:59<00:00,  5.59it/s]


Epoch: 2 | Loss: 0.14052137732505798 | Validation: MSE=0.7810777823130289/MAE=0.43112874031066895


100%|██████████| 1339/1339 [03:59<00:00,  5.58it/s]


Epoch: 3 | Loss: 0.13719868659973145 | Validation: MSE=0.810487429300944/MAE=0.4400142828623454


With GRU

In [42]:
class GRUSalaryPredictor(SalaryPredictor):
    def __init__(self, bidirectional=False):
        super().__init__()

        self.title_gru = nn.GRU(20, self.hid_size, bidirectional=bidirectional)
        self.title_encoder = nn.Sequential(
            # nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.desc_gru = nn.GRU(500, self.hid_size, bidirectional=bidirectional)
        self.description_encoder = nn.Sequential(
            # nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        gru_out, hn = self.title_gru(title_embeddings) # GRU

        title_features = self.title_encoder(gru_out).squeeze()

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        gru_out, hn = self.desc_gru(description_embeddings) # GRU

        description_features = self.description_encoder(gru_out).squeeze()

        categorical_features = self.categorical_encoder(batch['Categorical'])

        features = torch.cat([title_features, description_features, categorical_features], dim=1)
        
        return self.final_predictor(features).squeeze()

bidirectional = False

In [43]:
model = GRUSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [03:19<00:00,  6.70it/s]


Epoch: 1 | Loss: 0.13911426067352295 | Validation: MSE=0.3445599476496379/MAE=0.2771459420522054


100%|██████████| 1339/1339 [03:20<00:00,  6.68it/s]


Epoch: 2 | Loss: 0.13552053272724152 | Validation: MSE=0.35835285981496173/MAE=0.28341031074523926


100%|██████████| 1339/1339 [03:20<00:00,  6.69it/s]


Epoch: 3 | Loss: 0.13401997089385986 | Validation: MSE=0.3018399675687154/MAE=0.25797730684280396


bidirectional = True

In [44]:
model = GRUSalaryPredictor(bidirectional=True)
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [04:10<00:00,  5.35it/s]


Epoch: 1 | Loss: 0.1441812962293625 | Validation: MSE=0.32176361481348675/MAE=0.26619766155878705


100%|██████████| 1339/1339 [04:09<00:00,  5.36it/s]


Epoch: 2 | Loss: 0.13940666615962982 | Validation: MSE=0.3248337507247925/MAE=0.26810304323832196


100%|██████████| 1339/1339 [04:09<00:00,  5.38it/s]


Epoch: 3 | Loss: 0.13403506577014923 | Validation: MSE=0.35299789905548096/MAE=0.28113792339960736


LSTM отрабатывает лучше чем GRU, но по сравнению с другими сетями результат оставляет желать лучшего