In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import torch
from icecream import ic
from tqdm import tqdm
from utils import queries_embeddings, load_passages_tensors, train_raw_df
from NN.CustomDataset import CustomDataset

In [2]:

# Load the passage tensors once so later cells can reuse them; the progress bar
# recorded in this cell's output shows the load taking roughly a minute and a half.
p_tensors_all = load_passages_tensors()
# Bare string as the cell's last expression: the notebook displays it once loading is over.
'done'

100%|██████████| 30/30 [01:37<00:00,  3.25s/it]


'done'

In [3]:
# Module-level switch read by LogisticRegression.fit: when True, each epoch's batches
# are wrapped in a tqdm progress bar that also shows the latest batch loss.
verbose = False


def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows in ``exp`` for large negative inputs. Here
    ``exp`` is only ever evaluated on non-positive values, so it cannot
    overflow, and the two algebraically equivalent forms are selected per
    element depending on the sign of ``x``.

    Args:
        x: scalar or array-like of logits.

    Returns:
        A numpy scalar for scalar input, otherwise an ndarray shaped like ``x``,
        with values in ``[0, 1]``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1]: it may underflow to 0 but never overflows.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple unwraps 0-d results into a numpy scalar and
    # leaves n-d arrays untouched.
    return result[()]


def BCEloss(y, h, eps=1e-12):
    """Per-sample Bernoulli log-likelihood ``y*log(h) + (1-y)*log(1-h)``.

    Note the sign: this is the *negative* of the binary cross-entropy, so a
    caller that wants a loss to minimise has to negate the result.

    Predictions are clipped to ``[eps, 1 - eps]`` before taking logarithms.
    Without the clip a saturated prediction (``h`` exactly 0 or 1) produces
    ``0 * log(0) = nan``, which would poison any sum or mean taken over it.

    Args:
        y: target labels in ``{0, 1}`` (scalar or array-like).
        h: predicted probabilities, broadcastable against ``y``.
        eps: clipping margin applied to ``h``; the default keeps results for
            non-saturated predictions unchanged to within float precision.

    Returns:
        Element-wise log-likelihood with the broadcast shape of ``y`` and ``h``.
    """
    # Work in float64 so that 1 - eps is representable and the clip is effective.
    h = np.clip(np.asarray(h, dtype=np.float64), eps, 1 - eps)
    return y * np.log(h) + (1 - y) * np.log(1 - h)


class LogisticRegression():
    """Binary logistic regression trained with mini-batch gradient descent.

    The class relies on three names defined at notebook level: ``sigmoid`` and
    ``BCEloss`` for the math, and the ``verbose`` flag for progress reporting.

    Attributes:
        learning_rate: step size of each gradient update.
        n_iterations: number of epochs performed by every call to ``fit``.
        n_features: length of the weight vector created on first use.
        w, b: weight vector and bias; ``None`` until first fit or ``load``.
        losses: one entry per epoch (the loss of that epoch's last batch).
        accuracies: one evaluator result per epoch, when an evaluator is used.
        history: DataFrame built by ``get_history``; ``None`` until then.
        continue_training: set to True by ``load``.
    """

    # Column labels of the evaluation metrics reported by get_history.
    _MAP_COLUMNS = ['mAP@3', 'mAP@10', 'mAP@100']
    _NDCG_COLUMNS = ['NDCG@3', 'NDCG@10', 'NDCG@100']

    def __init__(self, learning_rate, n_iterations, n_features=600):
        """Store hyper-parameters; weights are created lazily by ``fit``.

        Args:
            learning_rate: gradient descent step size.
            n_iterations: epochs to run per ``fit`` call.
            n_features: input dimensionality (defaults to the 600 used so far).
        """
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.n_features = n_features
        self.accuracies = []
        self.continue_training = False
        self.losses = np.array([])
        self.w = None
        self.b = None
        # Defined up front so save() works even before any history was built.
        self.history = None

    def _init_weights(self):
        """Create zero weights if none exist and reserve loss slots for one fit."""
        if self.w is None and self.b is None:
            self.w = np.zeros(self.n_features)
            self.b = 0.0

        # Pre-allocate one slot per upcoming epoch; fit() fills them in.
        self.losses = np.concatenate([self.losses, np.zeros(self.n_iterations)])

    def fit(self, dataloader, evaluator=None):
        """Run ``n_iterations`` epochs of mini-batch gradient descent.

        Epoch numbering continues after the losses already recorded, so calling
        ``fit`` again (or after ``load``) extends the existing history.

        Args:
            dataloader: iterable yielding ``(x_batch, y_batch)`` pairs; it must
                support ``len()`` when the module-level ``verbose`` is True.
            evaluator: optional callable that receives ``self.forward`` and
                returns the metrics stored in ``self.accuracies``.
        """
        start_epoch = len(self.losses)

        self._init_weights()

        # gradient descent
        for epoch in range(start_epoch, start_epoch + self.n_iterations):
            ic(epoch)

            if verbose:
                batches = tqdm(dataloader, unit='batch', total=len(dataloader))
            else:
                batches = dataloader

            # NaN marks an epoch that saw no batches, instead of failing with a
            # NameError on an empty dataloader.
            loss = np.nan
            for x_batch, y_batch in batches:
                loss = self._fit_batch(x_batch, y_batch)

                if verbose:
                    batches.set_postfix({'loss': loss})
            # Only the loss of the epoch's last batch is recorded.
            self.losses[epoch] = loss

            if evaluator is not None:
                self.accuracies.append(evaluator(self.forward))

        self.get_history()
        print("done")

    def _fit_batch(self, x, y):
        """Apply one gradient step on a batch and return its mean BCE loss.

        The returned loss is computed from the predictions made *before* the
        weights are updated.
        """
        n = x.shape[0]
        h = self.forward(x)
        residual = h - y

        step = self.learning_rate / n
        self.w -= step * x.T.dot(residual)
        self.b -= step * residual.sum()

        # BCEloss returns the log-likelihood, hence the negation.
        return -BCEloss(y, h).sum() / n

    def forward(self, x):
        """Return the predicted probability for every row of ``x``."""
        return sigmoid(x.dot(self.w) + self.b)

    def save(self, path):
        """Persist weights, losses, accuracies and history with ``torch.save``."""
        return torch.save({'w': self.w, "b": self.b, 'history': self.history,
                           'losses': self.losses, 'accuracies': self.accuracies}, path)

    def load(self, path):
        """Restore a checkpoint written by ``save`` so training can continue.

        NOTE(review): torch.load unpickles arbitrary Python objects; only load
        checkpoints from a trusted source.
        """
        value = torch.load(path)
        self.w = value['w']
        self.b = value['b']
        # Drop zero entries, i.e. pre-allocated slots that were never filled
        # (a genuine loss of exactly 0 would be dropped as well).
        stored_losses = value['losses']
        self.losses = stored_losses[stored_losses != 0]
        self.accuracies = value['accuracies']
        self.history = value['history']
        self.continue_training = True

    def get_history(self):
        """Build, store and return the per-epoch history table.

        Returns:
            DataFrame with one row per recorded epoch and the columns
            mAP@3, mAP@10, mAP@100, NDCG@3, NDCG@10, NDCG@100, Loss. The metric
            columns are NaN when no evaluator results were collected.

        Raises:
            ValueError: if evaluator results exist but their count differs
                from the number of recorded losses.
        """
        loss_frame = pd.DataFrame({'Loss': np.asarray(self.losses, dtype=float)})
        metric_columns = self._MAP_COLUMNS + self._NDCG_COLUMNS

        if len(self.accuracies) == 0:
            # Training ran without an evaluator: keep the layout, leave metrics empty.
            metric_frame = pd.DataFrame(np.nan, index=loss_frame.index,
                                        columns=metric_columns)
        else:
            if len(self.accuracies) != len(loss_frame):
                raise ValueError(
                    f"history is inconsistent: {len(self.accuracies)} evaluator "
                    f"results for {len(loss_frame)} recorded losses")
            # Each evaluator result is indexed as (mAP values, NDCG values).
            rows = np.array([np.concatenate([np.ravel(a[0]), np.ravel(a[1])])
                             for a in self.accuracies], dtype=float)
            metric_frame = pd.DataFrame(rows, index=loss_frame.index,
                                        columns=metric_columns)

        self.history = pd.concat([metric_frame, loss_frame], axis=1)
        return self.history


class DataLoader:
    """Iterate over a CustomDataset in batches of queries, yielding numpy arrays.

    Every dataset item supplies ``passages_per_query`` rows; a batch stacks the
    rows of up to ``batch_size`` consecutive queries and flattens each row's
    two embeddings into a single feature vector.
    """

    def __init__(self, batch_size: int, passages_per_query: int, p_tensors=None, dataframe=None, q_tensors=None,
                 embedding_dim: int = 300):
        """Build the underlying dataset, loading any input that was not supplied.

        Args:
            batch_size: number of queries per batch; must be at least 1.
            passages_per_query: number of passages drawn for every query.
            p_tensors: passage tensors; loaded with ``load_passages_tensors`` if None.
            dataframe: training dataframe; read from ``train_raw_df`` if None.
            q_tensors: query embeddings; loaded from ``queries_embeddings`` if None.
            embedding_dim: size of one embedding; every yielded row has
                ``2 * embedding_dim`` features.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        if p_tensors is None:
            p_tensors = load_passages_tensors()

        if q_tensors is None:
            q_tensors = torch.load(queries_embeddings, map_location=torch.device('cpu'))

        if dataframe is None:
            dataframe = pd.read_parquet(train_raw_df)

        self.dataset = CustomDataset(all_dataframe=dataframe,
                                     passages_tensors=p_tensors,
                                     queries_tensors=q_tensors,
                                     passages_per_query=passages_per_query,
                                     return_tensors='cat',
                                     shuffle_passages=False)

        self.passages_per_query = passages_per_query
        self.embedding_dim = embedding_dim

        self.batch_size = batch_size
        self.num_batches = len(self)

        ic('DataLoader', len(self.dataset), self.num_batches, self.batch_size)

    def __len__(self):
        """Number of batches one pass over the dataset yields."""
        # Ceiling division: the former `// batch_size + 1` reported one batch too
        # many whenever the dataset size was an exact multiple of batch_size.
        return -(-len(self.dataset) // self.batch_size)

    def __iter__(self):
        """Yield ``(x, y)`` per batch.

        ``x`` has shape ``(queries_in_batch * passages_per_query, 2 * embedding_dim)``
        and ``y`` holds the matching labels as a 1-D array.
        """
        n_queries = len(self.dataset)
        per_query = self.passages_per_query

        for start in range(0, n_queries, self.batch_size):
            # The final batch may contain fewer queries than batch_size.
            end = min(start + self.batch_size, n_queries)
            n_rows = (end - start) * per_query

            x = np.zeros((n_rows, 2, self.embedding_dim))
            y = np.zeros(n_rows)
            for slot, q_idx in enumerate(range(start, end)):
                xx, yy = self.dataset[q_idx]
                row_start = slot * per_query
                row_end = row_start + per_query

                x[row_start:row_end, ...] = xx
                y[row_start:row_end] = yy

            # Flatten the pair of embeddings in every row into one feature vector.
            yield x.reshape(-1, 2 * self.embedding_dim), y





In [4]:
# NOTE(review): this binds a module named `eval`, which shadows Python's built-in
# eval() for the rest of the notebook; consider renaming the module or aliasing it.
import eval

# The validation handler converts x with .numpy() and flattens it to rows of
# 600 features (presumably x is a torch tensor — confirm against eval.init_evaluator).
evaluator=eval.init_evaluator(
    x_val_handler=lambda x: x.numpy().reshape(-1, 600))


# Load the query embeddings onto the CPU and read the training dataframe once,
# so they can be handed to DataLoader instead of being reloaded there.
q_tensors = torch.load(queries_embeddings, map_location=torch.device('cpu'))
dataframe = pd.read_parquet(train_raw_df)


In [5]:
# Batches of 256 queries with 20 passages per query, built from the passage
# tensors, query embeddings and dataframe that earlier cells already loaded.
dataloader = DataLoader(
    batch_size=256,
    passages_per_query=20,
    p_tensors=p_tensors_all,
    dataframe=dataframe,
    q_tensors=q_tensors,
)

ic| 'DataLoader': 'DataLoader'
    len(self.dataset): 4521
    self.num_batches: 18
    self.batch_size: 256


In [None]:
# Run: learning rate 10, 200 epochs. The weights go to `<name>.pth` and the
# per-epoch history table to `<name>.dataframe`.
name = './nsample20_200_10'

model = LogisticRegression(learning_rate=10, n_iterations=200)
# To continue from an earlier checkpoint instead of starting from zero weights,
# call model.load('<checkpoint>.pth') here, before fitting.
model.fit(dataloader, evaluator)
model.save(f'{name}.pth')

dff = model.get_history()
dff.to_parquet(f'{name}.dataframe')

In [None]:
# Run: learning rate 30, 200 epochs. The weights go to `<name>.pth` and the
# per-epoch history table to `<name>.dataframe`.
name = './nsample20_200_30'

model = LogisticRegression(learning_rate=30, n_iterations=200)
model.fit(dataloader, evaluator)
model.save(f'{name}.pth')

dff = model.get_history()
dff.to_parquet(f'{name}.dataframe')

In [None]:
# Run: learning rate 0.5, 300 epochs. The weights go to `<name>.pth` and the
# per-epoch history table to `<name>.dataframe`.
# NOTE(review): the file name says "nsample50", but the only dataloader built in
# the visible cells uses passages_per_query=20 — confirm which setting produced
# this run before trusting the label.
name = './nsample50_300_.5'

model = LogisticRegression(learning_rate=.5, n_iterations=300)
model.fit(dataloader, evaluator)
model.save(f'{name}.pth')

dff = model.get_history()
dff.to_parquet(f'{name}.dataframe')

In [None]:
def _train_and_save(name, learning_rate, n_iterations):
    """Train a fresh LogisticRegression and persist its weights and history.

    Uses the notebook-level `dataloader` and `evaluator`. The model is written
    to `<name>.pth` and the history table to `<name>.dataframe`.

    Args:
        name: output path prefix shared by both files.
        learning_rate: gradient descent step size.
        n_iterations: number of training epochs.

    Returns:
        Tuple ``(model, history)`` with the trained model and its history table.
    """
    trained = LogisticRegression(learning_rate=learning_rate, n_iterations=n_iterations)
    trained.fit(dataloader, evaluator)
    trained.save(f'{name}.pth')

    history = trained.get_history()
    history.to_parquet(f'{name}.dataframe')
    return trained, history


# Three runs that differ only in learning rate; each starts from zero weights.
# The same notebook-level names as before are bound (model/dff, model2/dff2, name).
name = './debug_200_40'
model, dff = _train_and_save(name, learning_rate=40, n_iterations=200)

# ---------------------------------
name = './debug_200_0.005'
model2, dff2 = _train_and_save(name, learning_rate=0.005, n_iterations=200)

# ---------------------------------
name = './debug_200_0.02'
model2, dff2 = _train_and_save(name, learning_rate=0.02, n_iterations=200)

In [None]:
# Continue training from a stored checkpoint with learning rate 15.
# NOTE(review): no visible cell writes './debug_200_15.pth', so it has to exist
# on disk already. fit() runs n_iterations (400) further epochs on top of the
# ones restored by load() — confirm the "400" in the output name is intended.
name = './debug_400_15'

model2 = LogisticRegression(learning_rate=15, n_iterations=400)
model2.load('./debug_200_15.pth')
model2.fit(dataloader, evaluator)
model2.save(f'{name}.pth')

dff2 = model2.get_history()
dff2.to_parquet(f'{name}.dataframe')

In [None]:
import requests
def send(message):
    """Send ``message`` to a Telegram chat through the Bot API.

    Credentials are read from the ``TELEGRAM_BOT_TOKEN`` and ``TELEGRAM_CHAT_ID``
    environment variables rather than being embedded in the notebook. The token
    that used to be hard-coded here has been exposed in the file's history and
    should be revoked.

    The notification is a convenience: when credentials are missing, or the API
    answers with an error status, a warning is issued instead of an exception.
    Connection-level failures raised by ``requests`` still propagate.

    Args:
        message: text of the notification.
    """
    import os
    import warnings

    bot_token = os.environ.get('TELEGRAM_BOT_TOKEN')
    bot_chatId = os.environ.get('TELEGRAM_CHAT_ID')
    if not bot_token or not bot_chatId:
        warnings.warn("Telegram notification skipped: set TELEGRAM_BOT_TOKEN and "
                      "TELEGRAM_CHAT_ID to enable it")
        return

    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
    # Passing the values via `params` URL-encodes them, so messages containing
    # spaces, '&' or '#' arrive intact; the timeout keeps the cell from hanging.
    response = requests.get(url, params={'chat_id': bot_chatId, 'text': message},
                            timeout=10)
    if not response.ok:
        warnings.warn(f"Telegram notification failed with HTTP status {response.status_code}")


# Notify that the logistic-regression training runs have finished.
send('LR finished!')