In [41]:
import sys
import numpy as np
import os
import pandas as pd
import copy
import math
from sklearn.preprocessing import MinMaxScaler
from model.transformer import get_model
import torch.nn as nn

def clean_data(df):
    def median_income(df):
        df.loc[df.renta.isnull(), 'renta'] = df.renta.median(skipna=True)
        return df
    # provide median income by province
    df = df.groupby('nomprov').apply(median_income)
    df.loc[df.renta.isnull(), "renta"] = df.renta.median(skipna=True)
    # set entries whose "antiguedad" field is missing as minimum seniority
    df.antiguedad = pd.to_numeric(df.antiguedad, errors="coerce")
    df.loc[df.antiguedad.isnull(), "antiguedad"] = df.antiguedad.min()
    df.loc[df.antiguedad < 0, "antiguedad"] = 0
    df["antiguedad"] = df["antiguedad"].astype(int)
    # fix customers age
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    df["age"].fillna(df["age"].mean(), inplace=True)
    df["age"] = df["age"].astype(int)
    # fill missing field "segmento" with most frequent one
    df.loc[df["segmento"].isnull(), "segmento"] = "03 - UNIVERSITARIO"
    # normalize scalar columns
    scale_cols = ["antiguedad", "age", "renta"]
    for col in scale_cols:
        scaler = MinMaxScaler()
        df[col] = scaler.fit_transform(df[[col]])
    return df

def preprocess(input_file, y_date, seq_len=16, batch_size=32, exclude_date=None, d_model=35):
    """
    Preprocess data and split it in train and test data
    :param d_model:
    :param input_file: string, path to raw dataset, csv file
    :param y_date: string, timestamp use for testing
    :param exclude_date: list[string] timestamps to ignore
    :return: train_x, train_y (both are np.array)
    """
    months_one_hot = [0 for _ in range(12)]
    segmentation_dict = {}
    x_users, y_users = {}, {}
    df = pd.read_csv(input_file)
    df = clean_data(df)
    users = []
    for i, row in df.iterrows():
        if row['fecha_dato'] in exclude_date:
            pass
        user = row['ncodpers'] # uid
        date = row['fecha_dato'].split("-")
        year = [int(date[0] == "2016")]  # 1=2016, 0=2015 (1)
        month = copy.copy(months_one_hot)
        month[int(date[1]) - 1] = 1  # months one-hot encoded (12)
        items = list(row.values)[26:]  # items are one-hot encoded (22)
        items = [int(item) if not math.isnan(item) and item != 'NA' else 0 for item in items]
        # one-hot encode segmentation (4)
        segmentation = row['segmento']
        segmentation_array = [0, 0, 0, 0]
        if segmentation not in segmentation_dict.keys():
            segmentation_dict[segmentation] = len(segmentation_dict)
        segmentation_array[segmentation_dict[segmentation]] = 1
        # one-hot encode new-index (1)
        #new_index = [1] if row['ind_nuevo'] == 1 else [0]
        # seniority + age + income (3) - values features
        seniority = float(row['antiguedad'])
        age = float(row['age'])
        income = float(row['renta'])
        value_features = [seniority, age, income]
        # put the data together
        data = year + month + segmentation_array + value_features + items  # (42) values
        if row['fecha_dato'] == y_date and user in x_users.keys():
            y_value = [0]
            if sum(items) > 0:
                y_value = [1]
            y_users[user] = y_value
            users.append(user)
        elif user in x_users.keys():
            x_users[user] = np.vstack((x_users[user], np.array(data)))
        else:
            x_users[user] = np.array(data)

    assert len(x_users) == len(y_users)
    x_data = []
    y_data = []
    for user in users:
        if np.array(x_users[user]).shape[0] == seq_len:
            x_data.append(x_users[user].reshape((seq_len, d_model)))
        else:
            continue
        y_data.append(y_users[user])
    x_data = np.stack(x_data)
    y_data = np.stack(y_data)
    num_users = x_data.shape[0]
    x_data = x_data[:num_users - num_users % batch_size]
    y_data = y_data[:num_users - num_users % batch_size]

    return x_data, y_data

In [49]:
import torch
from tqdm import tqdm
def evaluate_one_epoch(model, criterion, dataset, device="cpu", owned_items=None):
    batch_size = 1
    generator = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size
    )
    model.eval()
    tot_loss = 0.0
    tot_prec1 = 0.0
    n_users = 0
    j = 0
    with torch.no_grad():
        for batch, labels in tqdm(generator):
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)
            loss = criterion(logits, labels)
            tot_loss += loss.item()
            recommendations = logits_to_recs(logits.detach().cpu().numpy())
            tot_prec1 += precision_k(1, labels, recommendations)
            n_users += 1 # not sure
        tot_loss /= len(dataset) // batch_size
        tot_prec1 /= n_users
        metrics_dict = {"prec1": tot_prec1}
    return tot_loss, metrics_dict

def logits_to_recs(logits):
    logits = np.squeeze(logits)
    recs = np.argsort(logits)[::-1]
    return recs

def precision_k(k, gt, preds):
    """
    :param k: int, scope of metric
    :param gt: list[int], index of ground truth recommendations
    :param preds: list[int], index of predicted recommendations
    """
    c = 0
    for p in preds[:k]:
        if p in gt:
            c += 1
    return c / k

In [42]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, train_x, train_y, nrows=None):
        if nrows is None:
            self.data = [(x, y) for x, y in zip(train_x, train_y)]
        else:
            self.data = [(x, y) for x, y in zip(train_x[:nrows], train_y[:nrows])]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x, y = self.data[index]
        x = torch.FloatTensor(x)
        y = torch.FloatTensor(y)
        return x, y

args_dataset = 'data/train_reduced.csv'
args_seq_len = 16
args_batch_size = 64
args_d_model = 42
#x_train, y_train = preprocess(args_dataset, y_date="2016-04-28", exclude_date=["2016-05-28"], seq_len=args_seq_len, batch_size=args_batch_size, d_model=args_d_model)
x_test, y_test = preprocess(args_dataset, y_date="2016-05-28", exclude_date=["2015-01-28"], seq_len=args_seq_len, batch_size=args_batch_size, d_model=args_d_model)
test_set = CustomDataset(x_test, y_test, nrows=None)

  df = pd.read_csv(input_file)


In [50]:
criterion = nn.BCEWithLogitsLoss()
args_n_items = 1 # not very sure
args_d_model = 42
args_heads = 7
args_dropout = 0.5
args_n_layers = 6
args_hidden_size = 2048
model = get_model(args_n_items, args_d_model, args_heads, args_dropout, args_n_layers, args_hidden_size, None, 'cpu')
test_loss, test_metrics = evaluate_one_epoch(model, criterion, test_set, 'cpu', owned_items=None)

100%|██████████| 19968/19968 [01:09<00:00, 289.00it/s]


In [51]:
test_metrics

{'prec1': 0.036207932692307696}

In [34]:
y_test[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [33]:
for i in range(10):
    x,y = test_set. __getitem__(i)
    print(y)

tensor([-0.0002])
tensor([1.4013e-45])
tensor([1.4013e-45])
tensor([-0.0002])
tensor([0.])
tensor([-0.0002])
tensor([0.])
tensor([1.4013e-45])
tensor([1.4013e-45])
tensor([0.])


In [40]:
for i in range(10):
    x,y = test_set.data[i]
    #print(y)
    z = [y]
    print(y, torch.FloatTensor(y))
    print(z, torch.FloatTensor(z))
    

1 tensor([1.0000])
[1] tensor([1.])
1 tensor([0.])
[1] tensor([1.])
1 tensor([inf])
[1] tensor([1.])
1 tensor([1.4013e-45])
[1] tensor([1.])
1 tensor([1.4013e-45])
[1] tensor([1.])
1 tensor([1.4013e-45])
[1] tensor([1.])
1 tensor([-1.8891e+26])
[1] tensor([1.])
1 tensor([-1.4013e-45])
[1] tensor([1.])
1 tensor([0.])
[1] tensor([1.])
1 tensor([-0.0002])
[1] tensor([1.])
