In [47]:
import torch
import os
import datetime
import numpy as np
import pandas as pd
import math
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as Fun
from utils import price_to_log_cat, get_date_to_month_buckets, quantity_to_log_cat

In [108]:
total_tasks = 100
per_task = 10
embeddings = np.empty(0)
embedding_indices = set()

for i in tqdm(range(total_tasks)):
    emb_f = f'test_embeddings/test_embeddings_{i}.npy'
    if os.path.isfile(emb_f):
        embedding = np.load(emb_f)
        if embeddings.size == 0:
            embeddings = embedding[:,1:]
        else:
            embeddings = np.vstack((embeddings, embedding[:,1:]))
    else:
        print(f'Missing embedding file: {emb_f}')

print(f'Found {len(embedding_indices)} embeddings: {embeddings.shape}')
test_df = pd.read_csv('test.csv', index_col=0)
train_df = pd.read_csv('train.csv', index_col=0)

  0%|          | 0/100 [00:00<?, ?it/s]

Found 0 embeddings: (1000, 1536)


In [109]:
concealed_index = np.load('test_conceal_index.npy')
train_df['Creation Date'] = pd.to_datetime(train_df['Creation Date'], format='%m/%d/%Y')
date_to_month_buckets = get_date_to_month_buckets(train_df['Creation Date'].min())

In [5]:
class MultiTask_Network(torch.nn.Module):
    def __init__(self, input_dim, 
                 output_dim_0 : int = 1,
                 output_dim_1 : int = 1,
                 output_dim_2 : int = 1,
                 output_dim_3 : int = 1,
                 output_dim_4 : int = 1,
                 hidden_dim : int = 2048):
        
        super(MultiTask_Network, self).__init__()
        self.input_dim = input_dim
        self.output_dim_0 = output_dim_0
        self.output_dim_1 = output_dim_1
        self.output_dim_2 = output_dim_2
        self.output_dim_3 = output_dim_3
        self.output_dim_4 = output_dim_4
        self.hidden_dim = hidden_dim
        
        self.hidden0 = torch.nn.Linear(self.input_dim, self.hidden_dim)
        self.hidden1 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.hidden2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.final_0 = torch.nn.Linear(self.hidden_dim, self.output_dim_0)
        self.final_1 = torch.nn.Linear(self.hidden_dim, self.output_dim_1)
        self.final_2 = torch.nn.Linear(self.hidden_dim, self.output_dim_2)
        self.final_3 = torch.nn.Linear(self.hidden_dim, self.output_dim_3)
        self.final_4 = torch.nn.Linear(self.hidden_dim, self.output_dim_4)
        
    def forward(self, x : torch.Tensor, task_id : int):
        x = self.hidden0(x)
        x = torch.relu(x)
        x = self.hidden1(x)
        x = torch.relu(x)
        x = self.hidden2(x)
        x = torch.relu(x)
        if task_id == 0:
            x = self.final_0(x)
        elif task_id == 1:
            x = self.final_1(x)
        elif task_id == 2:
            x = self.final_2(x)
        elif task_id == 3:
            x = self.final_3(x)
        elif task_id == 4:
            x = self.final_4(x)
        else:
            assert False, 'Bad Task ID passed'
        
        return x

In [115]:
model = MultiTask_Network(1536,
                          output_dim_0 = 13,
                          output_dim_1 = 13,
                          output_dim_2 = 39,
                          output_dim_3 = 75,
                          output_dim_4 = 19)
model.load_state_dict(torch.load('./ckpt2'))
model.eval()

MultiTask_Network(
  (hidden0): Linear(in_features=1536, out_features=2048, bias=True)
  (hidden1): Linear(in_features=2048, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=2048, bias=True)
  (final_0): Linear(in_features=2048, out_features=13, bias=True)
  (final_1): Linear(in_features=2048, out_features=13, bias=True)
  (final_2): Linear(in_features=2048, out_features=39, bias=True)
  (final_3): Linear(in_features=2048, out_features=75, bias=True)
  (final_4): Linear(in_features=2048, out_features=19, bias=True)
)

In [116]:
concealed_entries = []
actual_bin = []
actual_bin_value = []
pred_bin = []
pred_bin_value = []
correct = []
for i in range(1000):
    ci = concealed_index[i]
    pred = int(model(torch.from_numpy(embeddings[i:i+1]).float().squeeze(), task_id=ci).argmax())
    task_correct = [0]*5
    task_total = [0]*5
    if ci == 0:
        # Total Price
        if pred == 0:
            total_price = 'NaN'
        elif pred == 1:
            total_price = 'negative'
        elif pred == 2:
            total_price = '$0 - $1'
        else:
            total_price = f'${10**(pred-3)} - ${10**(pred-2)}'
        actual = test_df.iloc[i]["Total Price"]
        y = price_to_log_cat(actual)
        concealed_entries.append('Total Price')
        # print(f'Concealed entry = Total Price. Actual: bin {y} ({actual}) Predicted: bin {pred} ({total_price})')
    elif ci == 1:
        # Unit Price
        if pred == 0:
            unit_price = 'NaN'
        elif pred == 1:
            unit_price = 'negative'
        elif pred == 2:
            unit_price = '$0 - $1'
        else:
            unit_price = f'${10**(pred-3)} - ${10**(pred-2)}'
        actual = test_df.iloc[i]["Unit Price"]
        y = price_to_log_cat(actual)
        concealed_entries.append('Unit Price')
        # print(f'Concealed entry = Unit Price. Actual: bin {y} ({actual}) Predicted: bin {pred} ({unit_price})')
    elif ci == 2:
        # Creation Date
        if pred == 0:
            creation_date = 'NaN'
        elif pred == 1:
            creation_date = 'before 2012-07-02'
        elif pred == 2:
            date_start = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=0)).strftime('%m/%d/%Y')
            date_end = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=15)).strftime('%m/%d/%Y')
            creation_date = f'{date_start} - {date_end}'
        else:
            month_delta = pred - 2
            date_start = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=month_delta*30-15)).strftime('%m/%d/%Y')
            date_end = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=month_delta*30+15)).strftime('%m/%d/%Y')
            creation_date = f'{date_start} - {date_end}'
        actual = test_df.iloc[i]["Creation Date"]
        y = date_to_month_buckets(pd.to_datetime(actual, format='%m/%d/%Y'))
        concealed_entries.append('Creation Date')
        # print(f'Concealed entry = Creation Date. Actual: bin {y} ({actual}) Predicted: bin {pred} ({creation_date})')
    elif ci == 3:
        # Purchase Date
        if pred == 0:
            purchase_date = 'NaN'
        elif pred == 1:
            purchase_date = 'before 2012-07-02'
        elif pred == 2:
            date_start = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=0)).strftime('%m/%d/%Y')
            date_end = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=15)).strftime('%m/%d/%Y')
            purchase_date = f'{date_start} - {date_end}'
        else:
            month_delta = pred - 2
            date_start = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=month_delta*30-15)).strftime('%m/%d/%Y')
            date_end = pd.Timestamp(train_df['Creation Date'].min() + datetime.timedelta(days=month_delta*30+15)).strftime('%m/%d/%Y')
            purchase_date = f'{date_start} - {date_end}'
        actual = test_df.iloc[i]["Creation Date"]
        y = date_to_month_buckets(pd.to_datetime(actual, format='%m/%d/%Y'))
        concealed_entries.append('Purchase Date')
        # print(f'Concealed entry = Purchase Date. Actual: bin {y} ({actual}) Predicted: bin {pred} ({purchase_date})')
    elif ci == 4:
        # Quantity
        if pred == 0:
            quantity = 'NaN'
        elif pred == 1:
            quantity = '<=1'
        else:
            quantity = f'{2**(pred-2)} - {2**(pred-1)}'
        actual = test_df.iloc[i]["Quantity"]
        y = quantity_to_log_cat(actual)
        concealed_entries.append('Quantity')
        # print(f'Concealed entry = Quantity. Actual: bin {y} ({actual}) Predicted: bin {pred} ({quantity})')
    else:
        raise Exception
    if ci == 2 or ci == 3:
        if abs(pred-y) <= 1:
            correct.append(1)
        else:
            correct.append(0)
    else:
        if pred == y:
            correct.append(1)
        else:
            correct.append(0)
    actual_bin.append(y)
    actual_bin_value.append(str(actual))
    pred_bin.append(pred)
    if ci == 0:
        pred_bin_value.append(total_price)
    elif ci == 1:
        pred_bin_value.append(unit_price)
    elif ci == 2:
        pred_bin_value.append(creation_date)
    elif ci == 3:
        pred_bin_value.append(purchase_date)
    elif ci == 4:
        pred_bin_value.append(quantity)

In [117]:
sum(correct)/1000

0.7

In [112]:
test_df.insert(0, "correct", correct, True)
test_df.insert(0, "pred_bin_value", pred_bin_value, True)
test_df.insert(0, "pred_bin", pred_bin, True)
test_df.insert(0, "actual_bin_value", actual_bin_value, True)
test_df.insert(0, "actual_bin", actual_bin, True)
test_df.insert(0, "concealed_entries", concealed_entries, True)

In [114]:
test_df.to_csv('test_.csv', sep=',')