# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import feature_extraction
import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os

RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### Preprocessing

#### Preprocess user/item IDs, compensation, early_access, and time

In [2]:
# Ordinal encoders turning raw user/game ID strings into int32 indices.
# min_frequency=5 asks scikit-learn to group categories seen fewer than 5 times;
# the user encoder additionally maps IDs unseen at fit time to the value 6710.
user_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5, handle_unknown='use_encoded_value', unknown_value=6710)
item_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)

itemset = set() # Set of all unique items
userset = set() # Set of all unique users
U = defaultdict(set) # game ID -> set of user IDs (rebuilt by process_data)
I = defaultdict(set) # user ID -> set of game IDs (rebuilt by process_data)

ft = ['early_access', 'compensation'] # features unavailable/cannot be approximated in inference
def read_json(path):
    """Yield one record per line from a gzip-compressed file of Python literals.

    The first line of the file is read and discarded; every following line is
    evaluated and yielded in file order.

    path: location of the gzip file to read.
    Yields: the object each line evaluates to (dicts for the training data).
    """
    # The context manager closes the handle when the generator is exhausted or
    # closed early; previously the file was opened and never closed.
    with gzip.open(path) as f:
        f.readline()
        for line in f:
            # NOTE(review): eval() runs arbitrary code found in the file. Only
            # use this on trusted data; consider ast.literal_eval otherwise.
            yield eval(line)

# Encode userID and itemID as integers
def process_data():
    """Load train.json.gz into a DataFrame and prepare it for training.

    Side effects: rebinds the module globals itemset, userset, U (game ID ->
    set of user IDs) and I (user ID -> set of game IDs), and fits user_oe and
    item_oe on the raw ID columns.

    Returns: (df, time_label) where df carries the encoded userIDX/itemIDX
    columns and integer early_access/compensation flags, and time_label is the
    'hours_transformed' column of df.
    """
    global itemset, userset, U, I

    df: pd.DataFrame = pd.DataFrame(list(read_json('train.json.gz')))

    itemset = set(df['gameID'].unique())
    userset = set(df['userID'].unique())

    # Interaction lookups in both directions, stored as plain sets.
    users_by_game = df.groupby('gameID')['userID'].unique()
    games_by_user = df.groupby('userID')['gameID'].unique()
    U = {game: set(users) for game, users in users_by_game.items()}
    I = {user: set(games) for user, games in games_by_user.items()}

    # Integer indices are fitted on the raw IDs before the column is renamed.
    df['userIDX'] = user_oe.fit_transform(df[['userID']])
    df['itemIDX'] = item_oe.fit_transform(df[['gameID']])
    df.rename({'gameID': 'itemID'}, axis=1, inplace=True)

    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)

    # Missing values become 0; any other compensation value collapses to 1.
    df.fillna(value=0, axis=1, inplace=True)

    def _compensation_flag(value):
        return value if value == 0 else 1

    df['compensation'] = df['compensation'].map(_compensation_flag)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)

    return df, df['hours_transformed']

# Load the training frame, then average the ft columns per encoded user and per
# encoded item so they can stand in for those features at inference time.
df, time_label = process_data()
user_mean = df.groupby('userIDX')[ft].mean()
item_mean = df.groupby('itemIDX')[ft].mean()

In [4]:
# Remove the ft columns (already summarised in user_mean/item_mean) together
# with the 'hours_transformed' and 'found_funny' columns, then preview the frame.
df.drop(labels=ft + ['hours_transformed', 'found_funny'], axis=1, inplace=True)
df.head()

Unnamed: 0,userID,text,itemID,userIDX,itemIDX
0,u70666506,If you want to sit in queue for 10-20min and h...,g49368897,4740,1209
1,u18612571,I was really not a fan of the gameplay. Games ...,g73495588,1240,1800
2,u34283088,Vaas Montenegro is the reason why you should g...,g68047320,2314,1652
3,u16220374,"8/10 Wonderful game, simple controls and platf...",g51234623,1067,1244
4,u01499286,Never knew a guns had THAT many parts!,g25723374,92,609


#### Preprocess user text and convert to descriptors

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def get_text_embedding():
    """Return one topic descriptor per itemIDX, computing and caching if needed.

    When ./text_embed.npy exists it is loaded and returned unchanged. Otherwise
    the review texts of each itemIDX are joined into one document, converted to
    binary unigram/bigram counts, reduced with a 20-component LDA, saved to
    text_embed.npy and returned.
    """
    if os.path.isfile('./text_embed.npy'):
        # Text descriptors already computed
        return np.load('./text_embed.npy')

    # One concatenated document per item.
    item_docs = df.groupby('itemIDX')['text'].apply(' '.join).reset_index()
    vectorizer = feature_extraction.text.CountVectorizer(
        min_df=0.01,
        max_df=0.5,
        stop_words='english',
        max_features=2000,
        ngram_range=(1, 2),
        binary=True,
    )
    term_counts = vectorizer.fit_transform(item_docs['text'])

    lda = LatentDirichletAllocation(n_components=20, random_state=RANDOM_SEED)
    topic_mix = lda.fit_transform(term_counts)
    np.save('text_embed.npy', topic_mix)
    return topic_mix

# Fetch the per-item descriptors and scale every row to unit L2 norm.
text_embed = get_text_embedding()
text_embed = text_embed / np.linalg.norm(text_embed, axis=1)[...,None]

# The raw review text is no longer needed once the descriptors exist.
df.drop('text', axis=1, inplace=True)


In [6]:
# Positional split: the first 150000 rows train the model, the rest validate it.
df_played_train = df.iloc[:150000]
df_played_valid = df.iloc[150000:]

In [7]:
len(df.columns) + len(user_mean.columns) + len(item_mean.columns) + text_embed.shape[1]

28

#### Played dataset

In [8]:
# Make float64 the default floating-point dtype for newly created tensors and
# parameters. torch.set_default_tensor_type is deprecated in recent PyTorch
# releases; set_default_dtype is the supported way to get the same default.
torch.set_default_dtype(torch.float64)
class PlayedDataset(Dataset):
    """Pairs every observed (user, item) row with a randomly drawn negative item.

    Each sample is a (positive, negative) tuple of float64 tensors laid out as
    [userIDX, itemIDX, user_mean row, item_mean row, text_embed row]. The
    negative differs from the positive only in the item-dependent entries.

    Relies on the module-level itemset, I, item_oe, user_mean, item_mean and
    text_embed objects.
    """

    def __init__(self, df) -> None:
        """df: frame with at least userID, userIDX and itemIDX columns."""
        super().__init__()
        self.df = df

    def __len__(self):
        return len(self.df)

    def _build_features(self, userIDX, itemIDX):
        """Assemble the float32 feature vector for one (user index, item index)."""
        return np.concatenate((
            np.array([userIDX, itemIDX], dtype=np.float32),
            user_mean.iloc[userIDX].to_numpy().astype(np.float32),
            item_mean.iloc[itemIDX].to_numpy().astype(np.float32),
            text_embed[itemIDX].astype(np.float32),
        ))

    def __getitem__(self, index):
        """Return (positive, negative) feature tensors for row `index`."""
        row = self.df.iloc[index]
        # Read the index columns by name; the previous positional slice
        # (row[2:]) silently depended on the frame's column order.
        userIDX = row['userIDX']
        itemIDX = row['itemIDX']
        # Negative item: drawn uniformly from the items this user has no
        # recorded interaction with, then encoded like the training items.
        negaID = random.choice(tuple(itemset.difference(I[row['userID']])))
        negaIDX = item_oe.transform([[negaID]])[0][0]
        pos = self._build_features(userIDX, itemIDX)
        neg = self._build_features(userIDX, negaIDX)
        return torch.from_numpy(pos).to(dtype=torch.float64), torch.from_numpy(neg).to(dtype=torch.float64)

# Training dataset; display the first (positive, negative) sample as a check.
played_ds = PlayedDataset(df_played_train)
played_ds[0]

(tensor([4.7400e+03, 1.2090e+03, 2.1429e-01, 0.0000e+00, 5.7895e-01, 2.9240e-02,
         2.5428e-01, 6.9249e-01, 1.2759e-04, 1.2759e-04, 3.9793e-01, 1.2759e-04,
         1.2759e-04, 1.2759e-04, 3.9421e-01, 1.2759e-04, 1.2759e-04, 1.2759e-04,
         1.2759e-04, 3.7154e-01, 2.6741e-02, 1.9906e-02, 5.3868e-02, 1.2759e-04,
         1.2759e-04, 1.2759e-04]),
 tensor([4.7400e+03, 1.1900e+02, 2.1429e-01, 0.0000e+00, 0.0000e+00, 3.3898e-02,
         1.8379e-04, 4.6149e-01, 1.8379e-04, 1.5461e-01, 1.8379e-04, 1.8379e-04,
         1.8379e-04, 8.3567e-01, 2.5452e-01, 1.8379e-04, 1.8379e-04, 1.8379e-04,
         1.8379e-04, 1.8379e-04, 1.8379e-04, 1.8379e-04, 1.8379e-04, 1.8379e-04,
         1.8379e-04, 1.8379e-04]))

### Model Definition

In [9]:
class FactorizationMachine(nn.Module):
    """Factorization machine over a user index, an item index and dense features.

    The score is the sum of optional first-order terms (user, item and feature
    weights) and second-order interactions between the user, item and feature
    latent vectors.
    """

    def __init__(self, n_user, n_item, n_feature, latent_dim, weight=True) -> None:
        """
        n_user: Number of unique users
        n_item: Number of unique items
        n_feature: Number of extra features to use
        latent_dim: Dimension of latent representations of users/items/features
        weight: Whether forward() adds the first-order (linear) terms
        """
        super().__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.n_feature = n_feature
        self.latent_dim = latent_dim
        self.weight = weight
        self.user_latent = nn.Embedding(n_user, latent_dim)
        self.item_latent = nn.Embedding(n_item, latent_dim)
        if self.n_feature > 0:
            self.feat_latent = nn.Parameter(torch.randn(n_feature, latent_dim), requires_grad=True)
            self.feat_weight = nn.Linear(n_feature, 1)
        if self.weight:
            self.user_weight = nn.Embedding(n_user, 1)
            self.item_weight = nn.Embedding(n_item, 1)
        # "alpha" or "w_0" term will be absorbed into feat_weight linear's bias

    def forward(self, x) -> torch.Tensor:
        """
        Input shape: batch_size x (user idx, item idx, features) - 2 dimensional.
        A 1-dimensional input is treated as a batch of one.
        Returns: n x 1 tensor of predictions
        """
        if x.dim() == 1:
            x = x.unsqueeze(0)
        users = x[:, 0].to(dtype=torch.long)
        items = x[:, 1].to(dtype=torch.long)
        u_embed = self.user_latent(users)
        i_embed = self.item_latent(items)

        # Allocate the accumulator next to the parameters and in their dtype,
        # instead of relying on a module-level `device` name and the process
        # default dtype.
        out = torch.zeros((x.size(0), 1), dtype=u_embed.dtype, device=u_embed.device)

        # f(u, i) = w_0 + \sum_{j=1}^{d} w_j * x_j
        if self.n_feature > 0 and self.weight:
            out = out + self.feat_weight(x[:, 2:])
        if self.weight:
            out = out + self.user_weight(users)
            out = out + self.item_weight(items)

        # Pairwise interactions, starting with user <-> item.
        out = out + (u_embed * i_embed).sum(dim=1, keepdim=True)
        if self.n_feature > 0:
            xfeature = x[:, 2:]
            # user <-> feature and item <-> feature interactions
            out = out + ((u_embed @ self.feat_latent.T) * xfeature).sum(dim=1, keepdim=True)
            out = out + ((i_embed @ self.feat_latent.T) * xfeature).sum(dim=1, keepdim=True)
            # feature <-> feature interactions via 0.5 * (S_1^2 - S_2)
            out_1 = torch.matmul(xfeature, self.feat_latent).pow(2).sum(1, keepdim=True)  # S_1^2
            out_2 = torch.matmul(xfeature.pow(2), self.feat_latent.pow(2)).sum(1, keepdim=True)  # S_2
            out = out + 0.5 * (out_1 - out_2)
        return out


In [10]:
# Embedding tables are sized by the number of distinct raw user/item IDs.
# The 24 extra features are 2 user means + 2 item means + 20 text topics;
# the final False disables the first-order (linear) terms of the model.
played_model = FactorizationMachine(len(df['userID'].unique()), len(df['itemID'].unique()), 24, 32, False).to(device) # 24 features

# Optional warm start from a previously saved checkpoint (currently disabled).
# if os.path.isfile('./played_model.pt'):
#     played_model.load_state_dict(torch.load('./played_model.pt'))
optimizer = optim.SGD(played_model.parameters(), lr=0.01)
def BPRLoss(pos_score, neg_score):
    """Bayesian Personalized Ranking loss summed over the batch.

    pos_score: scores of the observed (positive) pairs.
    neg_score: scores of the sampled negative pairs, same shape as pos_score.
    Returns: scalar tensor, sum of -log(sigmoid(pos_score - neg_score)).
    """
    # logsigmoid is the numerically stable form of sigmoid().log(): the latter
    # underflows to log(0) = -inf once a negative outscores its positive by a
    # large margin, which turns the loss and its gradients into inf/nan.
    return -nn.functional.logsigmoid(pos_score - neg_score).sum()

In [11]:
# Mini-batch size and the number of batches between progress reports in epoch().
batch_sz=20
print_iter=500
# NOTE(review): shuffle=False makes every epoch visit the training rows in the
# same order; confirm this is intended for SGD training.
played_dl = DataLoader(dataset=played_ds,
                       batch_size=batch_sz,
                       shuffle=False, num_workers=2)
# Validation pairs come from the held-out tail of the frame.
played_valid_ds = PlayedDataset(df_played_valid)
played_valid_dl = DataLoader(dataset=played_valid_ds,
                             batch_size=batch_sz,
                             num_workers=2)

In [12]:
def validate():
    """Print a ranking score for played_model on the validation pairs.

    Every validation positive and its sampled negative are scored. All scores
    are sorted ascending; the printed value is the share of the top
    len(played_valid_ds) ranks that are held by positives.
    """
    n_valid = len(played_valid_ds)
    # Empty float64 seeds keep the concatenated dtype/device as before.
    pos_chunks = [torch.zeros((0, 1), dtype=torch.float64, device=device)]
    neg_chunks = [torch.zeros((0, 1), dtype=torch.float64, device=device)]
    with torch.no_grad():
        for _, (pos, neg) in tqdm(enumerate(played_valid_dl)):
            pos_chunks.append(played_model(pos.to(device)))
            neg_chunks.append(played_model(neg.to(device)))
        # Positives occupy indices [0, n_valid), negatives follow.
        combined = torch.cat(pos_chunks + neg_chunks, dim=0)
        ranking = combined.argsort(dim=0)
        top_ranked = ranking[n_valid:]
        print(torch.sum(top_ranked < n_valid) / n_valid)
def epoch():
    """Run one pass over played_dl, updating played_model with BPR loss.

    Prints the running loss and the fraction of pairs where the positive
    outscores the negative (labelled "AUC") after the first batch and then
    every print_iter batches, resetting both accumulators after each report.
    """
    running_loss = 0.0
    running_auc = 0.0
    for i, (pos, neg) in tqdm(enumerate(played_dl)):
        pos_score = played_model(pos.to(device))
        neg_score = played_model(neg.to(device))
        loss = BPRLoss(pos_score, neg_score)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate plain Python floats: adding the loss tensor itself keeps
        # every batch's autograd graph alive until the next reset.
        running_loss += loss.item()
        # Divide by the actual batch size so a short final batch is not
        # under-counted.
        running_auc += (pos_score > neg_score).sum().item() / pos_score.size(0)

        if i == 0:
            print(f"{i}/{len(played_dl)}")
            print(f"loss: {running_loss}")
            print(f"AUC: {running_auc}")
        elif (i + 1) % print_iter == 0:
            print(f"{i}/{len(played_dl)}")
            print(f"loss: {running_loss / print_iter}")
            running_loss = 0.0
            print(f"AUC: {running_auc / print_iter}")
            running_auc = 0.0

In [None]:
# Upper bound on the number of training epochs.
EPOCHS = 100

# Alternate one training pass with one validation report per epoch.
for i in range(EPOCHS):
    print(f"Epoch: {i} / {EPOCHS}")
    epoch()
    print(f'-----EPOCH {i} VALIDATION ACC %-----')
    validate()

Epoch: 0 / 100


6it [00:00, 10.17it/s]

0/7500
loss: 186.02142484397862
AUC: 0.6000000000000001


502it [00:23, 21.59it/s]

499/7500
loss: 130.07783733898788
AUC: 0.5156000000000004


1004it [00:46, 21.45it/s]

999/7500
loss: 121.66597074446067
AUC: 0.5150000000000001


1504it [01:09, 21.48it/s]

1499/7500
loss: 113.20636922607457
AUC: 0.5349000000000004


2004it [01:32, 21.51it/s]

1999/7500
loss: 108.90708467598601
AUC: 0.5446999999999997


2502it [01:55, 21.55it/s]

2499/7500
loss: 103.01829962342421
AUC: 0.5562000000000001


3004it [02:18, 21.76it/s]

2999/7500
loss: 94.93251507642938
AUC: 0.5810999999999998


3504it [02:40, 21.37it/s]

3499/7500
loss: 91.86766159722688
AUC: 0.5908000000000002


4004it [03:03, 21.30it/s]

3999/7500
loss: 88.53235479680569
AUC: 0.6010999999999999


4504it [03:26, 21.18it/s]

4499/7500
loss: 87.74503738745075
AUC: 0.6116999999999998


5004it [03:49, 22.00it/s]

4999/7500
loss: 83.25707562650251
AUC: 0.6264999999999997


5504it [04:12, 22.09it/s]

5499/7500
loss: 82.12260034824007
AUC: 0.6226999999999991


6003it [04:34, 22.18it/s]

5999/7500
loss: 79.81171854577501
AUC: 0.6344999999999996


6503it [04:57, 21.26it/s]

6499/7500
loss: 80.45356624903484
AUC: 0.6310999999999999


7003it [05:20, 21.95it/s]

6999/7500
loss: 72.88778276304055
AUC: 0.6477999999999996


7500it [05:43, 21.85it/s]

7499/7500
loss: 74.56478132348354
AUC: 0.6486999999999999
-----EPOCH 0 VALIDATION ACC %-----



1250it [00:56, 21.95it/s]

tensor(0.6046, device='cuda:0')
Epoch: 1 / 100



5it [00:00, 15.98it/s]

0/7500
loss: 39.81037464110209
AUC: 0.75


503it [00:23, 21.37it/s]

499/7500
loss: 71.27638036174693
AUC: 0.6458000000000008


1003it [00:45, 22.25it/s]

999/7500
loss: 66.52061881961136
AUC: 0.6628000000000004


1504it [01:08, 22.20it/s]

1499/7500
loss: 67.4157700780415
AUC: 0.6622000000000002


2002it [01:31, 21.10it/s]

1999/7500
loss: 68.07274493416294
AUC: 0.6571999999999999


2504it [01:53, 22.07it/s]

2499/7500
loss: 62.893953569911815
AUC: 0.6709000000000004


3004it [02:16, 21.44it/s]

2999/7500
loss: 62.54021933526082
AUC: 0.6736999999999999


3502it [02:38, 22.55it/s]

3499/7500
loss: 63.39827730045293
AUC: 0.6675000000000002


4003it [03:01, 22.30it/s]

3999/7500
loss: 61.816864279441695
AUC: 0.6698000000000001


4504it [03:23, 22.10it/s]

4499/7500
loss: 63.0125246521447
AUC: 0.6701000000000003


5004it [03:46, 21.95it/s]

4999/7500
loss: 59.71298256694915
AUC: 0.6868


5504it [04:08, 23.28it/s]

5499/7500
loss: 62.433705039445385
AUC: 0.6692999999999997


6004it [04:31, 21.75it/s]

5999/7500
loss: 61.974874061442385
AUC: 0.6623


6502it [04:54, 21.84it/s]

6499/7500
loss: 60.124769140427816
AUC: 0.6738000000000001


7003it [05:16, 22.12it/s]

6999/7500
loss: 57.467347473095025
AUC: 0.6810000000000003


7500it [05:39, 22.11it/s]

7499/7500
loss: 62.24851612680923
AUC: 0.6615999999999995
-----EPOCH 1 VALIDATION ACC %-----



1250it [00:57, 21.77it/s]

tensor(0.6237, device='cuda:0')
Epoch: 2 / 100



4it [00:00, 16.14it/s]

0/7500
loss: 29.80728405995626
AUC: 0.75


505it [00:23, 23.79it/s]

499/7500
loss: 55.978709546671915
AUC: 0.6771000000000004


1003it [00:45, 21.75it/s]

999/7500
loss: 54.48592064515425
AUC: 0.6850999999999994


1502it [01:08, 22.20it/s]

1499/7500
loss: 56.04498695498751
AUC: 0.681


2003it [01:31, 22.46it/s]

1999/7500
loss: 55.14353010935851
AUC: 0.6877999999999994


2501it [01:54, 23.51it/s]

2499/7500
loss: 53.8372165141339
AUC: 0.6830000000000003


3004it [02:17, 20.73it/s]

2999/7500
loss: 51.91164155265853
AUC: 0.6910000000000004


3503it [02:39, 22.17it/s]

3499/7500
loss: 52.62693345041365
AUC: 0.6789999999999999


4004it [03:02, 22.36it/s]

3999/7500
loss: 51.83880832366421
AUC: 0.6843000000000001


4504it [03:25, 23.19it/s]

4499/7500
loss: 52.52898934234105
AUC: 0.6816


5004it [03:47, 22.03it/s]

4999/7500
loss: 50.16973784284242
AUC: 0.6948999999999996


5504it [04:10, 22.13it/s]

5499/7500
loss: 52.086988772351646
AUC: 0.6858000000000004


6005it [04:32, 23.67it/s]

5999/7500
loss: 50.66720650731717
AUC: 0.6896


6503it [04:55, 22.46it/s]

6499/7500
loss: 52.40658549756572
AUC: 0.6798000000000006


7004it [05:18, 21.41it/s]

6999/7500
loss: 50.10495459343165
AUC: 0.6879000000000001


7500it [05:40, 22.02it/s]

7499/7500
loss: 50.13549989647938
AUC: 0.6848999999999996
-----EPOCH 2 VALIDATION ACC %-----



1250it [00:57, 21.74it/s]

tensor(0.6404, device='cuda:0')
Epoch: 3 / 100



4it [00:00, 14.60it/s]

0/7500
loss: 36.127075604246286
AUC: 0.7000000000000001


503it [00:23, 22.77it/s]

499/7500
loss: 48.325197699672394
AUC: 0.6861999999999997


1004it [00:46, 21.15it/s]

999/7500
loss: 46.60271669804402
AUC: 0.6940999999999992


1504it [01:08, 20.99it/s]

1499/7500
loss: 45.97633755307365
AUC: 0.6967000000000008


2004it [01:31, 21.52it/s]

1999/7500
loss: 47.0424916586093
AUC: 0.6843000000000001


2503it [01:54, 22.60it/s]

2499/7500
loss: 44.478657348618604
AUC: 0.6983999999999996


3003it [02:17, 22.61it/s]

2999/7500
loss: 43.3145274966946
AUC: 0.6984


3504it [02:39, 22.72it/s]

3499/7500
loss: 45.62272525460078
AUC: 0.6874999999999993


4002it [03:02, 22.18it/s]

3999/7500
loss: 44.017313713661295
AUC: 0.6964000000000002


4501it [03:24, 22.11it/s]

4499/7500
loss: 44.34204348486751
AUC: 0.6937000000000001


5004it [03:47, 21.84it/s]

4999/7500
loss: 43.779066142877355
AUC: 0.6941000000000008


5503it [04:10, 22.94it/s]

5499/7500
loss: 44.53795480030637
AUC: 0.6948999999999995


6004it [04:32, 21.84it/s]

5999/7500
loss: 42.643765614736374
AUC: 0.6968000000000005


6504it [04:55, 21.70it/s]

6499/7500
loss: 42.8490138259529
AUC: 0.6947000000000002


7002it [05:18, 20.13it/s]

6999/7500
loss: 42.874479708704705
AUC: 0.6893999999999991


7500it [05:40, 22.02it/s]

7499/7500
loss: 44.1272217486266
AUC: 0.6807000000000003
-----EPOCH 3 VALIDATION ACC %-----



1250it [00:57, 21.85it/s]

tensor(0.6326, device='cuda:0')
Epoch: 4 / 100



4it [00:00, 15.99it/s]

0/7500
loss: 34.552996359963004
AUC: 0.7000000000000001


502it [00:23, 22.19it/s]

499/7500
loss: 41.31839591476288
AUC: 0.6976999999999997


1004it [00:45, 21.83it/s]

999/7500
loss: 39.70302308970494
AUC: 0.6992999999999999


1504it [01:08, 23.12it/s]

1499/7500
loss: 39.35302791462109
AUC: 0.7056000000000003


2003it [01:31, 21.76it/s]

1999/7500
loss: 39.83796396455251
AUC: 0.7026999999999997


2503it [01:54, 21.27it/s]

2499/7500
loss: 39.181394039697516
AUC: 0.7043000000000008


3004it [02:17, 22.87it/s]

2999/7500
loss: 37.93935906785409
AUC: 0.7077999999999999


3502it [02:39, 22.22it/s]

3499/7500
loss: 37.94318002097369
AUC: 0.7008999999999996


4004it [03:02, 22.19it/s]

3999/7500
loss: 38.09571717114601
AUC: 0.7064000000000002


4502it [03:25, 22.06it/s]

4499/7500
loss: 39.849603159941054
AUC: 0.6958999999999995


5002it [03:47, 20.86it/s]

4999/7500
loss: 37.79995462310443
AUC: 0.7087999999999998


5504it [04:10, 22.24it/s]

5499/7500
loss: 38.21511332612021
AUC: 0.6946


6004it [04:33, 21.49it/s]

5999/7500
loss: 36.7092746729963
AUC: 0.7086000000000001


6503it [04:55, 22.60it/s]

6499/7500
loss: 38.52975104105417
AUC: 0.6970000000000004


7001it [05:18, 21.58it/s]

6999/7500
loss: 35.64806044959496
AUC: 0.7074999999999999


7500it [05:41, 21.98it/s]

7499/7500
loss: 37.51221204659701
AUC: 0.7004999999999999
-----EPOCH 4 VALIDATION ACC %-----



1250it [00:57, 21.76it/s]

tensor(0.6225, device='cuda:0')
Epoch: 5 / 100



4it [00:00, 16.36it/s]

0/7500
loss: 28.776236853780304
AUC: 0.75


501it [00:23, 20.68it/s]

499/7500
loss: 36.80494930262551
AUC: 0.6986


1003it [00:46, 21.56it/s]

999/7500
loss: 34.54640669544318
AUC: 0.7088999999999999


1503it [01:08, 23.51it/s]

1499/7500
loss: 34.28665415211216
AUC: 0.7088000000000001


2004it [01:31, 21.13it/s]

1999/7500
loss: 34.587494528817
AUC: 0.7032000000000002


2504it [01:54, 21.56it/s]

2499/7500
loss: 33.890476315585275
AUC: 0.7055999999999998


3002it [02:16, 20.71it/s]

2999/7500
loss: 33.2493469148613
AUC: 0.7130999999999998


3504it [02:39, 21.59it/s]

3499/7500
loss: 34.21984659155445
AUC: 0.7132000000000005


4003it [03:02, 22.34it/s]

3999/7500
loss: 33.0345187789148
AUC: 0.7208000000000001


4504it [03:25, 22.44it/s]

4499/7500
loss: 33.992669882011896
AUC: 0.6984999999999996


5004it [03:47, 21.93it/s]

4999/7500
loss: 31.18645666196381
AUC: 0.7252999999999998


5502it [04:10, 22.07it/s]

5499/7500
loss: 33.29838702560942
AUC: 0.7083000000000004


6002it [04:33, 21.79it/s]

5999/7500
loss: 33.07072688486383
AUC: 0.7103000000000006


6504it [04:55, 22.31it/s]

6499/7500
loss: 31.851292728404175
AUC: 0.7108999999999995


7001it [05:18, 21.78it/s]

6999/7500
loss: 31.962673807356712
AUC: 0.7176999999999997


7500it [05:40, 22.00it/s]

7499/7500
loss: 31.489984743219043
AUC: 0.7151000000000003
-----EPOCH 5 VALIDATION ACC %-----



1250it [00:57, 21.67it/s]

tensor(0.6396, device='cuda:0')
Epoch: 6 / 100



4it [00:00, 14.68it/s]

0/7500
loss: 13.91748123092309
AUC: 0.75


503it [00:23, 21.91it/s]

499/7500
loss: 31.86166607626414
AUC: 0.7083000000000005


1002it [00:46, 21.81it/s]

999/7500
loss: 31.51255852720766
AUC: 0.7125000000000002


1504it [01:08, 22.42it/s]

1499/7500
loss: 31.890976531667146
AUC: 0.7130000000000005


2001it [01:31, 21.72it/s]

1999/7500
loss: 31.023348193148106
AUC: 0.7072999999999996


2504it [01:54, 22.72it/s]

2499/7500
loss: 29.396414907125546
AUC: 0.7266999999999998


3004it [02:17, 22.40it/s]

2999/7500
loss: 28.489268188954703
AUC: 0.7296999999999993


3503it [02:39, 23.49it/s]

3499/7500
loss: 30.09806865254054
AUC: 0.7101


4004it [03:02, 22.31it/s]

3999/7500
loss: 29.777975535550404
AUC: 0.7165999999999995


4502it [03:24, 22.13it/s]

4499/7500
loss: 29.988361847322892
AUC: 0.7144999999999999


5003it [03:47, 22.09it/s]

4999/7500
loss: 29.025272908508178
AUC: 0.7172


5504it [04:10, 22.67it/s]

5499/7500
loss: 30.197636848356396
AUC: 0.7078999999999988


6002it [04:32, 21.88it/s]

5999/7500
loss: 27.991775738755972
AUC: 0.7200999999999997


6504it [04:55, 21.23it/s]

6499/7500
loss: 29.190779080097716
AUC: 0.7133000000000004


7003it [05:18, 21.24it/s]

6999/7500
loss: 28.772295272418756
AUC: 0.7112999999999993


7500it [05:40, 22.02it/s]

7499/7500
loss: 28.50639593387053
AUC: 0.7165000000000001
-----EPOCH 6 VALIDATION ACC %-----



1250it [00:56, 21.95it/s]

tensor(0.6361, device='cuda:0')
Epoch: 7 / 100



4it [00:00, 16.47it/s]

0/7500
loss: 33.39602730937887
AUC: 0.8


503it [00:23, 22.07it/s]

499/7500
loss: 27.483237849141528
AUC: 0.7181999999999987


1004it [00:46, 21.43it/s]

999/7500
loss: 28.146619586332534
AUC: 0.7201999999999992


1501it [01:08, 22.22it/s]

1499/7500
loss: 26.077978534770157
AUC: 0.7246999999999999


2003it [01:31, 22.38it/s]

1999/7500
loss: 26.91964178522447
AUC: 0.7161000000000004


2504it [01:53, 23.59it/s]

2499/7500
loss: 26.872560134979782
AUC: 0.7191999999999998


3004it [02:16, 22.52it/s]

2999/7500
loss: 26.23112791260232
AUC: 0.7323000000000008


3502it [02:38, 21.91it/s]

3499/7500
loss: 26.780784813272167
AUC: 0.7207999999999999


4002it [03:01, 22.12it/s]

3999/7500
loss: 26.03725969410038
AUC: 0.7267999999999998


4503it [03:24, 21.98it/s]

4499/7500
loss: 25.755328917225015
AUC: 0.7211


5003it [03:47, 20.62it/s]

4999/7500
loss: 25.5132328983401
AUC: 0.7246


5503it [04:09, 22.65it/s]

5499/7500
loss: 26.040548822074317
AUC: 0.7217000000000008


6001it [04:32, 22.13it/s]

5999/7500
loss: 25.746409820387033
AUC: 0.727999999999999


6503it [04:54, 21.89it/s]

6499/7500
loss: 25.93993092584236
AUC: 0.7155


7003it [05:17, 21.55it/s]

6999/7500
loss: 24.869123524302683
AUC: 0.7283000000000008


7500it [05:40, 22.05it/s]

7499/7500
loss: 25.546991378123792
AUC: 0.7203999999999994
-----EPOCH 7 VALIDATION ACC %-----



1250it [00:57, 21.81it/s]

tensor(0.6363, device='cuda:0')
Epoch: 8 / 100



4it [00:00, 17.09it/s]

0/7500
loss: 17.93685338598973
AUC: 0.75


503it [00:23, 21.18it/s]

499/7500
loss: 25.403896947422382
AUC: 0.7192


1001it [00:46, 22.10it/s]

999/7500
loss: 24.311325943853134
AUC: 0.7321000000000005


1503it [01:09, 21.96it/s]

1499/7500
loss: 24.30442277234133
AUC: 0.7263999999999996


2003it [01:31, 21.99it/s]

1999/7500
loss: 23.429095708628193
AUC: 0.7309000000000003


2501it [01:54, 21.35it/s]

2499/7500
loss: 24.10605952912903
AUC: 0.7262999999999996


3005it [02:17, 23.43it/s]

2999/7500
loss: 24.534293125574177
AUC: 0.7288000000000004


3504it [02:40, 21.84it/s]

3499/7500
loss: 23.2709346904723
AUC: 0.7351000000000005


4004it [03:02, 22.02it/s]

3999/7500
loss: 22.836688109070057
AUC: 0.7310999999999999


4504it [03:25, 22.40it/s]

4499/7500
loss: 23.858052828691246
AUC: 0.7320000000000005


5004it [03:48, 21.44it/s]

4999/7500
loss: 22.69629142701019
AUC: 0.7312000000000012


5502it [04:10, 21.95it/s]

5499/7500
loss: 24.281731974856573
AUC: 0.7261000000000007


6003it [04:33, 22.26it/s]

5999/7500
loss: 23.314917938558967
AUC: 0.7295000000000004


6502it [04:56, 22.22it/s]

6499/7500
loss: 24.679685966585392
AUC: 0.7197999999999998


7002it [05:18, 20.91it/s]

6999/7500
loss: 24.04435297553135
AUC: 0.7195999999999998


7500it [05:41, 21.97it/s]

7499/7500
loss: 23.449845858471505
AUC: 0.7318000000000001
-----EPOCH 8 VALIDATION ACC %-----



1250it [00:57, 21.87it/s]

tensor(0.6456, device='cuda:0')
Epoch: 9 / 100



4it [00:00, 15.29it/s]

0/7500
loss: 13.482288714394956
AUC: 0.8500000000000001


502it [00:23, 21.43it/s]

499/7500
loss: 22.638720264762537
AUC: 0.7298999999999998


972it [00:44, 21.88it/s]

In [None]:
# 2 epochs - .625
# Ranking score on the training pairs, computed the same way as in validate().
with torch.no_grad():
    n_train = len(played_ds)
    posscores = [torch.zeros((0, 1), dtype=torch.float64, device=device)]
    negscores = [torch.zeros((0, 1), dtype=torch.float64, device=device)]
    for i, (pos, neg) in tqdm(enumerate(played_dl)):
        posscores.append(played_model(pos.to(device)))
        negscores.append(played_model(neg.to(device)))
    combined = torch.cat(posscores + negscores, dim=0)
    # combined holds n_train positives followed by n_train negatives, so both
    # the rank cut-off and the positive/negative boundary must be the size of
    # the training set (the validation-set size was used here before).
    print(torch.sum(combined.argsort(dim=0)[n_train:] < n_train) / n_train)

In [None]:
# Score validation row 100 a thousand times. Each dataset access draws a new
# random negative item, so the printed value is the fraction of sampled
# negatives that the model ranks below the positive.
cnt = 0
for i in range(1000):
    pair = played_valid_ds[100]
    p1 = played_model(pair[0].to(device))
    p2 = played_model(pair[1].to(device))
    if p1 > p2:
        cnt += 1
print(cnt / 1000)

In [None]:
# Load the test pairs and give them the column layout PlayedDataset expects
# (userID, itemID, userIDX, itemIDX).
test = pd.read_csv('./pairs_Played.csv')
test['itemID'] = test['gameID']
# Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
# Encode with the encoders fitted on the training data.
test['userIDX'] = user_oe.transform(test[['userID']])
test['itemIDX'] = item_oe.transform(test[['gameID']])
test.drop(columns=['gameID', 'prediction'], inplace=True)

In [None]:
test.head()

In [None]:
# The whole test set is served as a single batch. Shuffling must stay off:
# predictions are matched back to the rows of the test frame by position, and
# shuffle=True would permute them.
played_test_ds = PlayedDataset(test)
played_test_dl = DataLoader(dataset=played_test_ds,
                           batch_size=len(played_test_ds),
                           shuffle=False, num_workers=1)

In [None]:
# Score the test pairs with the trained model. The loop previously called the
# undefined name `model`; the trained network is `played_model`.
preds = 0
with torch.no_grad():  # inference only, no autograd graph needed
    for pos, neg in played_test_dl:
        preds = played_model(pos.to(device))

In [None]:
validate()

In [None]:
played_model(played_ds[0][0].to(device))

In [None]:
played_model(played_ds[0][0].to(device))

In [None]:
played_model(played_ds[0][1].to(device))

In [None]:
played_model(played_ds[0][1].to(device))

In [None]:
torch.save(played_model.state_dict(), './played_model.pt')