# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os

RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7efffccb8e10>

### Preprocessing

#### Preprocess user/item ID's, compensation, early_access, and time

In [2]:
user_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)
item_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)

itemset = set() # Set of all unique users
userset = set() # Set of all unique items
U = defaultdict(set)
I = defaultdict(set)

ft = ['early_access', 'hours_transformed', 'found_funny', 'compensation'] # features unavailable/cannot be approximated in inference
def read_json(path):
    f: gzip.GzipFile = gzip.open(path)
    f.readline()
    for line in f:
        entry = eval(line)
        yield entry

# Encode userID and itemID as integers
def process_data():
    global itemset, userset, U, I
    data = []
    for entry in read_json('train.json.gz'):
        data.append(entry)

    df: pd.DataFrame = pd.DataFrame(data)
    del data
    itemset = set(df['gameID'].unique())
    userset = set(df['userID'].unique())

    U = dict(df.groupby('gameID')['userID'].unique())
    I = dict(df.groupby('userID')['gameID'].unique())
    U = { g : set(U[g]) for g in U }
    I = { u : set(I[u]) for u in I }

    df['userIDX'] = user_oe.fit_transform(df[['userID']])
    df['itemIDX'] = item_oe.fit_transform(df[['gameID']])
    df.rename({'gameID' : 'itemID'}, axis=1, inplace=True)

    # Get features for time
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].map(lambda x : x.month)
    df['year'] = df['date'].map(lambda x : x.year)
    df['day_of_month'] = df['date'].map(lambda x : x.day)
    df['day_of_wk'] = df['date'].map(lambda x : x.dayofweek)
    df['day_of_yr'] = df['date'].map(lambda x : x.dayofyear)
    df['wk_of_yr'] = df['date'].map(lambda x : x.weekofyear)
    mme = preprocessing.MinMaxScaler() # Normalize time to range [0, 1]
    df[['month', 'year', 'day_of_month', 'day_of_wk', 'day_of_yr', 'wk_of_yr']] = mme.fit_transform(df[['month', 'year', 'day_of_month', 'day_of_wk', 'day_of_yr', 'wk_of_yr']])
    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)

    # Use Fourier features to help with representating cyclic nature of time
    for time_unit in [ 'month', 'day_of_month', 'day_of_wk', 'day_of_yr', 'wk_of_yr']:
        df[time_unit + '_cos'] = df[time_unit].map(lambda x: np.cos(x * 2 * np.pi))
        df[time_unit + '_sin'] = df[time_unit].map(lambda x: np.sin(x * 2 * np.pi))


    # Get features that won't be available
    df.fillna(value=0, axis=1, inplace=True)
    df['compensation'] = df['compensation'].map(lambda x : x if x == 0 else 1)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)

    time_label = df['hours_transformed']

    return df, time_label

df, time_label = process_data()
df.head()

Unnamed: 0,userID,early_access,hours_transformed,found_funny,text,itemID,compensation,userIDX,itemIDX,month,...,month_cos,month_sin,day_of_month_cos,day_of_month_sin,day_of_wk_cos,day_of_wk_sin,day_of_yr_cos,day_of_yr_sin,wk_of_yr_cos,wk_of_yr_sin
0,u70666506,0,6.011227,1.0,If you want to sit in queue for 10-20min and h...,g49368897,0,4740,1209,0.363636,...,-0.654861,0.75575,-0.669131,-0.743145,0.5,-0.866025,-0.732494,0.680773,-0.663123,0.748511
1,u18612571,0,0.263034,0.0,I was really not a fan of the gameplay. Games ...,g73495588,0,1240,1800,0.0,...,1.0,0.0,0.669131,-0.743145,-0.5,-0.866025,0.901502,0.432776,0.935016,0.354605
2,u34283088,0,3.689299,0.0,Vaas Montenegro is the reason why you should g...,g68047320,0,2314,1652,0.181818,...,0.415415,0.909632,0.5,0.866025,-1.0,0.0,0.452072,0.891981,0.464723,0.885456
3,u16220374,0,1.263034,0.0,"8/10 Wonderful game, simple controls and platf...",g51234623,0,1067,1244,0.454545,...,-0.959493,0.281733,-0.809017,0.587785,0.5,-0.866025,-0.944188,0.329408,-0.935016,0.354605
4,u01499286,0,1.432959,0.0,Never knew a guns had THAT many parts!,g25723374,0,92,609,0.0,...,1.0,0.0,-0.978148,-0.207912,0.5,-0.866025,0.962309,0.271958,0.970942,0.239316


#### Preprocess user text and convert to descriptors

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE: Using pretrained sentiment similarity transformer
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_text_embedding():
    if not os.path.isfile('./text_embed.npy'): # Generate new descriptors for each review using pretrained transformer
        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        transformer = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)
        text_embed = np.zeros((len(df), 384))
        with torch.no_grad():
            for i in tqdm(range(0, len(df), 1)):
                encoded_input = tokenizer(df.iloc[i:i+1]['text'].tolist(),
                                        padding=True,
                                        truncation=True,
                                        return_tensors='pt').to(device)
                model_output = transformer(**encoded_input)
                embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
                embeddings: torch.Tensor = nn.functional.normalize(embeddings, p=2, dim=1)
                text_embed[i:i+1] = embeddings.cpu().numpy()
        np.save('text_embed.npy', text_embed)
    else: # Text descriptors already computed
        text_embed = np.load('./text_embed.npy')

    return text_embed

text_embed = get_text_embedding()

text_cols = ['te_' + str(i) for i in range(text_embed.shape[1])]

df.drop('text', axis=1, inplace=True)

# Add text descriptor features to dataframe
df = df.join(pd.DataFrame(text_embed, columns=text_cols))


In [4]:
user_mean = df.groupby('userIDX')[ft + text_cols].mean()
item_mean = df.groupby('itemIDX')[ft + text_cols].mean()
df.drop(labels=ft + text_cols, axis=1, inplace=True)

In [5]:
df_played_train = df.iloc[:150000]
df_played_valid = df.iloc[150000:]

#### Played dataset

In [6]:
class PlayedDataset(Dataset):
    def __init__(self, df) -> None:
        super().__init__()
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        row = self.df.iloc[index]
        userID = row['userID']
        itemID = row['itemID']
        userIDX = row['userIDX']
        itemIDX = row['itemIDX']
        negaID = random.choice(tuple(itemset.difference(I[userID]))) # Negative item ID
        negaIDX = item_oe.transform([[negaID]])[0][0]

        # Build positive pair
        pos = np.concatenate((row[2:].to_numpy().astype(np.float32),
                              user_mean.iloc[userIDX].to_numpy().astype(np.float32),
                              item_mean.iloc[itemIDX].to_numpy().astype(np.float32)))
        negrow = row.copy()
        negrow['userIDX'] = negaIDX
        neg = np.concatenate((negrow[2:].to_numpy().astype(np.float32),
                              user_mean.iloc[userIDX].to_numpy().astype(np.float32),
                              item_mean.iloc[negaIDX].to_numpy().astype(np.float32)))
        return torch.from_numpy(pos).to(dtype=torch.float32), torch.from_numpy(neg).to(dtype=torch.float32)

played_ds = PlayedDataset(df_played_train)
played_ds[0]



(tensor([ 4.7400e+03,  1.2090e+03,  3.6364e-01,  8.7500e-01,  6.3333e-01,
          8.3333e-01,  3.8082e-01,  3.6538e-01, -6.5486e-01,  7.5575e-01,
         -6.6913e-01, -7.4314e-01,  5.0000e-01, -8.6603e-01, -7.3249e-01,
          6.8077e-01, -6.6312e-01,  7.4851e-01,  2.1429e-01,  4.9497e+00,
          3.2643e+01,  0.0000e+00, -2.0878e-02, -7.3889e-03, -3.8625e-03,
         -5.0535e-02, -1.1783e-02,  3.1898e-02,  6.5037e-03, -1.3966e-05,
         -2.4322e-02,  4.0201e-02, -4.6274e-02,  2.1103e-02, -1.7140e-02,
          1.4387e-02,  9.4856e-04, -6.8986e-03,  5.9447e-02, -2.6645e-02,
          1.9625e-03, -1.8561e-02, -2.1035e-02, -1.9221e-02,  8.7146e-03,
         -1.4833e-03, -7.6345e-03, -7.9339e-03, -1.9430e-02,  2.4253e-02,
         -3.8123e-02, -4.6568e-02, -1.8670e-02,  5.1211e-02, -4.2783e-03,
         -3.8196e-04, -4.0363e-02, -9.5039e-03,  2.4656e-02, -6.1482e-02,
         -1.7572e-02, -1.1578e-02, -2.3224e-02, -7.8378e-03,  1.7691e-02,
          3.5012e-02,  6.3939e-03, -1.

### Model Definition

In [7]:
class FactorizationMachine(nn.Module):
    def __init__(self, n_user, n_item, n_feature, latent_dim) -> None:
        """
        n_user: Number of unique users
        n_item: Number of unique items
        n_feature: Number of extra features to use
        latent_dim: Dimension of latent representations of users/items/features
        """
        super().__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.n_feature = n_feature
        self.latent_dim = latent_dim

        self.user_latent = nn.Embedding(n_user, latent_dim)
        self.item_latent = nn.Embedding(n_item, latent_dim)
        self.feat_latent = nn.Embedding(n_feature, latent_dim)
        self.user_weight = nn.Embedding(n_user, 1)
        self.item_weight = nn.Embedding(n_item, 1)
        # "alpha" or "w_0" term will be absorbed into feat_weight linear's bias
        self.feat_weight = nn.Linear(n_feature, 1)
    def forward(self, x) -> torch.Tensor:
        """
        Input shape: batch_size x (user idx, item idx, features) - 2 dimensional
        Returns: n x 1 tensor of predictions
        """
        if len(x.size()) == 1:
            x = x.unsqueeze(0)
        # f(u, i) = w_0 + \sum_{j=1}^{d} w_j * x_j
        out = self.feat_weight(x[:, 2:])
        users = x[:, 0].to(dtype=torch.int32)
        items = x[:, 1].to(dtype=torch.int32)
        out += self.user_weight(users)
        out += self.item_weight(items)
        # Nested summation thingy
        # Interactions between users/items and features
        u_embed = self.user_latent(users)
        i_embed = self.item_latent(items)
        f_embed = self.feat_latent(torch.Tensor(range(0, self.n_feature)).to(device, dtype=torch.int32))
        out += (u_embed * i_embed).sum(dim=1).unsqueeze(-1)   # Dot product between user and item latent representations
        out += (u_embed @ f_embed.T).sum(dim=1).unsqueeze(-1) # Dot product between user and feature latent representations
        out += (i_embed @ f_embed.T).sum(dim=1).unsqueeze(-1) # Dot product between item and feature latent representations
        # Interactions between features
        for i in range(0, self.n_feature):
            for j in range(0, i):
                out += (f_embed[i] @ f_embed[j].T) * (x[:, 2 + i] * x[:, 2 + j]).unsqueeze(-1)
        return out


In [8]:
played_model = FactorizationMachine(len(df['userID'].unique()), len(df['itemID'].unique()), 792, 16).to(device)
played_dl = DataLoader(dataset=played_ds,
                       batch_size=20,
                       num_workers=4)
optimizer = optim.RMSprop(played_model.parameters(), lr=0.001, weight_decay=0.2)
def BPRLoss(pos_score, neg_score):
     return -(pos_score - neg_score).sigmoid().log().sum()

In [9]:
played_model(played_ds[0][0].to(device))

  out += (f_embed[i] @ f_embed[j].T) * (x[:, 2 + i] * x[:, 2 + j]).unsqueeze(-1)


tensor([[2303.8462]], device='cuda:0', grad_fn=<AddBackward0>)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
for i, (pos, neg) in enumerate(played_dl):
    print(f"{i}/{len(played_dl)}")
    pos_score = played_model(pos.to(device))
    neg_score = played_model(neg.to(device))
    loss = BPRLoss(pos_score, neg_score)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"ACU: {torch.sum(pos_score > neg_score) / len(pos_score)}")

0/7500


In [None]:
torch.cuda.is_available()