# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
import torch
from torch import nn, optim
from transformers import AutoTokenizer, AutoModel

from tqdm import tqdm
import gzip

import os

RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x16ef164e170>

### Preprocessing

#### Preprocess user/item ID's, compensation, early_access, and time

In [2]:
def read_json(path):
    f: gzip.GzipFile = gzip.open(path)
    f.readline()
    for line in f:
        entry = eval(line)
        yield entry

# Encode userID and itemID as integers
def process_data() -> pd.DataFrame:
    data = []
    for entry in read_json('train.json.gz'):
        data.append(entry)

    df: pd.DataFrame = pd.DataFrame(data)

    del data
    oe = preprocessing.OrdinalEncoder(dtype=np.int32) # User/item id encoder
    df[['userID', 'gameID']] = oe.fit_transform(df[['userID', 'gameID']])
    df.rename({'gameID' : 'itemID'}, axis=1, inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].map(lambda x : x.month)
    df['year'] = df['date'].map(lambda x : x.year)
    df['day_of_month'] = df['date'].map(lambda x : x.day)
    df['day_of_wk'] = df['date'].map(lambda x : x.dayofweek)
    df['day_of_yr'] = df['date'].map(lambda x : x.dayofyear)
    df['wk_of_yr'] = df['date'].map(lambda x : x.weekofyear)
    df.fillna(value=0, axis=1, inplace=True)
    df['compensation'] = df['compensation'].map(lambda x : x if x == 0 else 1)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)
    mme = preprocessing.MinMaxScaler()
    df[['month', 'year', 'day_of_month', 'day_of_wk', 'day_of_yr', 'wk_of_yr']] = mme.fit_transform(df[['month', 'year', 'day_of_month', 'day_of_wk', 'day_of_yr', 'wk_of_yr']])
    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)
    df.head()
    return df

df = process_data()

#### Preprocess user text and convert to descriptors

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: Using pretrained sentiment similarity transformer
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_text_embedding():
    if not os.path.isfile('./text_embed.npy'): # Generate new descriptors for each review using pretrained transformer
        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        transformer = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)
        text_embed = np.zeros((len(df), 384))
        with torch.no_grad():
            for i in tqdm(range(0, len(df), 1)):
                encoded_input = tokenizer(df.iloc[i:i+1]['text'].tolist(),
                                        padding=True,
                                        truncation=True,
                                        return_tensors='pt').to(device)
                model_output = transformer(**encoded_input)
                embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
                embeddings: torch.Tensor = nn.functional.normalize(embeddings, p=2, dim=1)
                text_embed[i:i+1] = embeddings.cpu().numpy()
        np.save('text_embed.npy', text_embed)
    else: # Text descriptors already computed
        text_embed = np.load('./text_embed.npy')

    return text_embed

text_embed = get_text_embedding()

### Model Definition

In [26]:
class FactorizationMachine(nn.Module):
    def __init__(self, n_user, n_item, n_feature, latent_dim) -> None:
        """
        n_user: Number of unique users
        n_item: Number of unique items
        n_feature: Number of extra features to use
        latent_dim: Dimension of latent representations of users/items/features
        """
        super().__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.n_feature = n_feature
        self.latent_dim = latent_dim

        self.user_latent = nn.Embedding(n_user, latent_dim)
        self.item_latent = nn.Embedding(n_item, latent_dim)
        self.feat_latent = nn.Embedding(n_feature, latent_dim)
        self.user_weight = nn.Embedding(n_user, 1)
        self.item_weight = nn.Embedding(n_item, 1)
        # "alpha" or "w_0" term will be absorbed into feat_weight linear's bias
        self.feat_weight = nn.Linear(n_feature, 1)
    def forward(self, x) -> torch.Tensor:
        """
        Input shape: batch_size x (user idx, item idx, features) - 2 dimensional
        Returns: n x 1 tensor of predictions
        """
        # f(u, i) = w_0 + \sum_{j=1}^{d} w_j * x_j
        out = self.feat_weight(x[:, 2:])
        out += self.user_weight(x[:, 0].to(dtype=torch.int32))
        out += self.item_weight(x[:, 1].to(dtype=torch.int32))
        # Nested summation thingy
        u_embed = self.user_latent(x[:, 0].to(dtype=torch.int32))
        i_embed = self.item_latent(x[:, 1].to(dtype=torch.int32))
        f_embed = self.feat_latent(torch.Tensor(range(0, self.n_feature)).to(device, dtype=torch.int32))
        out += (u_embed * i_embed).sum(dim=1).unsqueeze(-1)
        out += (u_embed @ f_embed.T).sum(dim=1).unsqueeze(-1)
        out += (i_embed @ f_embed.T).sum(dim=1).unsqueeze(-1)
        for i in range(0, self.n_feature):
            for j in range(0, i):
                out += (f_embed[i] @ f_embed[j].T) * (x[:, 2 + i] * x[:, 2 + j]).unsqueeze(-1)
        return out


In [29]:
model = FactorizationMachine(2, 2, 3, 5).to(device)
ui = torch.randint(0, 2, (10, 2))
d = torch.concatenate([ui, torch.randn((10, 3))], dim=1).to(device)
model(d)

tensor([[4.5396],
        [1.1414],
        [6.9715],
        [4.4861],
        [3.5364],
        [2.8154],
        [5.3209],
        [2.4965],
        [4.4025],
        [1.8168]], grad_fn=<AddBackward0>)