# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [24]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import feature_extraction

from lightfm import LightFM
from lightfm import data

import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os

RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### Preprocessing

#### Preprocess user/item ID's, compensation, early_access, and time

In [25]:
user_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5, handle_unknown='use_encoded_value', unknown_value=6710)
item_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)

itemset = set() # Set of all unique users
userset = set() # Set of all unique items
U = defaultdict(set)
I = defaultdict(set)

ft = ['early_access', 'compensation'] # features unavailable/cannot be approximated in inference
def read_json(path):
    f: gzip.GzipFile = gzip.open(path)
    f.readline()
    for line in f:
        entry = eval(line)
        yield entry

# Encode userID and itemID as integers
def process_data():
    global itemset, userset, U, I
    data = []
    for entry in read_json('train.json.gz'):
        data.append(entry)

    df: pd.DataFrame = pd.DataFrame(data)
    del data
    itemset = set(df['gameID'].unique())
    userset = set(df['userID'].unique())

    U = dict(df.groupby('gameID')['userID'].unique())
    I = dict(df.groupby('userID')['gameID'].unique())
    U = { g : set(U[g]) for g in U }
    I = { u : set(I[u]) for u in I }

    df['userIDX'] = user_oe.fit_transform(df[['userID']])
    df['itemIDX'] = item_oe.fit_transform(df[['gameID']])
    df.rename({'gameID' : 'itemID'}, axis=1, inplace=True)


    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)


    # Get features that won't be available
    df.fillna(value=0, axis=1, inplace=True)
    df['compensation'] = df['compensation'].map(lambda x : x if x == 0 else 1)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)

    time_label = df['hours_transformed']

    return df, time_label

df, time_label = process_data()
user_mean = df.groupby('userIDX')[ft].mean()
item_mean = df.groupby('itemIDX')[ft].mean()

In [26]:
df.drop(labels=ft + ['hours_transformed', 'found_funny'], axis=1, inplace=True)
df.head()

Unnamed: 0,userID,text,itemID,userIDX,itemIDX
0,u70666506,If you want to sit in queue for 10-20min and h...,g49368897,4740,1209
1,u18612571,I was really not a fan of the gameplay. Games ...,g73495588,1240,1800
2,u34283088,Vaas Montenegro is the reason why you should g...,g68047320,2314,1652
3,u16220374,"8/10 Wonderful game, simple controls and platf...",g51234623,1067,1244
4,u01499286,Never knew a guns had THAT many parts!,g25723374,92,609


#### Preprocess user text and convert to descriptors

In [27]:
df.head()

Unnamed: 0,userID,text,itemID,userIDX,itemIDX
0,u70666506,If you want to sit in queue for 10-20min and h...,g49368897,4740,1209
1,u18612571,I was really not a fan of the gameplay. Games ...,g73495588,1240,1800
2,u34283088,Vaas Montenegro is the reason why you should g...,g68047320,2314,1652
3,u16220374,"8/10 Wonderful game, simple controls and platf...",g51234623,1067,1244
4,u01499286,Never knew a guns had THAT many parts!,g25723374,92,609


In [12]:
def get_text_embedding():
    if not os.path.isfile('./text_embed.npy'): # Generate new descriptors for each review using pretrained transformer
        dftext = df.groupby('itemIDX')['text'].apply(' '.join).reset_index()
        counter = feature_extraction.text.CountVectorizer(min_df=0.05, max_df=0.5, stop_words='english', max_features=2000, ngram_range=(1, 2))
        wordcount = counter.fit_transform(dftext['text'])
        LDA = LatentDirichletAllocation(n_components=20, random_state=RANDOM_SEED)
        text_embed = LDA.fit_transform(wordcount)
        np.save('text_embed.npy', text_embed)
    else: # Text descriptors already computed
        text_embed = np.load('./text_embed.npy')

    return text_embed

text_embed = get_text_embedding()
text_embed = text_embed / np.linalg.norm(text_embed, axis=1)[...,None]

df.drop('text', axis=1, inplace=True)


In [13]:
df_played_train = df.iloc[:150000]
df_played_valid = df.iloc[150000:]

In [52]:
def embed2dict(embedding):
    return { i : embedding[i] for i in range(len(embedding)) }

In [53]:
textft = [(i, embed2dict(text_embed[i])) for i in range(len(text_embed))]

In [46]:
interactions = [(row[1]['userIDX'], row[1]['itemIDX']) for row in df_played_train.iterrows()]

In [54]:
fmdata = data.Dataset()
fmdata.fit(df_played_train['userIDX'].unique(),
           df_played_train['itemIDX'].unique())
fmdata.fit_partial(item_features=textft)
fmdata.build_interactions(interactions)
# fmdata.build_item_features(textft)

TypeError: unhashable type: 'dict'

#### Played dataset

In [None]:
test = pd.read_csv('./pairs_Played.csv')
test['itemID'] = test['gameID']
# Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
test['userIDX'] = user_oe.transform(test[['userID']])
test['itemIDX'] = item_oe.transform(test[['gameID']])
test.drop(columns=['gameID', 'prediction'], inplace=True)

In [None]:
test.head()