# Modelling

The purpose of this notebook is to build a recommendation system that generates both per-user game recommendations and similar-item recommendations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import textwrap
%matplotlib inline

from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from scipy.spatial import distance
from sklearn.manifold import TSNE
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from resources import *

import warnings
warnings.filterwarnings("ignore")

In [None]:
%run resources.py

Load `recdata.csv` with user and owned games ids

In [None]:
# Read the user/game ownership table and rename its generic columns:
# 'variable' becomes the game 'id' and 'value' becomes the 'owned' flag.
recdata = (
    pd.read_csv('recdata.csv', index_col=0)
      .rename(columns={'variable': 'id', 'value': 'owned'})
)

Load file with game details

In [None]:
# Game details table; the first CSV column is used as the DataFrame index.
# Later cells read its 'id' and 'title' columns.
gamesdata = pd.read_csv('gamesdata.csv', index_col = 0)

## Preprocessing

Our approach involves generating an interactions matrix from the user-item data. This can be achieved through implementation of the create_interaction_matrix function, which is located in resources.py.

In [None]:
# Build the user x game interactions table from the long-format ownership data.
# create_interaction_matrix is a helper from resources.py (not shown here);
# presumably rows are users ('uid') and columns are game ids ('id') — confirm.
interactions = create_interaction_matrix(df = recdata,
                                         user_col = 'uid',
                                         item_col = 'id',
                                         rating_col = 'owned')

Manually split interactions matrix into training set and test set for analysis

In [None]:
# Number of rows in the interactions matrix (used below to size the train/test split)
len(interactions)

In [None]:
# 80/20 split: the training size is rounded down, the test set gets the remainder.
train_num = int(0.8 * len(interactions))
test_num = len(interactions) - train_num

print("{} users in training set.".format(train_num))
print("{} users in test set.".format(test_num))

In [None]:
# Define sets: the first `train_num` rows are used for training, the rest for testing.
# The split point comes from the size computed in the previous cell rather than a
# hard-coded row count, so it stays correct if the input data changes.
train = interactions.iloc[:train_num]
test = interactions.iloc[train_num:]

### User Dictionary

A dictionary will be made to pair users with a counter ID. This will be achieved by utilizing the create_user_dict function, which is present in resources.py.

In [None]:
# Generate a user dictionary with the create_user_dict helper from resources.py.
# Presumably maps each user id in `interactions` to a counter index — confirm in resources.py.
user_dict = create_user_dict(interactions)

### Item Dictionary

A dictionary will be generated to match every game ID with its corresponding title

In [None]:
# Generate a game dictionary with the create_item_dict helper from resources.py.
# Later cells index it by game id to obtain a title (e.g. games_dict[firstgameid]).
games_dict = create_item_dict(gamesdata, 'id', 'title')

### Sparse Matrices

In [None]:
# Convert the interactions to sparse matrices for more efficient computation.
# The training set is converted directly with sparse.csr_matrix(). The test set
# has fewer rows than the training set, so it is first padded with all-zero rows
# until both have the same number of rows.
# NOTE(review): presumably the padding works around a shape requirement of the
# LightFM evaluation functions — confirm and document the actual issue.
# NOTE(review): after padding, row i of the test matrix shares its row index with
# row i of the training matrix although the two rows come from different users;
# verify that evaluating the test rows this way is intended.

train_sparse = sparse.csr_matrix(train.values)

N = train.shape[0] # Total rows in Train set
n, m = test.shape # Rows and columns in Test set
z = np.zeros([(N - n), m]) # All-zero filler rows (N - n of them, m columns each)
test = np.vstack((test, z)) # Append the filler rows below Test; `test` is rebound to the stacked dense array
test_sparse = sparse.csr_matrix(test) # Convert the padded array to a sparse matrix

## Modelling using LightFM

WARP Loss Function

In [None]:
# Initialize and train the model with the run_model helper (expected from resources.py).
# Positional arguments: training interactions, 30 (presumably n_components), the 'warp' loss,
# then 30 and 4 (presumably epochs and threads — confirm against run_model's signature).
mf_model_warp = run_model(train, 30, 'warp', 30, 4)

In [None]:
# Precision@10 of the WARP model: mean of the values returned for train and test
train_precision = precision_at_k(mf_model_warp, train_sparse, k=10).mean()
test_precision = precision_at_k(mf_model_warp, test_sparse, k=10).mean()
print('Precision: train {:.2f}, test {:.2f}.'.format(train_precision, test_precision))

In [None]:
# AUC of the WARP model: mean of the values returned for train and test
train_auc = auc_score(mf_model_warp, train_sparse).mean()
test_auc = auc_score(mf_model_warp, test_sparse).mean()
print('AUC: train {:.2f}, test {:.2f}.'.format(train_auc, test_auc))

BPR Loss Function

In [None]:
# Same configuration as the WARP model above, but trained with the 'bpr' loss.
mf_model_bpr = run_model(train, 30, 'bpr', 30, 4)

In [None]:
# Precision@10 of the BPR model on train and test
train_precision = precision_at_k(mf_model_bpr, train_sparse, k=10).mean()
test_precision = precision_at_k(mf_model_bpr, test_sparse, k=10).mean()
print('Precision: train {:.2f}, test {:.2f}.'.format(train_precision, test_precision))

In [None]:
# AUC of the BPR model on train and test
train_auc = auc_score(mf_model_bpr, train_sparse).mean()
test_auc = auc_score(mf_model_bpr, test_sparse).mean()
print('AUC: train {:.2f}, test {:.2f}.'.format(train_auc, test_auc))

### Adjusting Components

The number of embeddings, which determines the dimension of the features in the latent space, can be regulated by adjusting the n_components parameter. To observe the impact of the parameter on the model's effectiveness, we will first reduce the number to 5 and then elevate it to 100.

In [None]:
# Init + fit: WARP model with the second run_model argument lowered from 30 to 5
# (the surrounding text describes this as reducing the number of components).
mf_model_warp_2 = run_model(train, 5, 'warp', 30, 4)

In [None]:
# Precision@10 for the 5-component WARP model
train_precision = precision_at_k(mf_model_warp_2, train_sparse, k=10).mean()
test_precision = precision_at_k(mf_model_warp_2, test_sparse, k=10).mean()
print('Precision: train {:.2f}, test {:.2f}.'.format(train_precision, test_precision))

In [None]:
# AUC for the 5-component WARP model
train_auc = np.mean(auc_score(mf_model_warp_2, train_sparse))
test_auc = np.mean(auc_score(mf_model_warp_2, test_sparse))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [None]:
# Init + fit: WARP model with the second run_model argument raised to 100.
# NOTE(review): the variable name ends in `_50` although the value passed is 100;
# the name is kept because the following cells refer to it.
mf_model_warp_50 = run_model(train, 100, 'warp', 30, 4)

In [None]:
# Precision@10 for the model stored in mf_model_warp_50
train_precision = np.mean(precision_at_k(mf_model_warp_50, train_sparse, k=10))
test_precision = np.mean(precision_at_k(mf_model_warp_50, test_sparse, k=10))
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

In [None]:
# AUC for the model stored in mf_model_warp_50.
# The interaction matrix is passed positionally, as in the other evaluation cells:
# lightfm's auc_score has no `user_items` keyword (its second parameter is
# `test_interactions`), so the keyword form raised a TypeError.
train_auc = auc_score(mf_model_warp_50, train_sparse).mean()
test_auc = auc_score(mf_model_warp_50, test_sparse).mean()
print(f'AUC: Train {train_auc:.2f}, Test {test_auc:.2f}.')

In general, it seems that modifying the n_components attribute has minimal influence on the overall performance of the model.

### Model

In [4]:
# Train chosen model (WARP + 30 components) on the full interactions matrix,
# i.e. on all users rather than only the training split used for evaluation above.
mf_model = run_model(interactions, 30, 'warp', 30, 4)

### Embedding Space

In [None]:
# Get item embeddings from the trained model.
# Later cells index this array by a game's column position in `interactions`.
item_embeddings = mf_model.item_embeddings
item_embeddings

In [None]:
# Sample game vector: the embedding stored at position 0
item_embeddings[0]

In [None]:
# Retrieve the name of the game in the first column of `interactions`:
# take the first column label and look it up in games_dict.
firstgameid = interactions.columns[0]
games_dict[firstgameid]

### Examining Pair Similarity in Embedding Space with Games

We will measure the similarity between the embedding vectors of two games we consider very similar, in this case `Call of Duty` and `Battlefield`.

In [None]:
# Show the gamesdata rows whose title is one of the two games
mask = gamesdata['title'].isin(['Call of Duty', 'Battlefield'])
gamesdata.loc[mask]

Check out vectors for COD

In [None]:
# Retrieve the game id for Call of Duty
cod_id = gamesdata.loc[gamesdata['title'] == 'Call of Duty', 'id'].iloc[0]

# Embedding rows are addressed by the game's column position in `interactions`
# (as in the Battlefield cell), not by the row label of `gamesdata`.
cod_index = np.where(interactions.columns == cod_id)[0][0]

# Embeddings vector (taken from `item_embeddings`, the array defined above)
cod_vector = item_embeddings[cod_index]
cod_vector

Check out vectors for Battlefield

In [None]:
# Retrieve game id for Battlefield
bf_id = gamesdata.loc[gamesdata['title'] == 'Battlefield', 'id'].iloc[0]

# Column position of that game id in the interactions matrix
bf_index = np.where(interactions.columns == bf_id)[0][0]

# Embeddings vector (taken from `item_embeddings`; no `embeddings` variable is defined)
bf_vector = item_embeddings[bf_index]
bf_vector

Compute Euclidean Distance

In [None]:
# Euclidean distance between the Call of Duty and Battlefield embedding vectors
distance.euclidean(cod_vector, bf_vector)

#### Next, we compare two games we consider dissimilar to check whether the distance metric behaves as expected

In [None]:
# Filter gamesdata to get only Call of Duty and Bloons TD 6, the pair compared below.
# (The title list previously contained 'Counter-Strike', which is not part of the comparison.)
game_titles = ['Call of Duty', 'Bloons TD 6']
game_data = gamesdata[gamesdata['title'].isin(game_titles)]
game_data

In [None]:
# Retrieve game id for Bloons
bloons_id = gamesdata.loc[gamesdata['title']=='Bloons TD 6', 'id'].values[0]

# Obtain the game's column position in the interactions matrix
bloons_index = np.where(interactions.columns==bloons_id)[0][0]

# Obtain embeddings vector (from `item_embeddings`; no `embeddings` variable is defined)
bloons_vector = item_embeddings[bloons_index]
bloons_vector

Euclidean Distance

In [None]:
# Euclidean distance between the Call of Duty and Bloons TD 6 embedding vectors
distance.euclidean(cod_vector, bloons_vector)

Cosine Distances, comparing the two scenarios:

In [None]:
# Cosine distance for the "similar" pair and the "different" pair
cosine_cs_cod = distance.cosine(cod_vector, bf_vector)         # Call of Duty vs Battlefield
cosine_cs_bloons = distance.cosine(cod_vector, bloons_vector)  # Call of Duty vs Bloons TD 6

# Print the two values computed above (the previous print statements referenced
# names that are never defined and raised a NameError).
print('Cos distance between COD and Battlefield: {:.2f}'.format(cosine_cs_cod))
print('Cosine distance between COD and Bloons TD 6: {:.2f}'.format(cosine_cs_bloons))

### Embedding Exploration

In [None]:
# Dimensionality of each item embedding vector
embedding_size = item_embeddings.shape[1]

# Create instance. WordEmbeddingsKeyedVectors is the class imported at the top of the
# notebook; a bare `KeyedVectors` name is not imported anywhere.
kv = WordEmbeddingsKeyedVectors(embedding_size)

# Add game names and embeddings to kv.
# games_dict is indexed directly by game id (as in the other cells), and the
# embedding row is selected by the game's column position in `interactions`.
for idx, game_id in enumerate(interactions.columns):
    name = games_dict[game_id]
    kv.add([name], [item_embeddings[idx]])

Check for games similar to `Call of Duty`

In [None]:
# Entries of kv most similar to 'Call of Duty'
kv.most_similar('Call of Duty')

In [None]:
# Entries of kv most similar to 'Battlefield'
kv.most_similar('Battlefield')

In [None]:
# Entries of kv most similar to 'Bloons TD 6'
kv.most_similar('Bloons TD 6')

### Plots

In [None]:
def plot_similar_items(game_title, ax, num_items=5):
    """
    Draw a horizontal bar chart of the items most similar to a game.

    The neighbours are looked up in the notebook-level `kv` object and are
    plotted in reverse of the order in which `kv.most_similar` returns them.

    Arguments:
        - game_title: string, title of the game to look up (also used as the plot title)
        - ax: the axes on which to draw the bars
        - num_items (default=5): number of similar items requested from `kv`
    """
    neighbours = list(reversed(kv.most_similar(game_title, topn=num_items)))
    titles = [pair[0] for pair in neighbours]
    scores = [pair[1] for pair in neighbours]
    bar_positions = np.arange(len(neighbours))

    ax.barh(bar_positions, scores)
    # The x-axis starts at 0.6, or lower when a similarity falls below that, and ends at 1.0
    ax.set_xlim(right=1.0, left=min(.6, min(scores)))

    # Wrap long titles over multiple lines before using them as tick labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([textwrap.fill(title, width=24) for title in titles])
    ax.set_title(game_title)

In [None]:
# Define list of games to visualise similar items for
games = ['Call of Duty', 'Battlefield', 'Bloons TD 6']

# Create figure and set layout
fig = plt.figure(figsize=(15, 9))
gs = fig.add_gridspec(3, 2)

# Loop through games and draw each chart with plot_similar_items, the helper
# defined in the previous cell (no function named `plot_similar` exists).
for i, game in enumerate(games):
    ax = fig.add_subplot(gs[i])
    plot_similar_items(game, ax)

# Add title and adjust layout
fig.suptitle('Games and Their Most Similar Items', fontsize=16)
fig.tight_layout(pad=2, rect=[0, 0, 1, 0.96])

### Visualization

To visualize the embeddings, we will apply the t-SNE algorithm, which will transform the embeddings from a 30-dimensional space (determined by the number of components) into a 2-dimensional space.

In [None]:
# Instantiate TSNE object
tsne = TSNE(n_components=2, metric='cosine', perplexity=30, n_iter=1000, random_state=0)

# Fit and transform the item embeddings. Each row of `item_embeddings` is one game,
# so the array is passed as-is: transposing it would treat the latent components as
# the samples and yield one 2-D point per component instead of one per game.
embeddings2d = tsne.fit_transform(item_embeddings)

In [None]:
# New df with names and embeddings

# Game titles in the column order of `interactions`, which is the order used to
# index the item embeddings elsewhere in this notebook. `gameslist` was not
# defined in any earlier cell, so it is built here.
gameslist = [games_dict[game_id] for game_id in interactions.columns]

embeddingsdf = pd.DataFrame({'game': gameslist,
                             'x': embeddings2d[:,0],
                             'y': embeddings2d[:,1]})

#### Plots

In [None]:
# Scatter plot of the 2-D coordinates stored in embeddingsdf
plt.figure(figsize=(10,8))
# NOTE(review): `sns` is not imported in this notebook's import cell; presumably it
# is provided by `from resources import *` / `%run resources.py` — confirm.
sns.set_style('white')

# Scatter points, set alpha low to make points translucent
sns.scatterplot(x='x', y='y', data=embeddingsdf, alpha=0.1)
plt.show()

### Recommendations (User)

In [None]:
# Get recommendations for user with ID 5000
# Display switches (previously referenced without ever being defined)
show_known = True   # print the games the user already owns
show_recs = True    # print the recommended games

# user_dict pairs each user with a counter id.
# NOTE(review): the lookup key is the string '5000'; confirm that user_dict keys are
# strings rather than integers.
user_id = user_dict['5000']

# NOTE(review): assumes the counter id is the user's row position in `interactions`
# (the matrix mf_model was trained on), so the row is selected positionally — confirm
# against create_user_dict.
interactions_user = interactions.iloc[user_id]

# Get the list of games (column labels = game ids) that the user has interacted with
known_user_likes = list(interactions_user[interactions_user > 0].index)

# Get the predicted score for each game in a single call. LightFM's predict expects
# an array of item indices, which are the column positions of `interactions`.
item_positions = np.arange(interactions.shape[1], dtype=np.int32)
predicted_scores = mf_model.predict(user_id, item_positions)

# Pair every score with its game id and drop the games the user already owns.
known_ids = set(known_user_likes)
scores = [(game_id, score)
          for game_id, score in zip(interactions.columns, predicted_scores)
          if game_id not in known_ids]

# Sort scores (highest first) and get top 5
scores.sort(reverse=True, key=lambda x: x[1])
topn_scores = scores[:5]

# Convert game IDs to game names
topn_games = [games_dict[game_id] for game_id, _ in topn_scores]

# Show known games and recommendations
if show_known:
    print(f"Known likes for user {user_id}:")
    for game in known_user_likes:
        print(f"- {games_dict[game]}")
if show_recs:
    print(f"Recommended games for user {user_id}:")
    for game in topn_games:
        print(f"- {game}")

# Keep the list of recommended games.
# NOTE(review): the name says u12 although the lookup key above is '5000'.
rec_list_u12 = topn_games

### Recommendations (Item)