In [1]:
import numpy as np
import pandas as pd
import json
import ast
import os
import math
import statistics
import sklearn
import scipy.spatial
from collections import defaultdict
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

In [2]:
notebook_path = os.path.abspath("Similarity_Based_Collaborative_Filtering.ipynb")
users_items_file_path = os.path.join(os.path.dirname(notebook_path), "data/australian_users_items.json")
items_file_path = os.path.join(os.path.dirname(notebook_path), "data/items_meta_data.json")

In [3]:
users_items = []
with open(users_items_file_path, 'r') as data:
    for line in data:
        users_items.append(ast.literal_eval(line))

In [4]:
with open(items_file_path, 'r') as data:
    games_dict = json.load(data)

In [5]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
playtimesPerItem = defaultdict(dict)
playtimesPerUser = defaultdict(dict)
itemNames = defaultdict(str)

In [6]:
for game in games_dict:
    if 'owners' in games_dict[game]:
        usersPerItem[game] = set(games_dict[game]['owners'].keys())

In [9]:
for user in users_items:
    itemsPerUser[user['user_id']] = [item['item_id'] for item in user['items']]
    playtimesPerUser[user['user_id']] = dict((item['item_id'], item['playtime_forever']) for item in user['items'])    
    for item in user['items']:
        itemNames[item['item_id']] = item['item_name']
        playtimesPerItem[item['item_id']][user['user_id']] = item['playtime_forever']

In [10]:
users = list(itemsPerUser.keys())
items = list(usersPerItem.keys())

# Similarity-Based Recommendations

In [11]:
def isEmpty(game_a_id, game_b_id):
    return len(usersPerItem[game_a_id]) == 0 or len(usersPerItem[game_b_id]) == 0

In [12]:
def mostSimilar(game_a_id, sim):
    similarities = []
    for game_b_id in games_dict:
        if game_b_id == game_a_id: continue
        similarity = sim(game_a_id, game_b_id)
        similarities.append((similarity, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [13]:
def mostSimilarNames(similarities):
    return [itemNames[sim[1]] for sim in similarities]

## Jaccard Similarity

This is a generic implementation of the Jaccard similarity between two items: we find the union and intersection between their owners 

In [14]:
def jaccard(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    intersect_owners = owners_a.intersection(owners_b)
    union_owners = owners_a.union(owners_b)
    
    return len(intersect_owners) / len(union_owners)

In [15]:
def mostSimilarJaccardFast(game_a_id):
    similarities = []
    users = usersPerItem[game_a_id]
    candidateItems = set()
    for user_id in users:
        candidateItems = candidateItems.union(itemsPerUser[user_id])
    for game_b_id in candidateItems:
        if game_b_id == game_a_id: continue
        sim = jaccard(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [16]:
mostSimilarNames(mostSimilar('70', jaccard))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

In [17]:
mostSimilarNames(mostSimilarJaccardFast('70'))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

## Cosine Similarity

- Mark a item that is purchased and played (playtime > 0) **1**
- Mark a item that is purchased but not played (playtime = 0) **-1**
- Mark a item that is not purchased **0**

In [18]:
def markOpinion(playtime):
    return 1 if playtime > 0 else -1

In [22]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = dict((x, markOpinion(playtimesPerUser[x][game_a_id])) for x in owners_a)
    counter_b = dict((x, markOpinion(playtimesPerUser[x][game_b_id])) for x in owners_b)
    
    owners = owners_a.union(owners_b)
    
    return counter_a, counter_b, owners

In [23]:
def cosine(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([[counter_a.get(k, 0) for k in terms]])
    magB = np.array([[counter_b.get(k, 0) for k in terms]])
    
    return cosine_similarity(magA, magB)

In [24]:
mostSimilarNames(mostSimilar('70', cosine))

['Half-Life 2',
 'Team Fortress Classic',
 'Portal',
 'Portal 2',
 "Garry's Mod",
 'Counter-Strike',
 'Half-Life 2: Episode One',
 'Counter-Strike: Source',
 'Left 4 Dead 2',
 'Half-Life: Opposing Force']

## Pearson Correlation

- Mark a item that is purchased and played (playtime > 0) with the value of its **playtime**
- Mark a item that is purchased but not played (playtime = 0) **-1**
- Mark a item that is not purchased **0**

In [26]:
def markValue(playtime):
    return playtime if playtime > 0 else -1    

In [27]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = dict((x, playtimesPerUser[x][game_a_id]) for x in owners_a)
    counter_b = dict((x, playtimesPerUser[x][game_b_id]) for x in owners_b)
    
    owners = owners_a.union(owners_b)

    return counter_a, counter_b, owners

In [28]:
def pearson(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
            
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([counter_a.get(k, 0) for k in terms])
    magB = np.array([counter_b.get(k, 0) for k in terms])
    return np.corrcoef(magA, magB)[0, 1]

In [29]:
mostSimilarNames(mostSimilar('70', pearson))

  c /= stddev[:, None]
  c /= stddev[None, :]


['BRAINPIPE: A Plunge to Unhumanity',
 'Mini Motor Racing EVO',
 'Violett',
 'The Plan',
 'Procyon',
 'The Forgotten Ones',
 'Our Love Will Grow',
 'Road Madness',
 'Millie',
 'Little Racers STREET']

# Similarity Playtime Estimation

We can also use the similarity-based recommender we developed above to make predictions about user's playtime. Although this is not an example of machine learning, it is a simple heuristic that can be used to estimate a user's future playtime based on their playtime in the past.

Specifically, a user's playtime for an item is assumed to be a weighted sum of their previous playtime, weighted by how similar the query item is to each of their previous purchases.

Use median playtime and mean playtime as bench mark 

In [30]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    bar_length = 40
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
        
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [50]:
def predictPlaytime(user, game_a, sim, default):
    playtimes = []
    similarities = []
    for game_b in itemsPerUser[user]:
        if game_b == game_a: continue
        playtimes.append(playtimesPerUser[user][game_b])
        similarities.append(sim(game_a, game_b))
        
    if (sum(similarities) > 0):
        weightedPlaytime = [(x*y) for x,y in zip(playtimes, similarities)]
        return sum(weightedPlaytime) / sum(similarities)
    else:
        return default

In [35]:
def getPlaytime(users_items):
    playtimes = []
    for user in users_items:
        for item in user['items']:
            playtimes.append(item['playtime_forever'])
            
    return playtimes

In [36]:
playtimes = getPlaytime(users_items)
meanPlaytime = sum(playtimes) / len(playtimes)
medianPlaytime = statistics.median(playtimes)

In [56]:
def markBinaryLabel(playtime, threshold):
    return 1 if playtime >= threshold else 0

In [57]:
def getPairsAndLabels(users_items):
    pairs = []
    labels = []
    for user in users_items:
        for item in user['items']:
            pairs.append((user['user_id'], item['item_id']))
            labels.append(item['playtime_forever'])
    return pairs, labels

In [59]:
pairs, labels = getPairsAndLabels(users_items)

In [None]:
def accuracy(predictions, labels):
    differences = [1 if (x == y) else 0 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [None]:
pairs_train, pairs_test, labels_train, labels_test = train_test_split(pairs, labels, test_size = 0.00005)

In [60]:
meanPredictions = [markBinaryLabel(meanPlaytime, games_dict[p[1]]['median_playtime']) for p in pairs_test]

In [61]:
medianPredictions = [markBinaryLabel(medianPlaytime, games_dict[p[1]]['median_playtime']) for p in pairs_test]

In [47]:
def computePredictions(sim, default):
    predictions = []
    num = len(pairs_test)
    for index, pair in enumerate(pairs_test):
        predictions.append(predictPlaytime(pair[0], pair[1], sim, default))
        update_progress(index / num)
    update_progress(1)
    return predictions

In [62]:
test_binary_labels = [markBinaryLabel(x, games_dict[p[1]]['median_playtime']) for x, p in zip(labels_test, pairs_test)]

In [51]:
jaccardMedianPredictions = computePredictions(jaccard, medianPlaytime)

Progress: [########################################] 100.0%


In [64]:
accuracy([markBinaryLabel(x, games_dict[p[1]]['median_playtime']) for x, p in zip(jaccardMedianPredictions, pairs_test)], test_binary_labels)

0.3953488372093023

In [53]:
cosineMedianPredictions = computePredictions(cosine, medianPlaytime)

Progress: [########################################] 100.0%


In [65]:
accuracy([markBinaryLabel(x, games_dict[p[1]]['median_playtime']) for x, p in zip(cosineMedianPredictions, pairs_test)], test_binary_labels)

0.3953488372093023

In [71]:
pearsonMedianPredictions = computePredictions(pearson, medianPlaytime)

Progress: [########################################] 100.0%


In [73]:
accuracy([markBinaryLabel(x, games_dict[p[1]]['median_playtime']) for x, p in zip(pearsonMedianPredictions, pairs_test)], test_binary_labels)

0.5891472868217055