In [1]:
import numpy as np
import pandas as pd
import json
import ast
import os
import math
import statistics
import sklearn
import scipy.spatial
from collections import defaultdict
from collections import Counter

In [2]:
notebook_path = os.path.abspath("Similarity_Based_Memory_Model.ipynb")
users_items_file_path = os.path.join(os.path.dirname(notebook_path), "data/australian_users_items.json")
items_file_path = os.path.join(os.path.dirname(notebook_path), "data/items_meta_data.json")

In [3]:
users_items = []
with open(users_items_file_path, 'r') as data:
    for line in data:
        users_items.append(ast.literal_eval(line))

In [4]:
with open(items_file_path, 'r') as data:
    games_dict = json.load(data)

In [5]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
playtimesPerItem = defaultdict(list)
playtimesPerUser = defaultdict(list)
itemNames = defaultdict(str)

In [6]:
for game in games_dict:
    if 'owners' in games_dict[game]:
        usersPerItem[game] = set(games_dict[game]['owners'].keys())
        for owner in games_dict[game]['owners']:
            playtimesPerItem[game].append(games_dict[game]['owners'][owner])
            playtimesPerUser[owner].append(games_dict[game]['owners'][owner])

In [7]:
for user in users_items:
    itemsPerUser[user['user_id']] = [item['item_id'] for item in user['items']]
    for item in user['items']:
        itemNames[item['item_id']] = item['item_name']

# Similarity-Based Recommendations

## Jaccard Similarity

This is a generic implementation of the Jaccard similarity between two items: we find the union and intersection between their owners 

In [8]:
def jaccard(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    intersect_owners = owners_a.intersection(owners_b)
    union_owners = owners_a.union(owners_b)
    
    if len(union_owners) == 0:
        return 0
    
    return len(intersect_owners) / len(union_owners)

In [9]:
def mostSimilarJaccard(game_a_id):
    similarities = []
    for game_b_id in games_dict:
        if game_b_id == game_a_id: continue
        sim = jaccard(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [10]:
def mostSimilarJaccardFast(game_a_id):
    similarities = []
    users = usersPerItem[game_a_id]
    candidateItems = set()
    for user_id in users:
        candidateItems = candidateItems.union(itemsPerUser[user_id])
    for game_b_id in candidateItems:
        if game_b_id == game_a_id: continue
        sim = jaccard(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [11]:
def mostSimilarNames(similarities):
    return [itemNames[sim[1]] for sim in similarities]

In [12]:
mostSimilarNames(mostSimilarJaccardFast('70'))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

## Cosine Similarity

In [13]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = Counter(owners_a)
    counter_b = Counter(owners_b)
    
    owners = owners_a.union(owners_b)
    
    return counter_a, counter_b, owners

In [19]:
def cosine(game_a_id, game_b_id):
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = [counter_a.get(k, 0) for k in terms]
    magB = [counter_b.get(k, 0) for k in terms]
    return scipy.spatial.distance.cosine(magA, magB)

In [20]:
def mostSimilarCosine(game_a_id):
    similarities = []
    for game_b_id in games_dict:
        if game_b_id == game_a_id or len(usersPerItem[game_b_id]) == 0: continue
        sim = cosine(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [21]:
mostSimilarNames(mostSimilarCosine('70'))

['Attack on Pearl Harbor',
 'Simplz Zoo',
 "Grim Tales: The Bride Collector's Edition",
 'CloudBound',
 'Thick Air',
 'Microcosmum: survival of cells - Soundtrack',
 "Dark Parables: Curse of Briar Rose Collector's Edition",
 'Footbrawl Playground',
 'Eclipse: New Dawn for the Galaxy',
 'Toadled']

In [22]:
def arrayToDict(array, keyStr):
    array_dict = {}
    for element in array:
        array_dict[element[keyStr]] = element
    return array_dict

In [23]:
users_dict = arrayToDict(users_items, 'user_id')

In [24]:
def predictPlaytime(user, game_a, sim, default):
    playtimes = []
    similarities = []
    for item in users_dict[user]['items']:
        game_b = item['item_id']
        if game_b == game_a: continue
        playtimes.append(item['playtime_forever'])
        similarities.append(sim(game_a, game_b))
        
    if (sum(similarities) > 0):
        weightedPlaytime = [(x*y) for x,y in zip(playtimes, similarities)]
        return sum(weightedPlaytime) / sum(similarities)
    else:
        return default

In [25]:
def getPlaytime(users_items):
    playtimes = []
    for user in users_items:
        for item in user['items']:
            playtimes.append(item['playtime_forever'])
            
    return playtimes

In [26]:
playtimes = getPlaytime(users_items)
meanPlaytime = sum(playtimes) / len(playtimes)
medianPlaytime = statistics.median(playtimes)

In [27]:
def getPairsAndLabels(users_items):
    pairs = []
    labels = []
    for user in users_items:
        for item in user['items']:
            pairs.append((user['user_id'], item['item_id']))
            labels.append(item['playtime_forever'])
    return pairs, labels

In [28]:
pairs, label = getPairsAndLabels(users_items)

In [29]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [30]:
jaccardPredictions = [predictPlaytime(p[0], p[1], jaccard, meanPlaytime) for p in pairs]

KeyboardInterrupt: 

In [None]:
MSE(jaccardPredictions, labels)