In [1]:
import numpy as np
import pandas as pd
import json
import ast
import os
import math
import statistics
import sklearn
import scipy.spatial
from collections import defaultdict
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
notebook_path = os.path.abspath("Similarity_Based_Memory_Model.ipynb")
users_items_file_path = os.path.join(os.path.dirname(notebook_path), "data/australian_users_items.json")
items_file_path = os.path.join(os.path.dirname(notebook_path), "data/items_meta_data.json")

In [3]:
users_items = []
with open(users_items_file_path, 'r') as data:
    for line in data:
        users_items.append(ast.literal_eval(line))

In [4]:
with open(items_file_path, 'r') as data:
    games_dict = json.load(data)

In [5]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
playtimesPerItem = defaultdict(list)
playtimesPerUser = defaultdict(list)
itemNames = defaultdict(str)

In [6]:
for game in games_dict:
    if 'owners' in games_dict[game]:
        usersPerItem[game] = set(games_dict[game]['owners'].keys())

In [7]:
for user in users_items:
    itemsPerUser[user['user_id']] = [item['item_id'] for item in user['items']]
    playtimesPerUser[user['user_id']] = user['items']
    for item in user['items']:
        itemNames[item['item_id']] = item['item_name']
        playtimesPerItem[item['item_id']].append(item)

# Similarity-Based Recommendations

In [8]:
def isEmpty(game_a_id, game_b_id):
    return len(usersPerItem[game_a_id]) == 0 or len(usersPerItem[game_b_id]) == 0

In [9]:
def mostSimilar(game_a_id, sim):
    similarities = []
    for game_b_id in games_dict:
        if game_b_id == game_a_id: continue
        similarity = sim(game_a_id, game_b_id)
        similarities.append((similarity, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [10]:
def mostSimilarNames(similarities):
    return [itemNames[sim[1]] for sim in similarities]

## Jaccard Similarity

This is a generic implementation of the Jaccard similarity between two items: we find the union and intersection between their owners 

In [11]:
def jaccard(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    intersect_owners = owners_a.intersection(owners_b)
    union_owners = owners_a.union(owners_b)
    
    return len(intersect_owners) / len(union_owners)

In [12]:
def mostSimilarJaccardFast(game_a_id):
    similarities = []
    users = usersPerItem[game_a_id]
    candidateItems = set()
    for user_id in users:
        candidateItems = candidateItems.union(itemsPerUser[user_id])
    for game_b_id in candidateItems:
        if game_b_id == game_a_id: continue
        sim = jaccard(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [13]:
mostSimilarNames(mostSimilar('70', jaccard))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

In [14]:
mostSimilarNames(mostSimilarJaccardFast('70'))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

## Cosine Similarity

In [15]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = Counter(owners_a)
    counter_b = Counter(owners_b)
    
    owners = owners_a.union(owners_b)
    
    return counter_a, counter_b, owners

In [16]:
def cosine(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([[counter_a.get(k, 0) for k in terms]])
    magB = np.array([[counter_b.get(k, 0) for k in terms]])
    
    return cosine_similarity(magA, magB)

In [17]:
mostSimilarNames(mostSimilar('70', cosine))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode Two',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

## Pearson Correlation

In [18]:
def valueVocab(vocabs):
    dictionary = {}
    for index, vocab in enumerate(vocabs):
        dictionary[vocab] = index
    return dictionary

In [19]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = Counter(owners_a)
    counter_b = Counter(owners_b)
    
    owners = owners_a.union(owners_b)
    owners_dict = valueVocab(owners)
    counter_a = dict((x,owners_dict[x]*y) for x,y in counter_a.items())
    counter_b = dict((x,owners_dict[x]*y) for x,y in counter_b.items())

    return counter_a, counter_b, owners

In [20]:
def pearson(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
            
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([counter_a.get(k, 0) for k in terms])
    magB = np.array([counter_b.get(k, 0) for k in terms])
    return np.corrcoef(magA, magB)[0, 1]

In [21]:
mostSimilarNames(mostSimilar('70', pearson))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2',
 'Half-Life 2: Lost Coast',
 'Half-Life 2: Deathmatch',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Episode Two']

# Similarity Playtime Estimation

We can also use the similarity-based recommender we developed above to make predictions about user's playtime. Although this is not an example of machine learning, it is a simple heuristic that can be used to estimate a user's future playtime based on their playtime in the past.

Specifically, a user's playtime for an item is assumed to be a weighted sum of their previous playtime, weighted by how similar the query item is to each of their previous purchases.

Use median playtime and mean playtime as base line percision 

In [22]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    bar_length = 40
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
        
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [23]:
def arrayToDict(array, keyStr):
    array_dict = {}
    for element in array:
        array_dict[element[keyStr]] = element
    return array_dict

In [24]:
def makeIndexMap(dictionary):
    index = {}
    reverse = {}
    i = 0
    for item in dictionary:
        index[item] = i
        index[i] = item
        i += 1
    return index, reverse

In [25]:
index, reverse = makeIndexMap(games_dict)

In [26]:
def predictPlaytime(user, game_a, sim, default):
    playtimes = []
    similarities = []
    for item in playtimesPerUser[user]:
        game_b = item['item_id']
        if game_b == game_a: continue
        playtimes.append(item['playtime_forever'])
        similarities.append(sim(game_a, game_b))
        
    if (sum(similarities) > 0):
        weightedPlaytime = [(x*y) for x,y in zip(playtimes, similarities)]
        return sum(weightedPlaytime) / sum(similarities)
    else:
        return default

In [27]:
def getPlaytime(users_items):
    playtimes = []
    for user in users_items:
        for item in user['items']:
            playtimes.append(item['playtime_forever'])
            
    return playtimes

In [28]:
playtimes = getPlaytime(users_items)
meanPlaytime = sum(playtimes) / len(playtimes)
medianPlaytime = statistics.median(playtimes)

In [29]:
def markLabel(game, playtime):
    return 1 if (games_dict[game]['median_playtime'] <= playtime) else 0

In [30]:
def getPairsAndLabels(users_items):
    pairs = []
    labels = []
    for user in users_items:
        for item in user['items']:
            pairs.append((user['user_id'], item['item_id']))
            labels.append(markLabel(item['item_id'], item['playtime_forever']))
    return pairs, labels

In [31]:
pairs, labels = getPairsAndLabels(users_items)

In [32]:
len(labels)

5153209

In [33]:
def percision(predictions, labels):
    differences = [1 if (x == y) else 0 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [34]:
pairs_train, pairs_test, labels_train, labels_test = train_test_split(pairs, labels, test_size = 0.0005)

In [35]:
meanPredictions = [markLabel(p[1], meanPlaytime) for p in pairs_test]

In [36]:
medianPredictions = [markLabel(p[1], medianPlaytime) for p in pairs_test]

In [37]:
def computePredictions(sim, default):
    predictions = []
    num = len(pairs_test)
    for index, pair in enumerate(pairs_test):
        predictions.append(markLabel(pair[1], predictPlaytime(pair[0], pair[1], sim, default)))
        update_progress(index / num)
    update_progress(1)
    return predictions

In [39]:
jaccardMedianPredictions = computePredictions(jaccard, medianPlaytime)

Progress: [########################################] 100.0%


In [41]:
percision(jaccardMedianPredictions, labels_test)

0.38494373302289486

In [42]:
cosineMedianPredictions = computePredictions(cosine, medianPlaytime)

Progress: [########################################] 100.0%


In [43]:
percision(cosineMedianPredictions, labels_test)

0.37407838571982927

In [44]:
pearsonMedianPredictions = computePredictions(pearson, medianPlaytime)

Progress: [########################################] 100.0%


In [45]:
percision(pearsonMedianPredictions, labels_test)

0.5638339154055103