In [1]:
import numpy as np
import pandas as pd
import json
import ast
import os
import math
import statistics
import sklearn
import scipy.spatial
from collections import defaultdict
from collections import Counter

In [2]:
notebook_path = os.path.abspath("Similarity_Based_Memory_Model.ipynb")
users_items_file_path = os.path.join(os.path.dirname(notebook_path), "data/australian_users_items.json")
items_file_path = os.path.join(os.path.dirname(notebook_path), "data/items_meta_data.json")

In [3]:
users_items = []
with open(users_items_file_path, 'r') as data:
    for line in data:
        users_items.append(ast.literal_eval(line))

In [4]:
with open(items_file_path, 'r') as data:
    games_dict = json.load(data)

In [5]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
playtimesPerItem = defaultdict(list)
playtimesPerUser = defaultdict(list)
itemNames = defaultdict(str)

In [6]:
for game in games_dict:
    if 'owners' in games_dict[game]:
        usersPerItem[game] = set(games_dict[game]['owners'].keys())
        for owner in games_dict[game]['owners']:
            playtimesPerItem[game].append(games_dict[game]['owners'][owner])
            playtimesPerUser[owner].append(games_dict[game]['owners'][owner])

In [7]:
for user in users_items:
    itemsPerUser[user['user_id']] = [item['item_id'] for item in user['items']]
    for item in user['items']:
        itemNames[item['item_id']] = item['item_name']

# Similarity-Based Recommendations

In [44]:
def isEmpty(game_a_id, game_b_id):
    return len(usersPerItem[game_a_id]) == 0 or len(usersPerItem[game_b_id]) == 0

In [45]:
def mostSimilar(game_a_id, sim):
    similarities = []
    for game_b_id in games_dict:
        if game_b_id == game_a_id: continue
        similarity = sim(game_a_id, game_b_id)
        similarities.append((similarity, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [46]:
def mostSimilarNames(similarities):
    return [itemNames[sim[1]] for sim in similarities]

## Jaccard Similarity

This is a generic implementation of the Jaccard similarity between two items: we find the union and intersection between their owners 

In [47]:
def jaccard(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    intersect_owners = owners_a.intersection(owners_b)
    union_owners = owners_a.union(owners_b)
    
    return len(intersect_owners) / len(union_owners)

In [48]:
def mostSimilarJaccardFast(game_a_id):
    similarities = []
    users = usersPerItem[game_a_id]
    candidateItems = set()
    for user_id in users:
        candidateItems = candidateItems.union(itemsPerUser[user_id])
    for game_b_id in candidateItems:
        if game_b_id == game_a_id: continue
        sim = jaccard(game_a_id, game_b_id)
        similarities.append((sim, game_b_id))
    similarities.sort(reverse=True)
    return similarities[:10]

In [49]:
mostSimilarNames(mostSimilar('70', jaccard))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

In [50]:
mostSimilarNames(mostSimilarJaccardFast('70'))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Episode Two',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Deathmatch',
 'Half-Life 2',
 'Half-Life 2: Lost Coast']

## Cosine Similarity

In [51]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = Counter(owners_a)
    counter_b = Counter(owners_b)
    
    owners = owners_a.union(owners_b)
    
    return counter_a, counter_b, owners

In [52]:
def cosine(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
    
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([counter_a.get(k, 0) for k in terms])
    magB = np.array([counter_b.get(k, 0) for k in terms])
    
    
    return scipy.spatial.distance.cosine(magA, magB)

In [53]:
mostSimilarNames(mostSimilar('70', cosine))

['Attack on Pearl Harbor',
 'Simplz Zoo',
 "Grim Tales: The Bride Collector's Edition",
 'CloudBound',
 'Thick Air',
 'Microcosmum: survival of cells - Soundtrack',
 "Dark Parables: Curse of Briar Rose Collector's Edition",
 'Footbrawl Playground',
 'Eclipse: New Dawn for the Galaxy',
 'Toadled']

## Pearson Correlation

In [68]:
def valueVocab(vocabs):
    dictionary = {}
    for index, vocab in enumerate(vocabs):
        dictionary[vocab] = index
    return dictionary

In [72]:
def count(game_a_id, game_b_id):
    owners_a = usersPerItem[game_a_id]
    owners_b = usersPerItem[game_b_id]
    
    counter_a = Counter(owners_a)
    counter_b = Counter(owners_b)
    
    owners = owners_a.union(owners_b)
    owners_dict = valueVocab(owners)
    counter_a = dict((x,owners_dict[x]*y) for x,y in counter_a.items())
    counter_b = dict((x,owners_dict[x]*y) for x,y in counter_b.items())

    return counter_a, counter_b, owners

In [73]:
def pearson(game_a_id, game_b_id):
    if isEmpty(game_a_id, game_b_id):
        return 0
            
    counter_a, counter_b, terms = count(game_a_id, game_b_id)
    magA = np.array([counter_a.get(k, 0) for k in terms])
    magB = np.array([counter_b.get(k, 0) for k in terms])
    return np.corrcoef(magA, magB)[0, 1]

In [74]:
mostSimilarNames(mostSimilar('70', pearson))

['Half-Life: Blue Shift',
 'Half-Life: Opposing Force',
 'Team Fortress Classic',
 'Half-Life: Source',
 'Half-Life 2: Lost Coast',
 'Half-Life 2',
 'Half-Life 2: Deathmatch',
 'Half-Life Deathmatch: Source',
 'Half-Life 2: Episode One',
 'Half-Life 2: Episode Two']

# Similarity Playtime Estimation

We can also use the similarity-based recommender we developed above to make predictions about user's playtime. Although this is not an example of machine learning, it is a simple heuristic that can be used to estimate a user's future playtime based on their playtime in the past.

Specifically, a user's playtime for an item is assumed to be a weighted sum of their previous playtime, weighted by how similar the query item is to each of their previous purchases.

Use median playtime and mean playtime as base line percision 

In [75]:
def arrayToDict(array, keyStr):
    array_dict = {}
    for element in array:
        array_dict[element[keyStr]] = element
    return array_dict

In [76]:
users_dict = arrayToDict(users_items, 'user_id')

In [77]:
def predictPlaytime(user, game_a, sim, default):
    playtimes = []
    similarities = []
    for item in users_dict[user]['items']:
        game_b = item['item_id']
        if game_b == game_a: continue
        playtimes.append(item['playtime_forever'])
        similarities.append(sim(game_a, game_b))
        
    if (sum(similarities) > 0):
        weightedPlaytime = [(x*y) for x,y in zip(playtimes, similarities)]
        return sum(weightedPlaytime) / sum(similarities)
    else:
        return default

In [78]:
def getPlaytime(users_items):
    playtimes = []
    for user in users_items:
        for item in user['items']:
            playtimes.append(item['playtime_forever'])
            
    return playtimes

In [87]:
playtimes = getPlaytime(users_items)
meanPlaytime = sum(playtimes) / len(playtimes)
medianPlaytime = statistics.median(playtimes)

In [88]:
def markLabel(game, playtime):
    return 1 if (games_dict[game]['median_playtime'] <= playtime) else 0

In [91]:
def getPairsAndLabels(users_items):
    pairs = []
    labels = []
    for user in users_items:
        for item in user['items']:
            pairs.append((user['user_id'], item['item_id']))
            labels.append(markLabel(item['item_id'], item['playtime_forever']))
    return pairs, labels

In [96]:
pairs, labels = getPairsAndLabels(users_items)

In [104]:
pairs[:100]

[('76561197970982479', '10'),
 ('76561197970982479', '20'),
 ('76561197970982479', '30'),
 ('76561197970982479', '40'),
 ('76561197970982479', '50'),
 ('76561197970982479', '60'),
 ('76561197970982479', '70'),
 ('76561197970982479', '130'),
 ('76561197970982479', '300'),
 ('76561197970982479', '240'),
 ('76561197970982479', '3830'),
 ('76561197970982479', '2630'),
 ('76561197970982479', '3900'),
 ('76561197970982479', '34440'),
 ('76561197970982479', '3920'),
 ('76561197970982479', '6400'),
 ('76561197970982479', '6910'),
 ('76561197970982479', '7670'),
 ('76561197970982479', '409710'),
 ('76561197970982479', '220'),
 ('76561197970982479', '320'),
 ('76561197970982479', '340'),
 ('76561197970982479', '360'),
 ('76561197970982479', '380'),
 ('76561197970982479', '400'),
 ('76561197970982479', '420'),
 ('76561197970982479', '9340'),
 ('76561197970982479', '228200'),
 ('76561197970982479', '11450'),
 ('76561197970982479', '7940'),
 ('76561197970982479', '4700'),
 ('76561197970982479', '12

In [105]:
len(labels)

5153209

In [93]:
def percision(predictions, labels):
    differences = [1 if (x == y) else 0 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [98]:
meanPredictions = [markLabel(p[1], meanPlaytime) for p in pairs]

In [99]:
medianPredictions = [markLabel(p[1], medianPlaytime) for p in pairs]

In [103]:
jaccardPredictions = [markLabel(p[1], predictPlaytime(p[0], p[1], jaccard, meanPlaytime)) for p in pairs]

KeyboardInterrupt: 

In [None]:
cosinePredictions = [markLabel(p[1], predictPlaytime(p[0], p[1], cosine, meanPlaytime)) for p in pairs[:10000]]

In [None]:
pearsonPredictions = [markLabel(p[1], predictPlaytime(p[0], p[1], pearson, meanPlaytime)) for p in pairs[:10000]]

In [101]:
percision(meanPredictions, labels)

0.327922077292033

In [102]:
percision(medianPredictions, labels)

0.5554179929438142

In [None]:
percision(jaccardPredictions, labels)

In [None]:
percision()