In [323]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
from sklearn.metrics import balanced_accuracy_score

In [324]:
# def readJSON(path):
#     for l in gzip.open(path, 'rt'):
#         d = eval(l)
#         u = d['userID']
#         try:
#             g = d['gameID']
#         except Exception as e:
#             g = None
#         yield u,g,d
        
def extract(line):
    d = eval(line)
    u = d['userID']
    try:
        g = d['gameID']
    except Exception as e:
        g = None
    return u,g,d

dataset = [extract(l) for l in gzip.open('train.json.gz', 'rt')]

random.shuffle(dataset)

In [325]:
len(dataset)

175000

In [326]:
training_data = dataset[:165000]
validation_data = dataset[165000:]

In [327]:
usersPerGame = defaultdict(set)
gamesPerUser = defaultdict(set)

for user,game,d in training_data:
    usersPerGame[game].add(user)
    gamesPerUser[user].add(game)

In [328]:
gamesPerUser['u36851597']

{'b06111821',
 'b09870670',
 'b20315259',
 'b22054824',
 'b22472780',
 'b26574228',
 'b28735043',
 'b31566356',
 'b35108009',
 'b36105300',
 'b41908564',
 'b47439123',
 'b54071831',
 'b65643011',
 'b70025240',
 'b70454867',
 'b70704558',
 'b74265892',
 'b75155505',
 'b77184292',
 'b82987255',
 'b84742953',
 'b90353136',
 'b93370213',
 'b96697924',
 'b97704723'}

In [329]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [330]:
def mostSimilar(i, n):
    similarities = []
    users = usersPerGame[i]
    candidateGames = set()
    for u in users:
        candidateGames = candidateGames.union(gamesPerUser[u])
    for i2 in candidateGames:
        if i2 == i: continue
        sim = Jaccard(users, usersPerGame[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:n]

In [331]:
query = 'b30237103'
mostSimilar(query, 20)

[(0.05647382920110193, 'b12882927'),
 (0.05394190871369295, 'b33355095'),
 (0.05137614678899083, 'b65718005'),
 (0.04985337243401759, 'b81322639'),
 (0.047556142668428, 'b24479894'),
 (0.04716981132075472, 'b61085487'),
 (0.04655172413793104, 'b55199118'),
 (0.04568527918781726, 'b35341763'),
 (0.04460966542750929, 'b05337263'),
 (0.043373493975903614, 'b08114745'),
 (0.043076923076923075, 'b37544219'),
 (0.0427807486631016, 'b21645341'),
 (0.042600896860986545, 'b98464326'),
 (0.04221954161640531, 'b34765006'),
 (0.04220779220779221, 'b19304591'),
 (0.04215851602023609, 'b20439501'),
 (0.04215116279069767, 'b66296493'),
 (0.04200913242009133, 'b69475426'),
 (0.04161073825503356, 'b50099026'),
 (0.04152823920265781, 'b58310300')]

In [332]:
def userPlayedSimilar(user, game, n):
    similarGames = mostSimilar(game, n)
    for jac, g in similarGames:
        if g in gamesPerUser[user]:
            return True
    return False

In [333]:
userPlayedSimilar('u42434461', 'b91625775', 70) # was true for baseline

True

In [334]:
gamesPerUser_all = defaultdict(set)
gameCount = defaultdict(int)
totalPlayed = 0

# Create false pairs
all_games_set = set()
all_users_set = set()

for u,g,d in dataset:
    all_games_set.add(g)
    all_users_set.add(u)
    gamesPerUser_all[u].add(g)
    gameCount[g] += 1
    totalPlayed += 1

all_games = list(all_games_set)
all_users = list(all_users_set)
false_pairs = []
    
random.shuffle(all_games)
random.shuffle(all_users)
for g in all_games:
    if len(false_pairs) > 5000:
        break
    for u in all_users:
        if g not in gamesPerUser_all[u]:
            false_pairs.append((u,g))
            break
            
len(all_games)

2437

In [347]:
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2: break

In [362]:
def predict(user, game, popular, n):
    if game in popular:
        if userPlayedSimilar(user, game, 3*n):
            return 1
        return 0
    elif userPlayedSimilar(user, game, n):
        return 1
    return 0

In [342]:
return2 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return2.add(i)
    if count > totalPlayed/2: break

pred1 = []
actual1 = []
for user, game, d in validation_data:
    pred1.append(game in return2)
    actual1.append(True)

TP_ratio = sum(pred1) / len(pred1)

pred2 = []
actual2 = []
for user, game in false_pairs:
    pred2.append(game in return2)
    actual2.append(False)
    
TN_ratio = sum(pred2) / len(pred2)

baseline = pred1 + pred2
actual = actual1 + actual2

print(balanced_accuracy_score(actual, baseline, adjusted=True), TP_ratio, TN_ratio)

0.35477045547804664 0.4988 0.1440295445219532


In [348]:
len(return1)

351

In [368]:
N = 70
pop = return1
# pop = set()

count1 = 0
count2 = 0
small_set = False

pred1 = []
actual1 = []
for user, game, d in validation_data:
    if small_set and count1 > 1000:
        break
    count1 += 1
    pred1.append(predict(user, game, pop, N))
    actual1.append(1)

TP_ratio = sum(pred1) / len(pred1)

pred2 = []
actual2 = []
for user, game in false_pairs:
    if small_set and count2 > 250:
        break
    count2 += 1
    pred2.append(predict(user, game, pop, N))
    actual2.append(0)
    
TN_ratio = sum(pred2) / len(pred2)

actual = actual1 + actual2
pred = pred1 + pred2
print(balanced_accuracy_score(actual, pred), TP_ratio, TN_ratio)

0.7629323758719737 0.7852 0.25933524825605253


In [None]:
0.7609601148953631 0.8108 0.2888797702092737

In [246]:
predictions = open("predictions_Played.csv", 'w')
for l in open("pairs_Played.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,g = l.strip().split('-')
    result = predict(u,g,pop,N)
    predictions.write(u + '-' + g + "," + str(result) + "\n")

predictions.close()

In [None]:
### Category prediction baseline: Just consider some of the most common words from each category

catDict = {
    "Action": 0,
    "Strategy": 1,
    "RPG": 2,
    "Adventure": 3,
    "Sport": 4
}

predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewID,prediction\n")
for u,_,d in readJSON("test_Category.json.gz"):
    cat = catDict['Action'] # If there's no evidence, just choose the most common category in the dataset
    words = d['text'].lower()
    if 'strategy' in words:
        cat = catDict['Strategy']
    if 'rpg' in words:
        cat = catDict['RPG']
    if 'adventure' in words:
        cat = catDict['Adventure']
    if 'sport' in words:
        cat = catDict['Sport']
    predictions.write(u + '-' + d['reviewID'] + "," + str(cat) + "\n")

predictions.close()
