In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
from sklearn.metrics import balanced_accuracy_score

In [2]:
# def readJSON(path):
#     for l in gzip.open(path, 'rt'):
#         d = eval(l)
#         u = d['userID']
#         try:
#             g = d['gameID']
#         except Exception as e:
#             g = None
#         yield u,g,d
        
def extract(line):
    d = eval(line)
    u = d['userID']
    try:
        g = d['gameID']
    except Exception as e:
        g = None
    return u,g,d

dataset = [extract(l) for l in gzip.open('train.json.gz', 'rt')]

In [3]:
len(dataset)

175000

In [4]:
training_data = dataset[:165000]
validation_data = dataset[165000:]

In [5]:
usersPerGame = defaultdict(set)
gamesPerUser = defaultdict(set)

for user,game,d in training_data:
    usersPerGame[game].add(user)
    gamesPerUser[user].add(game)

In [6]:
gamesPerUser['u36851597']

{'b06111821',
 'b08997406',
 'b09870670',
 'b20315259',
 'b22054824',
 'b22472780',
 'b26574228',
 'b28735043',
 'b31566356',
 'b35108009',
 'b41908564',
 'b47439123',
 'b54071831',
 'b65643011',
 'b70025240',
 'b70454867',
 'b70704558',
 'b71758149',
 'b74265892',
 'b75155505',
 'b77184292',
 'b82987255',
 'b84742953',
 'b86131432',
 'b87439597',
 'b90353136',
 'b93370213',
 'b96697924',
 'b97704723'}

In [7]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [8]:
def mostSimilar(i, n):
    similarities = []
    users = usersPerGame[i]
    candidateGames = set()
    for u in users:
        candidateGames = candidateGames.union(gamesPerUser[u])
    for i2 in candidateGames:
        if i2 == i: continue
        sim = Jaccard(users, usersPerGame[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:n]

In [9]:
query = 'b30237103'
mostSimilar(query, 20)

[(0.05496828752642706, 'b33355095'),
 (0.05311355311355311, 'b65718005'),
 (0.052704576976421634, 'b12882927'),
 (0.051655629139072845, 'b24479894'),
 (0.04664723032069971, 'b81322639'),
 (0.04655172413793104, 'b55199118'),
 (0.045627376425855515, 'b05337263'),
 (0.04491725768321513, 'b49167324'),
 (0.044709388971684055, 'b66296493'),
 (0.044009779951100246, 'b32709027'),
 (0.04321728691476591, 'b34765006'),
 (0.04295942720763723, 'b61085487'),
 (0.042682926829268296, 'b08114745'),
 (0.0425, 'b05996210'),
 (0.04217687074829932, 'b50099026'),
 (0.0420017873100983, 'b14306623'),
 (0.04173106646058733, 'b37544219'),
 (0.04130808950086059, 'b35341763'),
 (0.04081632653061224, 'b58310300'),
 (0.04068522483940043, 'b90883081')]

In [10]:
def userPlayedSimilar(user, game, n):
    similarGames = mostSimilar(game, n)
    for jac, g in similarGames:
        if g in gamesPerUser[user]:
            return True
    return False

In [11]:
userPlayedSimilar('u42434461', 'b91625775', 70) # was true for baseline

True

In [12]:
gamesPerUser_all = defaultdict(set)
gameCount = defaultdict(int)
totalPlayed = 0

# Create false pairs
all_games_set = set()
all_users_set = set()

for u,g,d in dataset:
    all_games_set.add(g)
    all_users_set.add(u)
    gamesPerUser_all[u].add(g)
    gameCount[g] += 1
    totalPlayed += 1

all_games = list(all_games_set)
all_users = list(all_users_set)
false_pairs = []
    
random.shuffle(all_games)
random.shuffle(all_users)
for g in all_games:
    if len(false_pairs) > 5000:
        break
    for u in all_users:
        if g not in gamesPerUser_all[u]:
            false_pairs.append((u,g))
            break
            
len(all_games)

2437

In [13]:
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2: break

In [14]:
def predict(user, game, popular, n):
    if game in popular:
        if userPlayedSimilar(user, game, 300):
            return 1
        return 0
    elif userPlayedSimilar(user, game, n):
        return 1
    return 0

In [15]:
return2 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return2.add(i)
    if count > totalPlayed/2: break

pred1 = []
actual1 = []
for user, game, d in validation_data:
    pred1.append(game in return2)
    actual1.append(True)

TP_ratio = sum(pred1) / len(pred1)

pred2 = []
actual2 = []
for user, game in false_pairs:
    pred2.append(game in return2)
    actual2.append(False)
    
TN_ratio = sum(pred2) / len(pred2)

baseline = pred1 + pred2
actual = actual1 + actual2

print(balanced_accuracy_score(actual, baseline), TP_ratio, TN_ratio)

0.6805852277390234 0.5052 0.1440295445219532


In [16]:
len(return1)

351

In [17]:
N = 20
pop = return1
# pop = set()

count1 = 0
count2 = 0
small_set = True

pred1 = []
actual1 = []
for user, game, d in validation_data:
    if small_set and count1 > 1000:
        break
    count1 += 1
    pred1.append(predict(user, game, pop, N))
    actual1.append(1)

TP_ratio = sum(pred1) / len(pred1)

pred2 = []
actual2 = []
for user, game in false_pairs:
    if small_set and count2 > 250:
        break
    count2 += 1
    pred2.append(predict(user, game, pop, N))
    actual2.append(0)
    
TN_ratio = sum(pred2) / len(pred2)

actual = actual1 + actual2
pred = pred1 + pred2
print(balanced_accuracy_score(actual, pred), TP_ratio, TN_ratio)

0.7026678500782086 0.6523476523476524 0.24701195219123506


0.6745067681322661 0.8031968031968032 0.4541832669322709

In [None]:
predictions = open("predictions_Played.csv", 'w')
for l in open("pairs_Played.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,g = l.strip().split('-')
    result = predict(u,g,pop,N)
    predictions.write(u + '-' + g + "," + str(result) + "\n")

predictions.close()