In [5]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
from sklearn.metrics import balanced_accuracy_score

In [6]:
# def readJSON(path):
#     for l in gzip.open(path, 'rt'):
#         d = eval(l)
#         u = d['userID']
#         try:
#             g = d['gameID']
#         except Exception as e:
#             g = None
#         yield u,g,d
        
def extract(line):
    """Parse one text line of the dataset into a ``(user, game, record)`` tuple.

    Parameters
    ----------
    line : str
        One line of the data file, holding a Python dict literal.

    Returns
    -------
    tuple
        ``(d['userID'], d.get('gameID'), d)`` where ``d`` is the parsed record.
        The game entry is ``None`` when the record has no ``'gameID'`` key.

    Raises
    ------
    KeyError
        If the record has no ``'userID'`` key.
    """
    # NOTE(review): eval() executes whatever expression the line contains.
    # Only run this on trusted data files; if every line is a plain literal,
    # ast.literal_eval would be a safer parser.
    d = eval(line)
    u = d['userID']
    # A missing 'gameID' is tolerated and reported as None.
    g = d.get('gameID')
    return u, g, d

# Load every record eagerly. The context manager guarantees the gzip handle
# is closed once the list has been built (or if parsing fails part-way).
with gzip.open('train.json.gz', 'rt') as f:
    dataset = [extract(l) for l in f]

In [7]:
# Sanity check: number of records parsed from train.json.gz.
len(dataset)

175000

In [8]:
# Positional hold-out split: the first 165,000 records are used for training,
# everything after that for validation.
training_data, validation_data = dataset[:165000], dataset[165000:]

In [12]:
# Lookup tables built from the training split only:
#   reviewsPerUser / reviewsPerItem -> full records, keyed by user / by game
#   gamesPerUser  / usersPerGame    -> membership sets, keyed by user / by game
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)
usersPerGame, gamesPerUser = defaultdict(set), defaultdict(set)

for user, game, d in training_data:
    # Membership sets first, then the record lists; the four updates are
    # independent of one another.
    gamesPerUser[user].add(game)
    usersPerGame[game].add(user)
    reviewsPerItem[game].append(d)
    reviewsPerUser[user].append(d)

In [13]:
# Spot check: the set of games recorded for one user in the training split.
gamesPerUser['u36851597']

{'b06111821',
 'b08997406',
 'b09870670',
 'b20315259',
 'b22054824',
 'b22472780',
 'b26574228',
 'b28735043',
 'b31566356',
 'b35108009',
 'b41908564',
 'b47439123',
 'b54071831',
 'b65643011',
 'b70025240',
 'b70454867',
 'b70704558',
 'b71758149',
 'b74265892',
 'b75155505',
 'b77184292',
 'b82987255',
 'b84742953',
 'b86131432',
 'b87439597',
 'b90353136',
 'b93370213',
 'b96697924',
 'b97704723'}

In [14]:
# Whole-dataset interaction statistics plus sampled negative ("not played")
# (user, game) pairs used to evaluate the played/not-played predictors.
MAX_FALSE_PAIRS = 5000        # upper bound on the number of negative pairs
NEGATIVE_SAMPLING_SEED = 0    # fixed seed so the negative set is reproducible

gamesPerUser_all = defaultdict(set)
gameCount = defaultdict(int)
totalPlayed = 0

all_games_set = set()
all_users_set = set()

for u,g,d in dataset:
    all_games_set.add(g)
    all_users_set.add(u)
    gamesPerUser_all[u].add(g)
    gameCount[g] += 1
    totalPlayed += 1

# Sort before shuffling: iteration order of a set of strings can change from
# one interpreter run to the next, so a seed alone would not make the shuffle
# reproducible. key=str keeps a possible None game id sortable.
all_games = sorted(all_games_set, key=str)
all_users = sorted(all_users_set, key=str)

_neg_rng = random.Random(NEGATIVE_SAMPLING_SEED)
_neg_rng.shuffle(all_games)
_neg_rng.shuffle(all_users)

# Create false pairs: at most one negative per game. The user scan starts at
# a random offset for every game; always scanning from the front would pair
# almost every game with the same first user who has not played it.
false_pairs = []
for g in all_games:
    if len(false_pairs) >= MAX_FALSE_PAIRS:
        break
    start = _neg_rng.randrange(len(all_users))
    for offset in range(len(all_users)):
        u = all_users[(start + offset) % len(all_users)]
        if g not in gamesPerUser_all[u]:
            false_pairs.append((u,g))
            break

len(all_games)

2437

In [82]:
# (play count, game) pairs ordered from most to least played.
mostPopular = sorted(((gameCount[x], x) for x in gameCount), reverse=True)

# return1: the most popular games, taken in order until their combined play
# count exceeds half of all plays.
return1 = set()
count = 0
for ic, i in mostPopular:
    return1.add(i)
    count += ic
    if count > totalPlayed/2:
        break

In [16]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity ``|s1 & s2| / |s1 | s2|`` of two sets.

    Two empty sets have an empty union; in that case 0 is returned instead of
    raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    numer = len(s1.intersection(s2))
    return numer / denom

In [17]:
def Cosine(i1, i2):
    """Return the set-based cosine similarity between two games.

    Each game is represented by the set of users in ``usersPerGame``; the
    similarity is ``|U1 & U2| / sqrt(|U1| * |U2|)``.

    Parameters
    ----------
    i1, i2 : hashable
        Game identifiers used as keys of ``usersPerGame``.

    Returns
    -------
    float
        The similarity, or 0.0 when either game has no recorded users.
    """
    # .get() so that querying an unseen game does not insert an empty entry
    # into the defaultdict.
    s1 = usersPerGame.get(i1, set())
    s2 = usersPerGame.get(i2, set())
    denom = np.sqrt(len(s1) * len(s2))
    if denom == 0:
        return 0.0
    inter = s1.intersection(s2)
    # Convert to a plain float so callers get ordinary Python numbers/bools.
    return float(len(inter) / denom)

In [18]:
def predictRatingCos(user,item):
    """Predict a rating for ``(user, item)`` from cosine-weighted item neighbours.

    The prediction is the item's mean rating plus the similarity-weighted
    average of the user's rating deviations on the other items they reviewed
    (each deviation is measured against that item's own mean rating).

    NOTE(review): assumes every record carries a numeric ``'rating'`` field
    and identifies its item under ``'gameID'`` (the key ``extract`` reads) --
    confirm against the data schema.

    Returns
    -------
    float or None
        The predicted rating; the item's mean rating when none of the user's
        other items has positive total similarity to ``item``; ``None`` when
        ``item`` has no reviews at all, so no mean can be computed.
    """
    # .get() avoids inserting empty entries into the defaultdicts.
    i1_review = reviewsPerItem.get(item, [])
    if not i1_review:
        return None
    i1_mean = sum(r['rating'] for r in i1_review) / len(i1_review)

    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, []):
        i2 = d.get('gameID')
        if i2 == item: continue
        i2_review = reviewsPerItem.get(i2, [])
        if not i2_review: continue
        i2_mean = sum(r['rating'] for r in i2_review) / len(i2_review)
        ratings.append(d['rating'] - i2_mean)
        similarities.append(Cosine(item, i2))

    total_sim = sum(similarities)
    if total_sim > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / total_sim + i1_mean
    # User hasn't rated any similar items: fall back to the item's mean.
    return i1_mean

In [88]:
# Share of all plays, scaled by 50, for the game at index 200 of mostPopular
# (the 201st entry of the descending popularity ranking).
gameCount[mostPopular[200][1]] * 50 / totalPlayed

0.05342857142857143

In [160]:
# Popularity score per game: the game's share of all plays, scaled by 50.
# Being a defaultdict(float), a game with no recorded plays looks up as 0.0.
popularity = defaultdict(float)
for g, plays in gameCount.items():
    popularity[g] = plays * 50 / totalPlayed

In [142]:
def predict(user, game, jacval):
    """Predict whether ``user`` played ``game``.

    For every other game in ``gamesPerUser[user]`` the score is
    ``Cosine(game, other) + popularity[game]``; the prediction is positive
    when the mean of those scores exceeds ``jacval``.

    Parameters
    ----------
    user, game : hashable
        Identifiers used as keys of ``gamesPerUser`` / ``popularity``.
    jacval : float
        Decision threshold applied to the mean score.

    Returns
    -------
    bool
        ``True`` if the mean score is above the threshold. ``False`` when the
        user has no other games to compare against.
    """
    # .get() keeps lookups for unseen users/games from inserting empty
    # entries into the defaultdicts.
    played = gamesPerUser.get(user, ())
    bonus = popularity.get(game, 0.0)
    similarities = [Cosine(game, i2) + bonus for i2 in played if i2 != game]
    if not similarities:
        return False
    avgSim = sum(similarities) / len(similarities)
    return bool(avgSim > jacval)

In [143]:
# Popularity baseline: predict "played" exactly when the game belongs to the
# set of most popular games that together cover more than half of all plays.
return2 = set()
count = 0
for ic, i in mostPopular:
    return2.add(i)
    count += ic
    if count > totalPlayed/2:
        break

# Positive examples: every validation pair is a real interaction.
pred1 = [game in return2 for user, game, d in validation_data]
actual1 = [True] * len(pred1)
TP_ratio = sum(pred1) / len(pred1)

# Negative examples: the sampled not-played pairs.
pred2 = [game in return2 for user, game in false_pairs]
actual2 = [False] * len(pred2)
# Note: despite its name, TN_ratio is the fraction of negative pairs that
# were predicted as played (True predictions over all negatives).
TN_ratio = sum(pred2) / len(pred2)

baseline = pred1 + pred2
actual = actual1 + actual2

print(balanced_accuracy_score(actual, baseline), TP_ratio, TN_ratio)

0.6805852277390234 0.5052 0.1440295445219532


In [144]:
# Number of top-ranked games needed to cover more than half of all plays.
len(return1)

351

In [172]:
# Evaluate the similarity-plus-popularity predictor at threshold `jac`.
# N and pop are assigned here but not read anywhere else in this cell.
N = 20
pop = return1

small_set = True   # evaluate on a prefix of each example list to keep it quick
jac = 0.027        # decision threshold passed to predict()

# Positives: the first 2001 validation pairs when small_set is on, else all.
positives = validation_data[:2001] if small_set else validation_data
pred1 = [predict(user, game, jac) for user, game, d in positives]
actual1 = [True] * len(pred1)
count1 = len(pred1)
TP_ratio = sum(pred1) / len(pred1)

# Negatives: the first 501 sampled false pairs when small_set is on, else all.
negatives = false_pairs[:501] if small_set else false_pairs
pred2 = [predict(user, game, jac) for user, game in negatives]
actual2 = [False] * len(pred2)
count2 = len(pred2)
# Note: despite its name, TN_ratio is the fraction of negative pairs that
# were predicted as played.
TN_ratio = sum(pred2) / len(pred2)

actual = actual1 + actual2
pred = pred1 + pred2
print(balanced_accuracy_score(actual, pred), TP_ratio, TN_ratio)

0.7156212313005175 0.736631684157921 0.30538922155688625


0.6745067681322661 0.8031968031968032 0.4541832669322709

In [173]:
# Write one played/not-played prediction per line of pairs_Played.txt.
# Both files are managed by `with`, so they are closed (and the output is
# flushed) even if a line fails to parse or predict() raises.
with open("pairs_Played.txt") as pairs, open("predictions_Played.csv", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        u,g = l.strip().split('-')
        result = 1 if predict(u,g,jac) else 0
        predictions.write(u + '-' + g + "," + str(result) + "\n")