In [54]:
from collections import OrderedDict
from collections import Counter
import numpy as np
import pandas
import heapq

In [55]:
visit_popularity = Counter()
purchase_popularity = Counter()

with open('train.txt', 'r') as f:
    for line in f.xreadlines():
        visits, purchases = line.strip().split(';')
        visits    = [int(s) for s in visits.split(',')]
        if purchases:
            purchases = [int(s) for s in purchases.split(',')]
        for num in visits:
            visit_popularity[num] += 1
        for num in purchases:
            purchase_popularity[num] += 1
all_visits = sum(visit_popularity.values())
all_purchases = sum(purchase_popularity.values())
print all_visits, all_purchases
for num in visit_popularity:
    visit_popularity[num] = visit_popularity[num] / float(all_visits)
for num in purchase_popularity:
    purchase_popularity[num] = purchase_popularity[num] / float(all_purchases)

356177 5374


In [56]:
def recommend_by_purchase(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), key=lambda x: purchase_popularity.get(x, 0))

def recommend_by_visit(items, max_count):
    return heapq.nlargest(max_count, OrderedDict.fromkeys(items), key=lambda x: visit_popularity.get(x, 0))

In [57]:
def calculate_metrics(recommend, max_count, sessions_file):
    with open(sessions_file, 'r') as f:
        avg_recall = np.zeros(max_count)
        avg_precision = np.zeros(max_count)
        sessions_count = 0
        for line in f.xreadlines():
            visits, purchases = line.strip().split(';')
            if purchases != '':
                visits = visits.split(',')
                purchases = purchases.split(',')
                rec = recommend(visits, max_count)
                k = len(rec)
                intersection = list(set(rec) & set(purchases))
                avg_precision[k - 1] += (len(intersection)/float(k))
                avg_recall[k - 1]    += (len(intersection)/float(len(purchases)))
                sessions_count += 1
   
    return pandas.DataFrame({
            'k': np.arange(max_count) + 1,
            'avg_recall@k': [round(x, 2) for x in avg_recall / sessions_count],
            'avg_precision@k': [round(x, 2) for x in avg_precision / sessions_count]
    }).set_index('k')


In [58]:
calculate_metrics(recommend_by_purchase, 5, 'train.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.19
2,0.09,0.17
3,0.05,0.11
4,0.03,0.09
5,0.08,0.25


In [59]:
calculate_metrics(recommend_by_purchase, 5, 'test.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.19
2,0.1,0.18
3,0.05,0.11
4,0.03,0.08
5,0.08,0.26


In [60]:
calculate_metrics(recommend_by_visit, 5, 'train.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.19
2,0.09,0.17
3,0.05,0.11
4,0.03,0.09
5,0.08,0.25


In [61]:
calculate_metrics(recommend_by_visit, 5, 'test.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.19
2,0.1,0.18
3,0.05,0.11
4,0.03,0.08
5,0.08,0.26
