In [1]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

In [2]:
# Read both session logs: strip surrounding whitespace from every line and
# keep only the lines that are non-empty afterwards.
with open('coursera_sessions_train.txt') as train_file:
    raw_train_data = [line for line in map(str.strip, train_file) if line]

with open('coursera_sessions_test.txt') as test_file:
    raw_test_data = [line for line in map(str.strip, test_file) if line]

In [3]:
def _parse_id_list(field):
    """Convert a comma-separated string of item ids into a list of ints.

    Empty pieces (e.g. from an empty field) are skipped.
    """
    return [int(token) for token in field.split(',') if token]


def _parse_session(line):
    """Split one 'viewed;bought' line into a (viewed_ids, bought_ids) tuple.

    Raises ValueError if the line does not contain exactly one ';'.
    """
    viewed_field, bought_field = line.split(';')
    return _parse_id_list(viewed_field), _parse_id_list(bought_field)


# Each session becomes a pair of id lists: (items viewed, items bought).
train_data = [_parse_session(line) for line in raw_train_data]
test_data = [_parse_session(line) for line in raw_test_data]

In [4]:
# Sanity check: number of parsed training sessions.
len(train_data)

50000

In [5]:
# Item popularity over the training sessions: how many times each item id
# occurs in the viewed lists and in the bought lists. Repeats inside a single
# session are each counted.
seen_counts = defaultdict(int)
bought_counts = defaultdict(int)

# Total number of viewed / bought entries across all training sessions.
seen_len = 0
bought_len = 0

for viewed_items, purchased_items in train_data:
    for item_ids, counts in (
        (viewed_items, seen_counts),
        (purchased_items, bought_counts),
    ):
        for item_id in item_ids:
            counts[item_id] += 1

    seen_len += len(viewed_items)
    bought_len += len(purchased_items)

In [6]:
# Number of distinct item ids currently stored as keys of seen_counts.
len(seen_counts)

77064

In [7]:
# Number of distinct item ids currently stored as keys of bought_counts.
len(bought_counts)

4479

In [8]:
# max() over a dict iterates its keys, so this is the largest viewed item id,
# not the largest view count.
max(seen_counts)

102806

In [9]:
# max() over a dict iterates its keys, so this is the largest bought item id,
# not the largest purchase count.
max(bought_counts)

102646

In [19]:
def seen_key(x):
    """Sort key: how many times item ``x`` was viewed in the training sessions.

    Returns 0 for an id that has no entry in ``seen_counts``.

    The lookup goes through ``.get`` on purpose: ``seen_counts`` is a
    ``defaultdict``, and plain indexing would insert every unknown id as a
    new zero entry, silently growing the table while recommendations are
    being ranked.
    """
    return seen_counts.get(x, 0)

def bought_key(x):
    """Sort key: how many times item ``x`` was bought in the training sessions.

    Returns 0 for an id that has no entry in ``bought_counts``.

    The lookup goes through ``.get`` on purpose: ``bought_counts`` is a
    ``defaultdict``, and plain indexing would insert every viewed-but-never-
    bought id as a new zero entry, silently growing the table while
    recommendations are being ranked.
    """
    return bought_counts.get(x, 0)

def unique(x):
    """Return the distinct elements of ``x`` as a list, in first-occurrence order.

    Parameters
    ----------
    x : iterable
        Elements to de-duplicate.

    Returns
    -------
    list
        Each distinct element once, ordered by where it first appeared.
    """
    items = list(x)
    try:
        # dict keys keep insertion order, giving an order-preserving
        # de-duplication in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to an equality scan.
        res = []
        for y in items:
            if y not in res:
                res.append(y)
        return res

def precision(rec, bought, k):
    """Precision@k: share of the k recommendation slots holding a bought item.

    Parameters
    ----------
    rec : sequence
        Ranked recommendations, best first. Only ``rec[:k]`` is considered.
    bought : collection
        Items bought in the session; may contain repeated ids.
    k : int
        Cut-off. The denominator is always ``k``, even when ``rec`` holds
        fewer than ``k`` items.

    Returns
    -------
    float
        ``hits / k``, where ``hits`` is the number of positions in
        ``rec[:k]`` whose item occurs in ``bought``.

    Raises
    ------
    ZeroDivisionError
        If ``k`` is 0.
    """
    top_k = rec[:k]
    # Count hits per recommended slot rather than per purchase: an item that
    # was bought several times must not be counted more than once, otherwise
    # the result could exceed 1.
    hits = sum(1 for item in top_k if item in bought)
    return hits / k

def recall(rec, bought, k):
    """Recall@k: fraction of the entries of ``bought`` found in ``rec[:k]``.

    Every entry of ``bought`` is checked separately, so a repeated id counts
    once per occurrence in both the numerator and the denominator.

    Returns the integer 0 when ``bought`` is empty.
    """
    total = len(bought)
    if total == 0:
        return 0

    top_k = rec[:k]
    hits = sum(1 for item in bought if item in top_k)
    return hits / total

# Evaluate both rankings on the training sessions.
# Running totals of the per-session metrics, one per (metric, k, ranking).
train_precision1_sum_seen = train_precision5_sum_seen = 0
train_recall1_sum_seen = train_recall5_sum_seen = 0
train_precision1_sum_bought = train_precision5_sum_bought = 0
train_recall1_sum_bought = train_recall5_sum_bought = 0

sess_count = 0  # sessions that contain at least one purchase
for viewed, purchased in train_data:
    if purchased:
        sess_count += 1

        # Candidates are the distinct viewed items; each ranking orders them
        # by decreasing popularity (view counts vs. purchase counts).
        candidates = unique(viewed)
        rec_seen = sorted(candidates, key=seen_key, reverse=True)
        rec_bought = sorted(candidates, key=bought_key, reverse=True)

        # Ranking by view popularity.
        train_precision1_sum_seen += precision(rec_seen, purchased, 1)
        train_precision5_sum_seen += precision(rec_seen, purchased, 5)
        train_recall1_sum_seen += recall(rec_seen, purchased, 1)
        train_recall5_sum_seen += recall(rec_seen, purchased, 5)

        # Ranking by purchase popularity.
        train_precision1_sum_bought += precision(rec_bought, purchased, 1)
        train_precision5_sum_bought += precision(rec_bought, purchased, 5)
        train_recall1_sum_bought += recall(rec_bought, purchased, 1)
        train_recall5_sum_bought += recall(rec_bought, purchased, 5)

# Averages over the sessions with purchases.
train_precision1_avg_seen = train_precision1_sum_seen / sess_count
train_precision5_avg_seen = train_precision5_sum_seen / sess_count
train_recall1_avg_seen = train_recall1_sum_seen / sess_count
train_recall5_avg_seen = train_recall5_sum_seen / sess_count

train_precision1_avg_bought = train_precision1_sum_bought / sess_count
train_precision5_avg_bought = train_precision5_sum_bought / sess_count
train_recall1_avg_bought = train_recall1_sum_bought / sess_count
train_recall5_avg_bought = train_recall5_sum_bought / sess_count

In [20]:
# Answer 1: training-set metrics for the ranking by view popularity,
# written as "recall@1 precision@1 recall@5 precision@5", rounded to 2 places.
with open('recommendations.answer1.txt', 'w') as fp:
    answer_values = (
        train_recall1_avg_seen,
        train_precision1_avg_seen,
        train_recall5_avg_seen,
        train_precision5_avg_seen,
    )
    fp.write(' '.join(str(round(value, 2)) for value in answer_values))

In [25]:
# Answer 3: training-set metrics for the ranking by purchase popularity,
# written as "recall@1 precision@1 recall@5 precision@5", rounded to 2 places.
with open('recommendations.answer3.txt', 'w') as fp:
    answer_values = (
        train_recall1_avg_bought,
        train_precision1_avg_bought,
        train_recall5_avg_bought,
        train_precision5_avg_bought,
    )
    fp.write(' '.join(str(round(value, 2)) for value in answer_values))

In [26]:
# Evaluate both rankings on the test sessions; popularity counts still come
# from the training data via seen_key / bought_key.
# Running totals of the per-session metrics, one per (metric, k, ranking).
test_precision1_sum_seen = test_precision5_sum_seen = 0
test_recall1_sum_seen = test_recall5_sum_seen = 0
test_precision1_sum_bought = test_precision5_sum_bought = 0
test_recall1_sum_bought = test_recall5_sum_bought = 0

sess_count = 0  # sessions that contain at least one purchase
for viewed, purchased in test_data:
    if purchased:
        sess_count += 1

        # Candidates are the distinct viewed items; each ranking orders them
        # by decreasing popularity (view counts vs. purchase counts).
        candidates = unique(viewed)
        rec_seen = sorted(candidates, key=seen_key, reverse=True)
        rec_bought = sorted(candidates, key=bought_key, reverse=True)

        # Ranking by view popularity.
        test_precision1_sum_seen += precision(rec_seen, purchased, 1)
        test_precision5_sum_seen += precision(rec_seen, purchased, 5)
        test_recall1_sum_seen += recall(rec_seen, purchased, 1)
        test_recall5_sum_seen += recall(rec_seen, purchased, 5)

        # Ranking by purchase popularity.
        test_precision1_sum_bought += precision(rec_bought, purchased, 1)
        test_precision5_sum_bought += precision(rec_bought, purchased, 5)
        test_recall1_sum_bought += recall(rec_bought, purchased, 1)
        test_recall5_sum_bought += recall(rec_bought, purchased, 5)

# Averages over the sessions with purchases.
test_precision1_avg_seen = test_precision1_sum_seen / sess_count
test_precision5_avg_seen = test_precision5_sum_seen / sess_count
test_recall1_avg_seen = test_recall1_sum_seen / sess_count
test_recall5_avg_seen = test_recall5_sum_seen / sess_count

test_precision1_avg_bought = test_precision1_sum_bought / sess_count
test_precision5_avg_bought = test_precision5_sum_bought / sess_count
test_recall1_avg_bought = test_recall1_sum_bought / sess_count
test_recall5_avg_bought = test_recall5_sum_bought / sess_count

In [27]:
# Answer 2: test-set metrics for the ranking by view popularity,
# written as "recall@1 precision@1 recall@5 precision@5", rounded to 2 places.
with open('recommendations.answer2.txt', 'w') as fp:
    answer_values = (
        test_recall1_avg_seen,
        test_precision1_avg_seen,
        test_recall5_avg_seen,
        test_precision5_avg_seen,
    )
    fp.write(' '.join(str(round(value, 2)) for value in answer_values))

In [28]:
# Answer 4: test-set metrics for the ranking by purchase popularity,
# written as "recall@1 precision@1 recall@5 precision@5", rounded to 2 places.
with open('recommendations.answer4.txt', 'w') as fp:
    answer_values = (
        test_recall1_avg_bought,
        test_precision1_avg_bought,
        test_recall5_avg_bought,
        test_precision5_avg_bought,
    )
    fp.write(' '.join(str(round(value, 2)) for value in answer_values))