In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from typing import List
from tqdm import tqdm

from RandomSplit import RandomSplit
from TopPopular import ModifiedTopPopular
from metrics import ndcg_metric, dcg, recall_metric, evaluate_recommender, get_metrics

%matplotlib inline

### MovieLens-100k

In [2]:
# Load the MovieLens-100k ratings, rename the id columns to the snake_case names
# used by the rest of the notebook, and parse the epoch-second timestamps.
df = (
    pd.read_csv('data/ML_100k.csv')
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
    .assign(timestamp=lambda ratings: pd.to_datetime(ratings['timestamp'], unit='s'))
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,2009-12-14 02:52:24
1,1,1029,3.0,2009-12-14 02:52:59
2,1,1061,3.0,2009-12-14 02:53:02
3,1,1129,2.0,2009-12-14 02:53:05
4,1,1172,4.0,2009-12-14 02:53:25


In [15]:
# Partition the ratings frame into train / validation / test frames.
# NOTE(review): RandomSplit is a project-local class; whether the split is seeded
# (i.e. reproducible across runs) cannot be told from this notebook — confirm.
splitter = RandomSplit(test_fraction=0.1)
train_df, valid_df, test_df = splitter(df)

In [16]:
# Collapse each split to one row per user holding that user's
# (item_id, timestamp) pairs in chronological order. The three splits go through
# the same pipeline, so it is written once and unpacked into the three frames.
train_grouped, valid_grouped, test_grouped = (
    # Only the two columns the lambda reads are selected, so apply() never
    # operates on the grouping column itself.
    split_df.groupby('user_id')[['item_id', 'timestamp']]
    # sorted() already returns a list of tuples and is stable, so interactions
    # with equal timestamps keep their original row order.
    .apply(lambda user_rows: sorted(zip(user_rows.item_id, user_rows.timestamp),
                                    key=lambda pair: pair[1]))
    .reset_index()
    # reset_index() labels the unnamed list column 0; give it the split's name.
    .rename(columns={0: f'{split_name}_interactions'})
    for split_name, split_df in (
        ('train', train_df),
        ('valid', valid_df),
        ('test', test_df),
    )
)

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(2294, 2009-12-14 02:51:48), (2455, 2009-12-1..."
1,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ..."
2,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28..."
3,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0..."
4,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1..."


In [17]:
# Combine the three per-user frames into one row per user.
# The join key and join type are spelled out so that an accidentally shared
# column can never silently become part of the key. The inner join keeps only
# users that appear in all three splits.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner')
    .merge(test_grouped, on='user_id', how='inner')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,1,"[(2294, 2009-12-14 02:51:48), (2455, 2009-12-1...","[(2968, 2009-12-14 02:53:20)]","[(1405, 2009-12-14 02:53:23), (1172, 2009-12-1..."
1,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ...","[(485, 1996-06-21 11:18:38), (370, 1996-06-21 ...","[(314, 1996-06-21 11:20:44), (372, 1996-06-21 ..."
2,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28...","[(1721, 2011-02-28 20:00:36), (377, 2011-02-28...","[(2716, 2011-02-28 20:13:37), (44191, 2011-02-..."
3,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0...","[(3108, 2000-02-07 10:35:38), (3255, 2000-02-0...","[(2085, 2000-02-07 18:50:44), (3034, 2000-02-0..."
4,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1...","[(5995, 2006-11-12 23:35:33), (1544, 2006-11-1...","[(8376, 2006-11-12 23:39:02), (2770, 2006-11-1..."


In [18]:
# Fit the popularity baseline on the joined per-user frame, then store its
# top-10 and top-5 recommendation lists as new columns of `joined`.
# NOTE(review): `fit` receives the frame that also holds the valid/test columns;
# presumably only train_interactions is used — confirm in ModifiedTopPopular.
toppop = ModifiedTopPopular()
toppop.fit(joined)
joined['toppopular_recs10'] = toppop.predict(joined, topn=10)
joined['toppopular_recs5'] = toppop.predict(joined, topn=5)
joined.head()  

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,1,"[(2294, 2009-12-14 02:51:48), (2455, 2009-12-1...","[(2968, 2009-12-14 02:53:20)]","[(1405, 2009-12-14 02:53:23), (1172, 2009-12-1...","[356, 296, 318, 593, 260, 480, 2571, 1, 527, 110]","[356, 296, 318, 593, 260]"
1,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ...","[(485, 1996-06-21 11:18:38), (370, 1996-06-21 ...","[(314, 1996-06-21 11:20:44), (372, 1996-06-21 ...","[318, 260, 2571, 1, 1196, 1210, 1270, 2858, 11...","[318, 260, 2571, 1, 1196]"
2,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28...","[(1721, 2011-02-28 20:00:36), (377, 2011-02-28...","[(2716, 2011-02-28 20:13:37), (44191, 2011-02-...","[260, 480, 2571, 1, 589, 1196, 1270, 457, 1198...","[260, 480, 2571, 1, 589]"
3,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0...","[(3108, 2000-02-07 10:35:38), (3255, 2000-02-0...","[(2085, 2000-02-07 18:50:44), (3034, 2000-02-0...","[318, 593, 2571, 1, 527, 110, 457, 2858, 608, ...","[318, 593, 2571, 1, 527]"
4,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1...","[(5995, 2006-11-12 23:35:33), (1544, 2006-11-1...","[(8376, 2006-11-12 23:39:02), (2770, 2006-11-1...","[296, 318, 593, 260, 480, 2571, 1, 527, 110, 589]","[296, 318, 593, 260, 480]"


In [19]:
# Score the top-10 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.1120063889083689, 'recall': 0.04022268848009084}

In [20]:
# Score the top-5 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.0718264266490125, 'recall': 0.02520608171711447}

### MovieLens-1m

In [28]:
# Load the MovieLens-1m ratings. The file has no header row, so the column names
# are supplied here; the multi-character '::' separator is why the python
# parsing engine is requested.
# The columns get their final names directly (item_id rather than movie_id), so
# no follow-up rename is needed.
rnames = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_table('data/ratings.dat', sep='::', header=None, names=rnames, engine='python')
# Timestamps are stored as epoch seconds.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [29]:
# Partition the ratings frame into train / validation / test frames.
# NOTE(review): RandomSplit is a project-local class; whether the split is seeded
# (i.e. reproducible across runs) cannot be told from this notebook — confirm.
splitter = RandomSplit(test_fraction=0.1)
train_df, valid_df, test_df = splitter(df)

In [30]:
# Collapse each split to one row per user holding that user's
# (item_id, timestamp) pairs in chronological order. The three splits go through
# the same pipeline, so it is written once and unpacked into the three frames.
train_grouped, valid_grouped, test_grouped = (
    # Only the two columns the lambda reads are selected, so apply() never
    # operates on the grouping column itself.
    split_df.groupby('user_id')[['item_id', 'timestamp']]
    # sorted() already returns a list of tuples and is stable, so interactions
    # with equal timestamps keep their original row order.
    .apply(lambda user_rows: sorted(zip(user_rows.item_id, user_rows.timestamp),
                                    key=lambda pair: pair[1]))
    .reset_index()
    # reset_index() labels the unnamed list column 0; give it the split's name.
    .rename(columns={0: f'{split_name}_interactions'})
    for split_name, split_df in (
        ('train', train_df),
        ('valid', valid_df),
        ('test', test_df),
    )
)

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3..."
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3..."
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31..."
3,4,"[(1210, 2000-12-31 20:18:44), (1097, 2000-12-3..."
4,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31..."


In [31]:
# Combine the three per-user frames into one row per user.
# The join key and join type are spelled out so that an accidentally shared
# column can never silently become part of the key. The inner join keeps only
# users that appear in all three splits.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner')
    .merge(test_grouped, on='user_id', how='inner')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3...","[(2687, 2001-01-06 23:37:48), (745, 2001-01-06...","[(2294, 2001-01-06 23:38:11), (783, 2001-01-06..."
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3...","[(1552, 2000-12-31 21:59:01), (2490, 2000-12-3...","[(2628, 2000-12-31 22:00:51), (1690, 2000-12-3..."
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31...","[(1270, 2000-12-31 21:30:31), (1079, 2000-12-3...","[(2355, 2000-12-31 21:33:50), (3552, 2000-12-3..."
3,4,"[(1210, 2000-12-31 20:18:44), (1097, 2000-12-3...","[(1240, 2000-12-31 20:24:20)]","[(1036, 2000-12-31 20:24:42), (1954, 2000-12-3..."
4,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31...","[(412, 2000-12-31 06:58:11), (6, 2000-12-31 06...","[(3079, 2000-12-31 07:02:42), (1921, 2000-12-3..."


In [32]:
# Fit the popularity baseline on the joined per-user frame, then store its
# top-10 and top-5 recommendation lists as new columns of `joined`.
# NOTE(review): `fit` receives the frame that also holds the valid/test columns;
# presumably only train_interactions is used — confirm in ModifiedTopPopular.
toppop = ModifiedTopPopular()
toppop.fit(joined)
joined['toppopular_recs10'] = toppop.predict(joined, topn=10)
joined['toppopular_recs5'] = toppop.predict(joined, topn=5)
joined.head()  

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3...","[(2687, 2001-01-06 23:37:48), (745, 2001-01-06...","[(2294, 2001-01-06 23:38:11), (783, 2001-01-06...","[2858, 1196, 1210, 589, 480, 2571, 593, 1198, ...","[2858, 1196, 1210, 589, 480]"
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3...","[(1552, 2000-12-31 21:59:01), (2490, 2000-12-3...","[(2628, 2000-12-31 22:00:51), (1690, 2000-12-3...","[260, 1270, 1580, 608, 1197, 527, 1617, 858, 2...","[260, 1270, 1580, 608, 1197]"
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31...","[(1270, 2000-12-31 21:30:31), (1079, 2000-12-3...","[(2355, 2000-12-31 21:33:50), (3552, 2000-12-3...","[2028, 589, 2571, 608, 110, 2396, 527, 1617, 8...","[2028, 589, 2571, 608, 110]"
3,4,"[(1210, 2000-12-31 20:18:44), (1097, 2000-12-3...","[(1240, 2000-12-31 20:24:20)]","[(1036, 2000-12-31 20:24:42), (1954, 2000-12-3...","[2858, 589, 2571, 1270, 593, 1580, 608, 110, 1...","[2858, 589, 2571, 1270, 593]"
4,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31...","[(412, 2000-12-31 06:58:11), (6, 2000-12-31 06...","[(3079, 2000-12-31 07:02:42), (1921, 2000-12-3...","[1196, 260, 1210, 589, 480, 1270, 1198, 110, 1...","[1196, 260, 1210, 589, 480]"


In [33]:
# Score the top-10 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.13380752232950174, 'recall': 0.038222550050619634}

In [34]:
# Score the top-5 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.08580552013133338, 'recall': 0.022092647185883257}

### MovieLens-20m

In [21]:
# Load the MovieLens-20m ratings, rename the id columns to the snake_case names
# used by the rest of the notebook, and parse the epoch-second timestamps.
df = (
    pd.read_csv('data/ML_20m.csv')
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
    .assign(timestamp=lambda ratings: pd.to_datetime(ratings['timestamp'], unit='s'))
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [22]:
# Partition the ratings frame into train / validation / test frames.
# NOTE(review): RandomSplit is a project-local class; whether the split is seeded
# (i.e. reproducible across runs) cannot be told from this notebook — confirm.
splitter = RandomSplit(test_fraction=0.1)
train_df, valid_df, test_df = splitter(df)

In [23]:
# Collapse each split to one row per user holding that user's
# (item_id, timestamp) pairs in chronological order. The three splits go through
# the same pipeline, so it is written once and unpacked into the three frames.
train_grouped, valid_grouped, test_grouped = (
    # Only the two columns the lambda reads are selected, so apply() never
    # operates on the grouping column itself.
    split_df.groupby('user_id')[['item_id', 'timestamp']]
    # sorted() already returns a list of tuples and is stable, so interactions
    # with equal timestamps keep their original row order.
    .apply(lambda user_rows: sorted(zip(user_rows.item_id, user_rows.timestamp),
                                    key=lambda pair: pair[1]))
    .reset_index()
    # reset_index() labels the unnamed list column 0; give it the split's name.
    .rename(columns={0: f'{split_name}_interactions'})
    for split_name, split_df in (
        ('train', train_df),
        ('valid', valid_df),
        ('test', test_df),
    )
)

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ..."
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1..."
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11..."
3,4,"[(380, 1996-08-24 09:27:05), (165, 1996-08-24 ..."
4,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15..."


In [24]:
# Combine the three per-user frames into one row per user.
# The join key and join type are spelled out so that an accidentally shared
# column can never silently become part of the key. The inner join keeps only
# users that appear in all three splits.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner')
    .merge(test_grouped, on='user_id', how='inner')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ...","[(7046, 2005-04-02 23:52:14), (2143, 2005-04-0...","[(4980, 2005-04-02 23:54:15), (6834, 2005-04-0..."
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1...","[(1969, 2000-11-21 15:36:09), (1970, 2000-11-2...","[(924, 2000-11-21 15:36:54), (1196, 2000-11-21..."
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11...","[(2046, 1999-12-14 12:51:10), (2428, 1999-12-1...","[(1544, 1999-12-14 12:53:25), (2615, 1999-12-1..."
3,4,"[(380, 1996-08-24 09:27:05), (165, 1996-08-24 ...","[(596, 1996-08-24 09:37:04), (531, 1996-08-24 ...","[(489, 1996-08-24 09:39:35), (548, 1996-08-24 ..."
4,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15...","[(1079, 1996-12-26 16:25:53), (60, 1996-12-26 ...","[(1080, 1996-12-26 16:27:54), (1196, 1996-12-2..."


In [25]:
# Fit the popularity baseline on the joined per-user frame, then store its
# top-10 and top-5 recommendation lists as new columns of `joined`.
# NOTE(review): `fit` receives the frame that also holds the valid/test columns;
# presumably only train_interactions is used — confirm in ModifiedTopPopular.
toppop = ModifiedTopPopular()
toppop.fit(joined)
joined['toppopular_recs10'] = toppop.predict(joined, topn=10)
joined['toppopular_recs5'] = toppop.predict(joined, topn=5)
joined.head()  

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ...","[(7046, 2005-04-02 23:52:14), (2143, 2005-04-0...","[(4980, 2005-04-02 23:54:15), (6834, 2005-04-0...","[356, 480, 110, 2571, 457, 1, 527, 150, 592, 1...","[356, 480, 110, 2571, 457]"
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1...","[(1969, 2000-11-21 15:36:09), (1970, 2000-11-2...","[(924, 2000-11-21 15:36:54), (1196, 2000-11-21...","[296, 356, 318, 593, 2571, 457, 1, 527, 150, 592]","[296, 356, 318, 593, 2571]"
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11...","[(2046, 1999-12-14 12:51:10), (2428, 1999-12-1...","[(1544, 1999-12-14 12:53:25), (2615, 1999-12-1...","[296, 356, 110, 527, 150, 592, 590, 2858, 380,...","[296, 356, 110, 527, 150]"
3,4,"[(380, 1996-08-24 09:27:05), (165, 1996-08-24 ...","[(596, 1996-08-24 09:37:04), (531, 1996-08-24 ...","[(489, 1996-08-24 09:39:35), (548, 1996-08-24 ...","[296, 318, 593, 260, 110, 2571, 457, 1, 527, 150]","[296, 318, 593, 260, 110]"
4,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15...","[(1079, 1996-12-26 16:25:53), (60, 1996-12-26 ...","[(1080, 1996-12-26 16:27:54), (1196, 1996-12-2...","[296, 356, 2571, 1, 527, 592, 1210, 50, 1196, 32]","[296, 356, 2571, 1, 527]"


In [26]:
# Score the top-10 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.12622532554969262, 'recall': 0.046513203197171754}

In [27]:
# Score the top-5 popularity recommendations; the recorded output reports 'ndcg' and 'recall'.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.0820827985396127, 'recall': 0.027898303348681208}