In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from typing import List
from tqdm import tqdm

from RandomSplit import RandomSplit
from TopPopular import ModifiedTopPopular
from metrics import ndcg_metric, dcg, recall_metric, evaluate_recommender, get_metrics

%matplotlib inline

### MovieLens 100K

In [3]:
# Load the ratings file and normalise it in a single chain: rename the id
# columns to the user_id / item_id names used by the cells below, and turn the
# epoch-second timestamps into datetimes.
df = (
    pd.read_csv('data/ML_100.csv')
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
    .assign(timestamp=lambda ratings: pd.to_datetime(ratings['timestamp'], unit='s'))
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,2009-12-14 02:52:24
1,1,1029,3.0,2009-12-14 02:52:59
2,1,1061,3.0,2009-12-14 02:53:02
3,1,1129,2.0,2009-12-14 02:53:05
4,1,1172,4.0,2009-12-14 02:53:25


In [28]:
# Split the interaction log into train / validation / test frames.
# NOTE(review): RandomSplit is project-local and is built here without any
# visible seed or arguments — confirm the split is reproducible on a fresh kernel.
splitter = RandomSplit()
train_df, valid_df, test_df = splitter(df)

In [29]:
def collect_user_interactions(split_df, column_name):
    """Collapse one split into a single row per user.

    Parameters
    ----------
    split_df : pd.DataFrame
        Interaction log with ``user_id``, ``item_id`` and ``timestamp`` columns.
    column_name : str
        Name given to the column that holds each user's interaction list.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id`` and ``column_name``; every cell of the latter is a
        list of ``(item_id, timestamp)`` tuples ordered by timestamp.
    """
    per_user = (
        # Sort once up front. mergesort is stable, so rows with equal
        # timestamps keep their original relative order, exactly like a
        # per-user sorted(..., key=timestamp) would.
        split_df.sort_values('timestamp', kind='mergesort')
        # Selecting the two payload columns keeps the grouping key out of the
        # frames handed to apply().
        .groupby('user_id')[['item_id', 'timestamp']]
        .apply(lambda user_rows: list(zip(user_rows['item_id'], user_rows['timestamp'])))
    )
    # Name the Series before reset_index() so the column gets its final name
    # directly instead of the default 0.
    return per_user.rename(column_name).reset_index()


train_grouped = collect_user_interactions(train_df, 'train_interactions')
valid_grouped = collect_user_interactions(valid_df, 'valid_interactions')
test_grouped = collect_user_interactions(test_df, 'test_interactions')

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(2294, 2009-12-14 02:51:48), (2455, 2009-12-1..."
1,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ..."
2,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28..."
3,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0..."
4,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1..."


In [30]:
# Combine the three per-user frames into one row per user.
# The join key is spelled out rather than inferred from shared column names.
# The inner join keeps only users that have a row in all three frames, so
# users missing from the validation or test frame are dropped.
# validate= makes the merge fail loudly if a user_id is ever duplicated.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner', validate='one_to_one')
    .merge(test_grouped, on='user_id', how='inner', validate='one_to_one')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ...","[(314, 1996-06-21 11:20:44), (372, 1996-06-21 ...","[(382, 1996-06-21 11:22:45), (537, 1996-06-21 ..."
1,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28...","[(2716, 2011-02-28 20:13:37), (44191, 2011-02-...","[(1197, 2011-02-28 22:39:30), (736, 2011-02-28..."
2,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0...","[(2085, 2000-02-07 18:50:44), (3034, 2000-02-0...","[(1028, 2000-02-07 18:53:58), (1033, 2000-02-0..."
3,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1...","[(2770, 2006-11-12 23:42:27), (39, 2006-11-12 ...","[(5266, 2006-11-12 23:43:15), (4018, 2006-11-1..."
4,6,"[(158, 2005-02-11 15:04:23), (1204, 2005-02-11...","[(3300, 2005-02-24 15:17:30), (2723, 2005-02-2...","[(1687, 2005-02-24 15:18:01), (2072, 2005-02-2..."


In [31]:
# Fit the popularity baseline, then store its top-10 and top-5 recommendation
# lists as new columns on `joined`.
# NOTE(review): fit() receives the frame that also carries the validation and
# test interactions — confirm ModifiedTopPopular only counts train_interactions.
toppop = ModifiedTopPopular()
toppop.fit(joined)
for topn in (10, 5):
    joined[f'toppopular_recs{topn}'] = toppop.predict(joined, topn=topn)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,2,"[(150, 1996-06-21 11:09:55), (296, 1996-06-21 ...","[(314, 1996-06-21 11:20:44), (372, 1996-06-21 ...","[(382, 1996-06-21 11:22:45), (537, 1996-06-21 ...","[318, 260, 2571, 1, 1196, 1270, 1210, 608, 285...","[318, 260, 2571, 1, 1196]"
1,3,"[(355, 2011-02-28 02:53:09), (1271, 2011-02-28...","[(2716, 2011-02-28 20:13:37), (44191, 2011-02-...","[(1197, 2011-02-28 22:39:30), (736, 2011-02-28...","[260, 480, 2571, 1, 589, 1196, 1270, 608, 1198...","[260, 480, 2571, 1, 589]"
2,4,"[(1210, 2000-02-05 19:25:14), (2734, 2000-02-0...","[(2085, 2000-02-07 18:50:44), (3034, 2000-02-0...","[(1028, 2000-02-07 18:53:58), (1033, 2000-02-0...","[318, 593, 2571, 1, 527, 110, 608, 2858, 457, ...","[318, 593, 2571, 1, 527]"
3,5,"[(1380, 2006-11-12 23:10:44), (1035, 2006-11-1...","[(2770, 2006-11-12 23:42:27), (39, 2006-11-12 ...","[(5266, 2006-11-12 23:43:15), (4018, 2006-11-1...","[296, 318, 593, 260, 480, 2571, 1, 527, 589, 110]","[296, 318, 593, 260, 480]"
4,6,"[(158, 2005-02-11 15:04:23), (1204, 2005-02-11...","[(3300, 2005-02-24 15:17:30), (2723, 2005-02-2...","[(1687, 2005-02-24 15:18:01), (2072, 2005-02-2...","[356, 296, 318, 593, 260, 480, 1, 527, 589, 110]","[356, 296, 318, 593, 260]"


In [32]:
# Score the top-10 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.0723679384539099, 'recall': 0.03419483893839252}

In [33]:
# Score the top-5 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.048419475016673014, 'recall': 0.02173130720558957}

### MovieLens 1M

In [34]:
# The ratings file has no header row and uses '::' between fields, so the
# column names are supplied explicitly and the python parsing engine is used.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_table('data/rating.dat', sep='::', header=None, names=rnames, engine='python')
# Only movie_id needs renaming: user_id already has its final name via rnames,
# and there is no 'userId' column in this file to map.
df = df.rename(columns={'movie_id': 'item_id'})
# Timestamps are stored as epoch seconds.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [35]:
# Split the interaction log into train / validation / test frames.
# NOTE(review): RandomSplit is project-local and is built here without any
# visible seed or arguments — confirm the split is reproducible on a fresh kernel.
splitter = RandomSplit()
train_df, valid_df, test_df = splitter(df)

In [36]:
def collect_user_interactions(split_df, column_name):
    """Collapse one split into a single row per user.

    Parameters
    ----------
    split_df : pd.DataFrame
        Interaction log with ``user_id``, ``item_id`` and ``timestamp`` columns.
    column_name : str
        Name given to the column that holds each user's interaction list.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id`` and ``column_name``; every cell of the latter is a
        list of ``(item_id, timestamp)`` tuples ordered by timestamp.
    """
    per_user = (
        # Sort once up front. mergesort is stable, so rows with equal
        # timestamps keep their original relative order, exactly like a
        # per-user sorted(..., key=timestamp) would.
        split_df.sort_values('timestamp', kind='mergesort')
        # Selecting the two payload columns keeps the grouping key out of the
        # frames handed to apply().
        .groupby('user_id')[['item_id', 'timestamp']]
        .apply(lambda user_rows: list(zip(user_rows['item_id'], user_rows['timestamp'])))
    )
    # Name the Series before reset_index() so the column gets its final name
    # directly instead of the default 0.
    return per_user.rename(column_name).reset_index()


train_grouped = collect_user_interactions(train_df, 'train_interactions')
valid_grouped = collect_user_interactions(valid_df, 'valid_interactions')
test_grouped = collect_user_interactions(test_df, 'test_interactions')

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3..."
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3..."
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31..."
3,4,"[(1210, 2000-12-31 20:18:44), (1097, 2000-12-3..."
4,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31..."


In [37]:
# Combine the three per-user frames into one row per user.
# The join key is spelled out rather than inferred from shared column names.
# The inner join keeps only users that have a row in all three frames, so
# users missing from the validation or test frame are dropped.
# validate= makes the merge fail loudly if a user_id is ever duplicated.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner', validate='one_to_one')
    .merge(test_grouped, on='user_id', how='inner', validate='one_to_one')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3...","[(2294, 2001-01-06 23:38:11), (783, 2001-01-06...","[(1907, 2001-01-06 23:38:50), (48, 2001-01-06 ..."
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3...","[(1597, 2000-12-31 22:00:25), (2628, 2000-12-3...","[(292, 2000-12-31 22:02:03), (95, 2000-12-31 2..."
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31...","[(2355, 2000-12-31 21:33:50), (3552, 2000-12-3...","[(3868, 2000-12-31 21:34:46), (2081, 2000-12-3..."
3,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31...","[(3079, 2000-12-31 07:02:42), (1921, 2000-12-3...","[(1527, 2000-12-31 07:07:59), (551, 2000-12-31..."
4,6,"[(3072, 2000-12-31 04:14:35), (2006, 2000-12-3...","[(3408, 2000-12-31 04:50:30), (3624, 2000-12-3...","[(920, 2000-12-31 05:00:51), (1569, 2000-12-31..."


In [38]:
# Fit the popularity baseline, then store its top-10 and top-5 recommendation
# lists as new columns on `joined`.
# NOTE(review): fit() receives the frame that also carries the validation and
# test interactions — confirm ModifiedTopPopular only counts train_interactions.
toppop = ModifiedTopPopular()
toppop.fit(joined)
for topn in (10, 5):
    joined[f'toppopular_recs{topn}'] = toppop.predict(joined, topn=topn)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,1,"[(3186, 2000-12-31 22:00:19), (1270, 2000-12-3...","[(2294, 2001-01-06 23:38:11), (783, 2001-01-06...","[(1907, 2001-01-06 23:38:50), (48, 2001-01-06 ...","[2858, 1196, 1210, 589, 480, 2571, 593, 1198, ...","[2858, 1196, 1210, 589, 480]"
1,2,"[(1198, 2000-12-31 21:28:44), (1210, 2000-12-3...","[(1597, 2000-12-31 22:00:25), (2628, 2000-12-3...","[(292, 2000-12-31 22:02:03), (95, 2000-12-31 2...","[260, 1270, 1580, 608, 2762, 1197, 527, 1617, ...","[260, 1270, 1580, 608, 2762]"
2,3,"[(593, 2000-12-31 21:10:18), (2858, 2000-12-31...","[(2355, 2000-12-31 21:33:50), (3552, 2000-12-3...","[(3868, 2000-12-31 21:34:46), (2081, 2000-12-3...","[589, 2028, 2571, 608, 110, 2396, 2762, 527, 1...","[589, 2028, 2571, 608, 110]"
3,5,"[(2717, 2000-12-31 05:37:52), (908, 2000-12-31...","[(3079, 2000-12-31 07:02:42), (1921, 2000-12-3...","[(1527, 2000-12-31 07:07:59), (551, 2000-12-31...","[1196, 260, 1210, 589, 480, 1270, 1198, 110, 2...","[1196, 260, 1210, 589, 480]"
4,6,"[(3072, 2000-12-31 04:14:35), (2006, 2000-12-3...","[(3408, 2000-12-31 04:50:30), (3624, 2000-12-3...","[(920, 2000-12-31 05:00:51), (1569, 2000-12-31...","[1196, 260, 589, 2028, 480, 2571, 1270, 593, 1...","[1196, 260, 589, 2028, 480]"


In [39]:
# Score the top-10 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.09805339332446063, 'recall': 0.03600610489569679}

In [40]:
# Score the top-5 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.060163442487923185, 'recall': 0.02003130164279043}

### MovieLens 20M

In [41]:
# Load the ratings file and normalise it in a single chain: rename the id
# columns to the user_id / item_id names used by the cells below, and turn the
# epoch-second timestamps into datetimes.
df = (
    pd.read_csv('data/ML_20.csv')
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
    .assign(timestamp=lambda ratings: pd.to_datetime(ratings['timestamp'], unit='s'))
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [48]:
# Split the interaction log into train / validation / test frames.
# NOTE(review): RandomSplit is project-local and is built here without any
# visible seed or arguments — confirm the split is reproducible on a fresh kernel.
splitter = RandomSplit()
train_df, valid_df, test_df = splitter(df)

In [49]:
def collect_user_interactions(split_df, column_name):
    """Collapse one split into a single row per user.

    Parameters
    ----------
    split_df : pd.DataFrame
        Interaction log with ``user_id``, ``item_id`` and ``timestamp`` columns.
    column_name : str
        Name given to the column that holds each user's interaction list.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id`` and ``column_name``; every cell of the latter is a
        list of ``(item_id, timestamp)`` tuples ordered by timestamp.
    """
    per_user = (
        # Sort once up front. mergesort is stable, so rows with equal
        # timestamps keep their original relative order, exactly like a
        # per-user sorted(..., key=timestamp) would.
        split_df.sort_values('timestamp', kind='mergesort')
        # Selecting the two payload columns keeps the grouping key out of the
        # frames handed to apply().
        .groupby('user_id')[['item_id', 'timestamp']]
        .apply(lambda user_rows: list(zip(user_rows['item_id'], user_rows['timestamp'])))
    )
    # Name the Series before reset_index() so the column gets its final name
    # directly instead of the default 0.
    return per_user.rename(column_name).reset_index()


train_grouped = collect_user_interactions(train_df, 'train_interactions')
valid_grouped = collect_user_interactions(valid_df, 'valid_interactions')
test_grouped = collect_user_interactions(test_df, 'test_interactions')

train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ..."
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1..."
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11..."
3,4,"[(380, 1996-08-24 09:27:05), (165, 1996-08-24 ..."
4,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15..."


In [50]:
# Combine the three per-user frames into one row per user.
# The join key is spelled out rather than inferred from shared column names.
# The inner join keeps only users that have a row in all three frames, so
# users missing from the validation or test frame are dropped.
# validate= makes the merge fail loudly if a user_id is ever duplicated.
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id', how='inner', validate='one_to_one')
    .merge(test_grouped, on='user_id', how='inner', validate='one_to_one')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ...","[(5171, 2005-04-02 23:55:04), (7454, 2005-04-0...","[(5999, 2005-04-02 23:55:50), (7449, 2005-04-0..."
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1...","[(1270, 2000-11-21 15:36:54)]","[(3703, 2000-11-21 15:36:54)]"
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11...","[(2668, 1999-12-14 12:54:08), (2986, 1999-12-1...","[(173, 1999-12-14 12:54:59), (2034, 1999-12-14..."
3,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15...","[(1210, 1996-12-26 16:27:54)]","[(1291, 1996-12-26 16:28:48)]"
4,7,"[(1230, 2002-01-16 18:08:06), (908, 2002-01-16...","[(2716, 2002-01-16 19:26:01), (3510, 2002-01-1...","[(2013, 2002-01-16 19:28:04), (2598, 2002-01-1..."


In [51]:
# Fit the popularity baseline, then store its top-10 and top-5 recommendation
# lists as new columns on `joined`.
# NOTE(review): fit() receives the frame that also carries the validation and
# test interactions — confirm ModifiedTopPopular only counts train_interactions.
toppop = ModifiedTopPopular()
toppop.fit(joined)
for topn in (10, 5):
    joined[f'toppopular_recs{topn}'] = toppop.predict(joined, topn=topn)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions,toppopular_recs10,toppopular_recs5
0,1,"[(924, 2004-09-10 03:06:38), (919, 2004-09-10 ...","[(5171, 2005-04-02 23:55:04), (7454, 2005-04-0...","[(5999, 2005-04-02 23:55:50), (7449, 2005-04-0...","[356, 480, 110, 2571, 527, 1, 457, 1210, 2858,...","[356, 480, 110, 2571, 527]"
1,2,"[(62, 2000-11-21 15:29:58), (469, 2000-11-21 1...","[(1270, 2000-11-21 15:36:54)]","[(3703, 2000-11-21 15:36:54)]","[296, 356, 593, 318, 2571, 527, 1, 457, 50, 150]","[296, 356, 593, 318, 2571]"
2,3,"[(589, 1999-12-11 07:25:08), (1188, 1999-12-11...","[(2668, 1999-12-14 12:54:08), (2986, 1999-12-1...","[(173, 1999-12-14 12:54:59), (2034, 1999-12-14...","[296, 356, 110, 527, 2858, 150, 47, 592, 608, ...","[296, 356, 110, 527, 2858]"
3,5,"[(17, 1996-12-25 15:15:35), (62, 1996-12-25 15...","[(1210, 1996-12-26 16:27:54)]","[(1291, 1996-12-26 16:28:48)]","[296, 356, 2571, 527, 1, 50, 2858, 32, 47, 1270]","[296, 356, 2571, 527, 1]"
4,7,"[(1230, 2002-01-16 18:08:06), (908, 2002-01-16...","[(2716, 2002-01-16 19:26:01), (3510, 2002-01-1...","[(2013, 2002-01-16 19:28:04), (2598, 2002-01-1...","[296, 593, 318, 110, 2571, 527, 1, 457, 50, 150]","[296, 593, 318, 110, 2571]"


In [52]:
# Score the top-10 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs10')

{'ndcg': 0.07055004296665573, 'recall': 0.038943090085347916}

In [53]:
# Score the top-5 popularity recommendations; the result is a dict with
# 'ndcg' and 'recall'.
# NOTE(review): the ground-truth column is chosen inside evaluate_recommender —
# confirm it is the held-out test split.
evaluate_recommender(joined, model_preds='toppopular_recs5')

{'ndcg': 0.04270301064220252, 'recall': 0.022635305906719576}