In [1]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [2]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [3]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [4]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [5]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [6]:
import os
print(os.getpid())

32672


In [7]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.6274809837341309 seconds


In [8]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[0.00149469 0.00293596 0.00552832 0.02825848 0.01294014 0.00291
 0.00177999 0.00674725 0.00042859 0.00194213 0.00465982 0.0005546
 0.00365638 0.01476467 0.0040806  0.00607912 0.01065221 0.00549679
 0.00233565 0.00079771 0.01286626 0.00474011 0.00285476 0.00337315
 0.01356511 0.00988751 0.00798483 0.00342383 0.02136483 0.01249201
 0.00250898 0.06027369 0.03242608 0.00232583 0.00489063 0.0010737
 0.0184407  0.00867079 0.00551542 0.00222378 0.00349359 0.00195821
 0.00261093 0.00119576 0.0017784  0.01724447 0.00233991 0.00165049
 0.00940531 0.00138367 0.00788025 0.00398061 0.00033625 0.0373317
 0.00549767 0.0068645  0.00676805 0.04554592 0.00279413 0.00836734
 0.00501916 0.00323296 0.00747208 0.00202073 0.00148829 0.00226674
 0.00728674 0.00327035 0.00821485 0.02667511 0.00451104 0.01043765
 0.00165912 0.00476526 0.00460975 0.01896416 0.02064956 0.02380907
 0.00198673 0.00244884 0.04927164 0.00378383 0.00856758 0.00491849
 0.00082303 0.00647357 0.00091164 0.00338996 0.00247942 0.000556
 0.

In [9]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0775507688522339
Inital memory usage was 0.004554 MB; Peak was 0.40782 MB; and difference is 0.403266 MB
The total time is 0.005848884582519531 seconds


In [18]:
# scikit-learn bootstrap
from sklearn.utils import resample
# data sample

# prepare bootstrap sample
for i in range(10,0,-1):
    boot_uid = resample(uids, n_samples=int(uids.size * i /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    #boot_ratings = resample(ratings, n_samples=int(ratings.size * i /10), random_state=1)
    #boot_timestamps = resample(timestamps,n_samples=int(timestamps.size * i /10), random_state=1)
    #print(boot_ratings.size)
    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)

    #lets initialise the seed, so that its repeatable and reproducible 
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    #tracemalloc.start()
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    #current, peak = tracemalloc.get_traced_memory()

    #tracemalloc.stop()
'''                           
    for rt in ratings_df['rating']:
        for rt_u in ratings_df.loc[ratings_df['rating'] == rt]['userId']:           
            if uid_map[rt_u] in boot_uid:
                abc = ratings_df[(ratings_df['rating'] == rt) & (ratings_df['userId'] == rt_u)]
                for a in abc['movieId']:
                    if iid_map[a] in boot_iid:
                        int_iid.append(uid_map[rt_u])
                        int_uid.append(iid_map[a])
                        int_ratings.append(rt)
                        int_timestamps.append(ratings_df[(ratings_df['movieId'] == a) & (ratings_df['userId'] == rt_u)]['timestamp'].values)
    
'''  

100836
70786
70786
70786
70786
Root Mean Squared Error is 1.015019178390503
90752
70075
70075
70075
70075
Root Mean Squared Error is 1.0068771839141846
80668
69089
69089
69089
69089
Root Mean Squared Error is 1.006486177444458
70585
67825
67825
67825
67825
Root Mean Squared Error is 1.0052525997161865
60501
66297
66297
66297
66297
Root Mean Squared Error is 1.0281777381896973
50418
64091
64091
64091
64091
Root Mean Squared Error is 1.0139468908309937
40334
61139
61139
61139
61139
Root Mean Squared Error is 1.007400393486023
30250
57361
57361
57361
57361
Root Mean Squared Error is 1.0117002725601196
20167
51564
51564
51564
51564
Root Mean Squared Error is 1.0206834077835083
10083
41689
41689
41689
41689
Root Mean Squared Error is 1.010080099105835
