In [2]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [3]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}

ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [4]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [5]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [6]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [11]:
import tracemalloc
import time
import os

#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
print(os.getpid())
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



9227
The total time is 1.6393001079559326 seconds


In [12]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[0.00247671 0.00242271 0.00587621 0.00228246 0.00164699 0.01115478
 0.00174852 0.00121318 0.00119517 0.00216478 0.00129948 0.0006183
 0.00391502 0.0263584  0.00329953 0.00682065 0.00693016 0.00321453
 0.00215898 0.00169433 0.00183556 0.00129561 0.00485761 0.00617322
 0.00140934 0.01665389 0.00179248 0.00300038 0.00440654 0.00533252
 0.00393849 0.01176139 0.00080705 0.00120293 0.00126557 0.01214294
 0.01022836 0.00688377 0.00331988 0.00219459 0.02176536 0.0019942
 0.0015548  0.00074852 0.00159995 0.00808627 0.00353831 0.00072761
 0.00453489 0.00247076 0.00275994 0.0017203  0.00409867 0.01267394
 0.00182878 0.0079901  0.00228664 0.00108938 0.00262628 0.00388569
 0.00894646 0.00226638 0.00262709 0.00249913 0.00960774 0.00199753
 0.00201868 0.00186864 0.00124158 0.03238183 0.00145747 0.00343245
 0.0031011  0.0049707  0.0008534  0.0467567  0.00465864 0.03240914
 0.01101847 0.00171276 0.00422233 0.00124108 0.00610529 0.0031684
 0.00251295 0.00150024 0.00813595 0.01710885 0.00211541 0.0560293

In [106]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.08253812789917
Inital memory usage was 0.076533 MB; Peak was 0.859552 MB; and difference is 0.783019 MB
The total time is 0.008342981338500977 seconds


In [147]:
# scikit-learn bootstrap
from sklearn.utils import resample
# data sample


boot_uid = resample(uids, replace=True, n_samples=int(uids.size * i /10), random_state=1)
print(boot_uid.size)


# prepare bootstrap sample
for i in range(9,0,-1):
    boot_uid = resample(uids, replace=True, n_samples=int(uids.size * i /10), random_state=1)
    boot_iid = resample(iids, replace=True, n_samples=int(uids.size * i /10), random_state=1)
    boot_ratings = resample(ratings, replace=True, n_samples=int(uids.size * i /10), random_state=1)
    boot_timestamps = resample(timestamps, replace=True, n_samples=int(uids.size * i /10), random_state=1)
    dataset_boot = Interactions(user_ids=boot_uid,item_ids=boot_iid,ratings=boot_ratings,timestamps=boot_timestamps)

    #lets initialise the seed, so that its repeatable and reproducible 
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    #tracemalloc.start()
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    #current, peak = tracemalloc.get_traced_memory()

    #print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
    #print(f"The total time is {time.time() - intial_time} seconds")

    #tracemalloc.stop()


    
    






90752
Root Mean Squared Error is 1.011805772781372
Root Mean Squared Error is 1.0614092350006104
Root Mean Squared Error is 1.110275149345398
Root Mean Squared Error is 1.218490481376648
Root Mean Squared Error is 1.3490108251571655
Root Mean Squared Error is 1.6921112537384033
Root Mean Squared Error is 2.150165319442749
Root Mean Squared Error is 2.9755358695983887
Root Mean Squared Error is 3.3968894481658936
