In [79]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [95]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}

ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [96]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [97]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [98]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [99]:
import tracemalloc
import time


tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



Inital memory usage was 0.044255 MB; Peak was 3.570009 MB; and difference is 3.525754 MB
The total time is 10.822343000000018


In [100]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time}")


[1.39847770e-03 1.97227027e-03 3.31309290e-03 7.71959630e-03
 6.36814589e-03 4.15289503e-03 2.56062692e-03 3.48999785e-03
 9.65368144e-04 1.16500249e-03 2.06166373e-03 9.68103135e-04
 2.67923785e-03 7.31587013e-03 8.71597786e-03 1.60237641e-02
 1.45353036e-02 3.11021066e-03 1.74487285e-03 1.10514214e-03
 1.60368213e-03 2.52437479e-03 4.66612499e-03 3.26055370e-03
 3.65802676e-03 5.01450246e-03 1.39686314e-02 4.21781140e-03
 7.28035490e-03 2.35761576e-03 3.99208968e-03 5.96574131e-03
 2.95717722e-03 6.66572977e-04 1.45904599e-02 1.88708748e-03
 4.86633995e-03 1.25183152e-02 3.05479483e-03 2.16798266e-03
 4.26792843e-03 1.37638818e-03 1.75032736e-03 5.28552559e-03
 1.91762074e-03 1.50769315e-02 2.45088847e-03 2.41801844e-03
 6.55953928e-03 3.61574172e-03 3.25936348e-03 2.42076970e-03
 2.13135972e-03 2.31137342e-03 2.45525630e-03 4.26193366e-03
 1.14514996e-02 9.01627699e-03 4.31398184e-03 1.36576124e-03
 5.15487542e-03 6.63457456e-03 4.03803612e-03 2.77308542e-03
 3.51789014e-03 2.700560

In [101]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time}")

Root Mean Squared Error is 1.0857150554656982
Inital memory usage was 0.100337 MB; Peak was 0.882416 MB; and difference is 0.782079 MB
The total time is 0.010995999999977357


In [102]:
# scikit-learn bootstrap
from sklearn.utils import resample
# data sample
data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# prepare bootstrap sample
d_y = datase
boot = resample(dataset, replace=True, n_samples=90000, random_state=1)
print('Bootstrap Sample: %s' % boot)
# out of bag observations
oob = [x for x in data if x not in boot]
print('OOB Sample: %s' % oob)




NameError: name 'datase' is not defined