In [3]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [4]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}

ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [5]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [6]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [7]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [8]:
import os
print(os.getpid())

13075


In [9]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.66420316696167 seconds


In [10]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[1.61083490e-03 4.41273806e-03 1.93473901e-03 5.71102026e-03
 6.70354271e-02 6.17115497e-03 4.39624832e-03 3.77967655e-02
 2.10935461e-03 3.53699619e-03 4.08826274e-03 2.13824585e-03
 5.19917063e-04 5.46675179e-03 2.58002484e-03 7.42047208e-03
 5.03623669e-03 2.61629555e-03 2.98533078e-03 3.87751625e-03
 2.97212073e-03 4.11745226e-03 3.67907137e-03 2.24710824e-03
 2.44553201e-03 1.60816874e-02 2.79894318e-03 2.29003725e-03
 1.08111748e-02 9.33023859e-03 3.26063026e-03 7.07072166e-03
 5.71375242e-03 1.37206360e-03 7.97209403e-03 1.03623419e-03
 1.04501132e-01 1.90443837e-02 1.12921747e-02 2.82190353e-02
 5.25222835e-03 3.14677611e-03 6.42234492e-03 7.01531514e-04
 1.99874802e-03 3.85453201e-02 1.67070244e-03 2.11023594e-03
 1.43643963e-03 3.11029301e-03 1.36631824e-03 2.79922977e-03
 6.00970450e-04 1.66069730e-02 1.80215072e-03 4.48887739e-02
 5.93202142e-03 9.67444428e-03 1.57407655e-03 7.86687471e-03
 2.18586382e-03 1.81822041e-03 5.02204178e-03 1.89512044e-03
 2.82136760e-03 1.992789

In [11]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0714614391326904
Inital memory usage was 0.003938 MB; Peak was 0.407204 MB; and difference is 0.403266 MB
The total time is 0.006630897521972656 seconds


In [22]:
# scikit-learn bootstrap
from sklearn.utils import resample
# data sample

# prepare bootstrap sample
for i in range(9,0,-1):
    boot_uid = resample(uids, n_samples=int(uids.size * i /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i /10), random_state=1)
    print(boot_uid.size)
    
    boot_ratings = resample(ratings, n_samples=int(ratings.size * i /10), random_state=1)
    boot_timestamps = resample(timestamps,n_samples=int(timestamps.size * i /10), random_state=1)
    print(boot_ratings.size)
    dataset_boot = Interactions(user_ids=boot_uid,item_ids=boot_iid,ratings=boot_ratings,timestamps=boot_timestamps)

    #lets initialise the seed, so that its repeatable and reproducible 
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    #tracemalloc.start()
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    #current, peak = tracemalloc.get_traced_memory()

    #print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
    #print(f"The total time is {time.time() - intial_time} seconds")

    #tracemalloc.stop()


    
    






90752
90752
Root Mean Squared Error is 1.0128930807113647
80668
80668
Root Mean Squared Error is 1.0671584606170654
70585
70585
Root Mean Squared Error is 1.112313151359558
60501
60501
Root Mean Squared Error is 1.2214707136154175
50418
50418
Root Mean Squared Error is 1.3641984462738037
40334
40334
Root Mean Squared Error is 1.6188830137252808
30250
30250
Root Mean Squared Error is 2.292501449584961
20167
20167
Root Mean Squared Error is 2.9873688220977783
10083
10083
Root Mean Squared Error is 3.379795551300049
