In [1]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [2]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [3]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [4]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [5]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [6]:
import os
print(os.getpid())

32212


In [7]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.6932282447814941 seconds


In [8]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[2.00234926e-03 1.05978307e-02 4.51471994e-04 2.31754161e-03
 1.36842740e-02 3.75362181e-03 2.21120406e-03 3.80223890e-03
 6.84733436e-04 1.47308539e-03 1.58687127e-03 8.58676068e-04
 4.18118676e-03 6.95549517e-03 3.45339013e-03 4.71027333e-03
 5.62293928e-03 7.05923867e-03 1.56520051e-03 8.59024986e-04
 1.78980824e-03 5.95786047e-03 7.64973668e-03 2.10116999e-03
 2.11511956e-03 1.17896715e-03 2.93003789e-03 5.16887324e-03
 5.17057675e-03 1.41886864e-02 8.69838202e-04 4.32888394e-03
 3.80739573e-03 9.17878249e-04 1.45908926e-02 1.09013968e-02
 5.57790692e-02 7.09697082e-02 2.85505953e-03 3.77903520e-03
 3.51218400e-03 1.25895688e-03 1.93230070e-03 5.64327244e-04
 1.27720322e-03 7.93256758e-02 1.73287799e-03 6.89407557e-04
 1.56250000e-02 9.24863282e-04 2.57828845e-03 3.29685927e-03
 8.02557907e-03 4.06125554e-03 2.78054841e-03 5.71253438e-02
 2.35684169e-03 4.70106399e-03 5.43935786e-03 1.15057199e-03
 3.46652192e-03 3.63272276e-03 4.02079718e-03 3.00719423e-03
 4.13598293e-03 4.791354

In [9]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0727179050445557
Inital memory usage was 0.004474 MB; Peak was 0.40774 MB; and difference is 0.403266 MB
The total time is 0.00690007209777832 seconds


In [None]:
# scikit-learn bootstrap
from sklearn.utils import resample
# data sample

# prepare bootstrap sample
for i in range(9,0,-1):
    boot_uid = resample(uids, n_samples=int(uids.size * i /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i /10), random_state=1)
    print(boot_uid.size)
    int_iid = []
    int_uid = []
    int_ratings = []
    int_timestamps = []
    for rt in ratings_df['rating']:
        for rt_u in ratings_df.loc[ratings_df['rating'] == rt]['userId']:           
            if uid_map[rt_u] in boot_uid:
                abc = ratings_df[(ratings_df['rating'] == rt) & (ratings_df['userId'] == rt_u)]
                for a in abc['movieId']:
                    if iid_map[a] in boot_iid:
                        int_iid.append(uid_map[rt_u])
                        int_uid.append(iid_map[a])
                        int_ratings.append(rt)
                        int_timestamps.append(ratings_df[(ratings_df['movieId'] == a) & (ratings_df['userId'] == rt_u)]['timestamp'].values)
    
       


    print(np.array(int_iid).size)
    print(np.array(int_uid).size)
    print(np.array(int_ratings).size)
    print(np.array(int_timestamps).size)


    #boot_ratings = resample(ratings, n_samples=int(ratings.size * i /10), random_state=1)
    #boot_timestamps = resample(timestamps,n_samples=int(timestamps.size * i /10), random_state=1)
    #print(boot_ratings.size)
    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)

    #lets initialise the seed, so that its repeatable and reproducible 
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    #tracemalloc.start()
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    #current, peak = tracemalloc.get_traced_memory()

    #tracemalloc.stop()
    #     for a in np.array(list(set(boot_uid))):
#         all_id = ratings_df.loc[ratings_df['userId'] == uid_rev_map[a]]
#         bid = all_id['movieId']
#         b_iid = [iid_map[iid] for iid in bid]
#         for b in b_iid:
#             boot_iid.append(b)
#         br = all_id['rating']
#         for c in br:
#             boot_ratings.append(c)
#         bts = all_id['timestamp']
#         for d in bts:
#             boot_timestamps.append(d)

    
    






90752
