In [1]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score,precision_recall_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [2]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [3]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [4]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [5]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [6]:
import os
print(os.getpid())

50310


In [7]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.741980791091919 seconds


In [8]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[0.01384568 0.00371565 0.00068255 0.00893842 0.02497375 0.00308549
 0.0021226  0.0158528  0.00527613 0.00127559 0.00143893 0.00065225
 0.0013179  0.00480994 0.01036695 0.01078346 0.00941135 0.00453576
 0.00204751 0.00118723 0.00304763 0.00661666 0.00454634 0.00214611
 0.00263105 0.00506843 0.00402386 0.00662681 0.01239005 0.00704696
 0.00152101 0.00658425 0.01841745 0.00151906 0.08423974 0.00244654
 0.0193082  0.03272271 0.01159391 0.01342264 0.00179444 0.00141136
 0.00219772 0.0021053  0.00198837 0.03366072 0.00215349 0.00295902
 0.00317736 0.00249096 0.00343177 0.00183049 0.01048372 0.04470015
 0.00198894 0.01971769 0.00362935 0.01467669 0.00412951 0.00100863
 0.01078182 0.00377763 0.00884867 0.00775572 0.00962144 0.0024
 0.0051236  0.00312802 0.0036493  0.00154631 0.00261772 0.004189
 0.00235175 0.00240003 0.0076801  0.00457066 0.00585499 0.01441395
 0.00933867 0.00271137 0.0285768  0.00152814 0.01305126 0.00411758
 0.01291079 0.00300113 0.00434051 0.00556714 0.00058332 0.00088461
 

In [9]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0830034017562866
Inital memory usage was 0.004638 MB; Peak was 0.4079 MB; and difference is 0.403262 MB
The total time is 0.006904125213623047 seconds


In [22]:
# scikit-learn bootstrap
from sklearn.utils import resample
from multiprocessing import Process, Lock, cpu_count, active_children, Value
import resource
# data sample

resample_train_cbn = []
resample_test_cbn = []
# prepare bootstrap sample
for i in range(10,0,-1):
    from sklearn.utils import resample
    boot_uid = resample(uids, n_samples=int(uids.size * i / 10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i / 10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    resample_train_cbn.append(train)
    resample_test_cbn.append(test)
    print('---------------------')
    print(len(resample_train_cbn))
    print(len(resample_test_cbn))
    print('---------------------')

def train_method(num_1,num_2):
    
    model = ExplicitFactorizationModel(n_iter=1)
    intial_time =  resource.getrusage(resource.RUSAGE_SELF); 
    model.fit(resample_train_cbn[num_1])
    final_time = resource.getrusage(resource.RUSAGE_SELF); 
    overall_time = final_time.ru_stime - intial_time.ru_stime
    print(overall_time)
    print(f"Root Mean Squared Error is {rmse_score(model, resample_test_cbn[num_2])}")
    print(f"Root Mean Squared Error is {precision_recall_score(model, resample_test_cbn[num_2])}")
    #print(f"Root Mean Squared Error is {rmse_score(model, test)}")

train_method(0,0)
'''
if __name__ == '__main__':
    
    p10 = Process(target=train_method, args=(0,))
    p9 = Process(target=train_method, args=(1,))
    p8 = Process(target=train_method, args=(2,))
    p7 = Process(target=train_method, args=(3,))
    p6 = Process(target=train_method, args=(4,))
    p5 = Process(target=train_method, args=(5,))
    p4 = Process(target=train_method, args=(6,))
    p3 = Process(target=train_method, args=(7,))
    p2 = Process(target=train_method, args=(8,))
    p1 = Process(target=train_method, args=(9,))
    
    p10.start()
    p10.join()
    p9.start()
    p9.join()
    p8.start()
    p8.join()
    p7.start()
    p7.join()
    p6.start()
    p6.join()
    p5.start()
    p5.join()
    p4.start()
    p4.join()
    p3.start()
    p3.join()
    p2.start()
    p2.join()
    p1.start()
    p1.join()
#     print("The number of CPU is:" + str(cpu_count()))
#     for p in active_children():
#         print("child   p.name:" + p.name + "\tp.id" + str(p.pid))
#     print ("END!!!!!!!!!!!!!!!!!")
    
# #     for num in range(10,0,-1):
# #         Process(target=m, args=(lock, resample(i))).start()        
'''

100836
---------------------
1
1
---------------------
90752
---------------------
2
2
---------------------
80668
---------------------
3
3
---------------------
70585
---------------------
4
4
---------------------
60501
---------------------
5
5
---------------------
50418
---------------------
6
6
---------------------
40334
---------------------
7
7
---------------------
30250
---------------------
8
8
---------------------
20167
---------------------
9
9
---------------------
10083
---------------------
10
10
---------------------
0.060502000000000056
Root Mean Squared Error is 1.0141733884811401


NameError: name 'precision_recall_score' is not defined

In [None]:

def f(l, i):
    l.acquire()
    try:
        print('hello world', i)
    finally:
        l.release()

if __name__ == '__main__':
    lock = Lock()

    for num in range(10):
        Process(target=f, args=(lock, num)).start()

In [31]:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()

main line
module name: __main__
parent process: 13043
process id: 32672
function f
module name: __main__
parent process: 32672
process id: 42836
hello bob
