In [1]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score,precision_recall_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np
import os


In [2]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [3]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [4]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [5]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [6]:
import os
print(os.getpid())

54187


In [7]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.728585958480835 seconds


In [8]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[1.88265619e-03 2.59680544e-03 3.62936963e-03 3.15752752e-03
 7.96320300e-03 2.18093874e-03 1.50394559e-03 1.23412579e-03
 2.28978259e-03 1.57991841e-03 1.30073256e-03 5.99576279e-04
 1.63444048e-03 1.09141497e-02 2.81478823e-03 2.89648191e-03
 4.74275587e-03 1.59781585e-03 4.38325917e-03 1.61170186e-03
 1.81745418e-03 4.00764234e-03 2.31844663e-03 1.18219720e-03
 8.83454658e-03 2.65955903e-03 1.63661180e-02 2.17161111e-03
 4.07463621e-03 5.88832455e-03 1.87121297e-03 5.69184953e-03
 2.91325884e-03 2.31502089e-03 2.46036255e-03 2.75726568e-03
 5.61327019e-03 6.39249120e-03 2.05672317e-03 1.82746636e-03
 1.36599608e-02 1.34281911e-03 3.21467441e-03 1.66659749e-03
 5.21059185e-03 5.23341335e-03 2.90397364e-03 7.70463030e-03
 6.94778413e-03 5.98426670e-03 1.96977883e-03 3.63156703e-03
 2.83890985e-03 9.82676375e-04 1.27140658e-01 4.68734582e-03
 3.09955459e-03 2.47552165e-03 1.57268503e-03 1.68971918e-03
 1.04516287e-02 1.08273626e-02 2.94571868e-03 2.25298474e-03
 2.85802521e-03 2.018334

In [9]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.074101448059082
Inital memory usage was 0.004577 MB; Peak was 0.407844 MB; and difference is 0.403267 MB
The total time is 0.006211042404174805 seconds


In [10]:
# scikit-learn bootstrap
from sklearn.utils import resample
from multiprocessing import Process, Lock, cpu_count, active_children, Value
import resource
# data sample

resample_train_cbn = []
resample_test_cbn = []
# prepare bootstrap sample
for i in range(10,0,-1):
    from sklearn.utils import resample
    boot_uid = resample(uids, n_samples=int(uids.size * i / 10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i / 10), random_state=1)
#     print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    resample_train_cbn.append(train)
    resample_test_cbn.append(test)
#     print('---------------------')
#     print(len(resample_train_cbn))
#     print(len(resample_test_cbn))
#     print('---------------------')

def train_method(num_1,num_2):
    print(f"PID is {os.getpid()}")
    model = ExplicitFactorizationModel(n_iter=1)
    intial_time =  resource.getrusage(resource.RUSAGE_SELF); 
    model.fit(resample_train_cbn[num_1])
    final_time = resource.getrusage(resource.RUSAGE_SELF); 
    overall_time = final_time.ru_stime - intial_time.ru_stime
    print(overall_time)
    print(f"Root Mean Squared Error is {rmse_score(model, resample_test_cbn[num_2])}")
#     print(f"Root Mean Squared Error is {precision_recall_score(model, resample_test_cbn[num_2])}")
    #print(f"Root Mean Squared Error is {rmse_score(model, test)}")


#train_method(0,0)

if __name__ == '__main__':
    
    p10 = Process(target=train_method, args=(0,0,))
#     p9 = Process(target=train_method, args=(1,1,))
#     p8 = Process(target=train_method, args=(2,2,))
#     p7 = Process(target=train_method, args=(3,3,))
#     p6 = Process(target=train_method, args=(4,4,))
#     p5 = Process(target=train_method, args=(5,5,))
#     p4 = Process(target=train_method, args=(6,6,))
#     p3 = Process(target=train_method, args=(7,7,))
#     p2 = Process(target=train_method, args=(8,8,))
#     p1 = Process(target=train_method, args=(9,9,))
    
    p10.start()
#     p10.join()
#     p9.start()
#     p9.join()
#     p8.start()
#     p8.join()
#     p7.start()
#     p7.join()
#     p6.start()
#     p6.join()
#     p5.start()
#     p5.join()
#     p4.start()
#     p4.join()
#     p3.start()
#     p3.join()
#     p2.start()
#     p2.join()
#     p1.start()
#     p1.join()
#     print("The number of CPU is:" + str(cpu_count()))
#     for p in active_children():
#         print("child   p.name:" + p.name + "\tp.id" + str(p.pid))
#     print ("END!!!!!!!!!!!!!!!!!")
    
# #     for num in range(10,0,-1):
# #         Process(target=m, args=(lock, resample(i))).start()        


PID is 54192


In [11]:

def f(l, i):
    l.acquire()
    try:
        print('hello world', i)
    finally:
        l.release()

if __name__ == '__main__':
    lock = Lock()

    for num in range(10):
        Process(target=f, args=(lock, num)).start()

hello world 0
hello world1 
hello world 2
hello world3 
hello world 4
hello world 5
hello world 6
hello world 7
hello world 8
hello world 9


In [31]:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()

main line
module name: __main__
parent process: 13043
process id: 32672
function f
module name: __main__
parent process: 32672
process id: 42836
hello bob
