In [23]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score,precision_recall_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np
import os


In [24]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [25]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [26]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [27]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [28]:
import os
print(os.getpid())

50310


In [29]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.6610691547393799 seconds


In [30]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[0.0021206  0.0025118  0.00812398 0.00260971 0.00506422 0.00360756
 0.0011882  0.02260296 0.00131675 0.00308321 0.00065737 0.0016475
 0.00262124 0.01251009 0.00150383 0.01091646 0.02661984 0.00270073
 0.00154309 0.00058714 0.00213852 0.0050239  0.00433856 0.00441526
 0.00713473 0.05287494 0.00385681 0.00313741 0.00810544 0.00147735
 0.0014817  0.01127782 0.00798321 0.0010063  0.00323253 0.00123535
 0.08401195 0.0015726  0.00330375 0.00717042 0.00354771 0.00141058
 0.00319982 0.00096736 0.00192814 0.02649895 0.00107377 0.00194781
 0.00505103 0.00189646 0.0012827  0.00297903 0.00210274 0.01699128
 0.00221265 0.00325838 0.00281727 0.01145128 0.0047016  0.07246772
 0.04644429 0.00190383 0.00616485 0.00247489 0.01557392 0.00436669
 0.00272555 0.00222172 0.00113871 0.00213023 0.00195728 0.01775662
 0.00373525 0.00346921 0.00753746 0.00505399 0.00850366 0.0054245
 0.0071511  0.0011181  0.05913835 0.0021182  0.00411063 0.01256974
 0.00510731 0.00329401 0.00057871 0.00284256 0.00135605 0.002421

In [31]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0843638181686401
Inital memory usage was 0.002346 MB; Peak was 0.405612 MB; and difference is 0.40326599999999996 MB
The total time is 0.006367921829223633 seconds


In [None]:
# scikit-learn bootstrap
from sklearn.utils import resample
from multiprocessing import Process, Lock, cpu_count, active_children, Value
import resource
# data sample

resample_train_cbn = []
resample_test_cbn = []
# prepare bootstrap sample
for i in range(10,0,-1):
    from sklearn.utils import resample
    boot_uid = resample(uids, n_samples=int(uids.size * i / 10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i / 10), random_state=1)
#     print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    resample_train_cbn.append(train)
    resample_test_cbn.append(test)
#     print('---------------------')
#     print(len(resample_train_cbn))
#     print(len(resample_test_cbn))
#     print('---------------------')

def train_method(num_1,num_2):
    print(f"PID is {os.getpid()}")
    model = ExplicitFactorizationModel(n_iter=1)
    intial_time =  resource.getrusage(resource.RUSAGE_SELF); 
    model.fit(resample_train_cbn[num_1])
    final_time = resource.getrusage(resource.RUSAGE_SELF); 
    overall_time = final_time.ru_stime - intial_time.ru_stime
    print(overall_time)
    print(f"Root Mean Squared Error is {rmse_score(model, resample_test_cbn[num_2])}")
#     print(f"Root Mean Squared Error is {precision_recall_score(model, resample_test_cbn[num_2])}")
    #print(f"Root Mean Squared Error is {rmse_score(model, test)}")


#train_method(0,0)

if __name__ == '__main__':
    
    p10 = Process(target=train_method, args=(0,0,))
    p9 = Process(target=train_method, args=(1,1,))
    p8 = Process(target=train_method, args=(2,2,))
    p7 = Process(target=train_method, args=(3,3,))
    p6 = Process(target=train_method, args=(4,4,))
    p5 = Process(target=train_method, args=(5,5,))
    p4 = Process(target=train_method, args=(6,6,))
    p3 = Process(target=train_method, args=(7,7,))
    p2 = Process(target=train_method, args=(8,8,))
    p1 = Process(target=train_method, args=(9,9,))
    
    p10.start()
    p10.join()
    p9.start()
    p9.join()
    p8.start()
    p8.join()
    p7.start()
    p7.join()
    p6.start()
    p6.join()
    p5.start()
    p5.join()
    p4.start()
    p4.join()
    p3.start()
    p3.join()
    p2.start()
    p2.join()
    p1.start()
    p1.join()
#     print("The number of CPU is:" + str(cpu_count()))
#     for p in active_children():
#         print("child   p.name:" + p.name + "\tp.id" + str(p.pid))
#     print ("END!!!!!!!!!!!!!!!!!")
    
# #     for num in range(10,0,-1):
# #         Process(target=m, args=(lock, resample(i))).start()        


PID is 53740


In [None]:

def f(l, i):
    l.acquire()
    try:
        print('hello world', i)
    finally:
        l.release()

if __name__ == '__main__':
    lock = Lock()

    for num in range(10):
        Process(target=f, args=(lock, num)).start()

In [31]:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()

main line
module name: __main__
parent process: 13043
process id: 32672
function f
module name: __main__
parent process: 32672
process id: 42836
hello bob
