In [1]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score,precision_recall_score,sequence_mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np
import os


In [2]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)


In [3]:
dataset


<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [4]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [5]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [6]:
import os
print(os.getpid())

55253


In [7]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
print(train)
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



<Interactions dataset (610 users x 9724 items x 80668 interactions)>
The total time is 1.6982049942016602 seconds


In [8]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[1.97830028e-03 1.32348464e-03 4.26648429e-04 4.25224809e-03
 4.04760911e-03 1.64795621e-02 3.58404429e-03 4.01385307e-02
 2.26558484e-03 4.37119993e-03 2.50656350e-03 1.52048584e-03
 2.09952994e-03 1.06512674e-02 1.35801667e-02 1.49458877e-02
 9.34945142e-03 2.00936699e-03 1.08172499e-03 3.12793302e-03
 1.57944407e-03 3.94716387e-03 1.11355405e-02 1.16505311e-03
 1.33868809e-03 8.24520801e-03 2.95887170e-03 6.84064013e-03
 4.33439251e-03 4.18660170e-03 2.38551301e-03 5.69734319e-03
 1.71671831e-03 1.37671105e-03 5.59044678e-03 6.21054373e-03
 1.24529849e-03 1.18882827e-02 6.32633473e-03 2.58366039e-03
 5.68765388e-03 2.25031581e-03 1.89176384e-03 2.04665838e-03
 2.24350644e-03 2.91154436e-03 1.63015419e-03 1.96878542e-02
 9.70455468e-04 3.68756306e-03 2.73579448e-03 2.39986100e-03
 5.31918742e-03 1.64280170e-02 1.43445934e-03 1.06611341e-02
 5.11186874e-03 5.29947098e-03 6.98755270e-03 1.20304728e-02
 1.73025388e-02 1.70905278e-03 4.06617680e-03 2.53961798e-03
 3.34927265e-03 4.428492

In [9]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0671308040618896
Inital memory usage was 0.004634 MB; Peak was 0.4079 MB; and difference is 0.40326599999999996 MB
The total time is 0.006567955017089844 seconds


In [None]:
# scikit-learn bootstrap
from sklearn.utils import resample
from multiprocessing import Process, Lock, cpu_count, active_children, Value
import resource
# data sample

resample_train_cbn = []
resample_test_cbn = []
# prepare bootstrap sample
for i in range(10,0,-1):
    from sklearn.utils import resample
    boot_uid = resample(uids, n_samples=int(uids.size * i / 10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * i / 10), random_state=1)
#     print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    resample_train_cbn.append(train)
    resample_test_cbn.append(test)
#     print('---------------------')
#     print(len(resample_train_cbn))
#     print(len(resample_test_cbn))
#     print('---------------------')

def train_method(num_1,num_2):
    print(num_1,num_2)
    print(f"PID is {os.getpid()}")
#     for line in open("/proc/%d/status" % os.getpid()).readlines():
#         print(line)
# #         if line.startswith("State:"):
# #             return line.split(":",1)[1].strip().split(' ')[0]

    model = ExplicitFactorizationModel(n_iter=1)
    print(1)
    intial_time =  resource.getrusage(resource.RUSAGE_SELF); 
    print(2)
    print(resample_train_cbn[num_1])
    model.fit(resample_train_cbn[num_1])
    print(3)
    final_time = resource.getrusage(resource.RUSAGE_SELF); 
    print(4)
    overall_time_s = final_time.ru_stime - intial_time.ru_stime
    overall_time_u = final_time.ru_utime - intial_time.ru_utime
    print(5)
    print(f"This process‘s system running time is {overall_time_s}")
    print(f"This process‘s user running time is {overall_time_u}")
    print(f"Root Mean Squared Error is {rmse_score(model, resample_test_cbn[num_2])}")
#     print(f"Root Mean Squared Error is {precision_recall_score(model, resample_test_cbn[num_2])}")



# train_method(0,0)

if __name__ == '__main__':
    
    p10 = Process(target=train_method, args=(0,0,))
    p9 = Process(target=train_method, args=(1,1,))
    p8 = Process(target=train_method, args=(2,2,))
    p7 = Process(target=train_method, args=(3,3,))
    p6 = Process(target=train_method, args=(4,4,))
    p5 = Process(target=train_method, args=(5,5,))
    p4 = Process(target=train_method, args=(6,6,))
    p3 = Process(target=train_method, args=(7,7,))
    p2 = Process(target=train_method, args=(8,8,))
    p1 = Process(target=train_method, args=(9,9,))
    
    p10.start()
    p10.join()
    p9.start()
    p9.join()
    p8.start()
    p8.join()
    p7.start()
    p7.join()
    p6.start()
    p6.join()
    p5.start()
    p5.join()
    p4.start()
    p4.join()
    p3.start()
    p3.join()
    p2.start()
    p2.join()
    p1.start()
    p1.join()
#     print("The number of CPU is:" + str(cpu_count()))
#     for p in active_children():
#         print("child   p.name:" + p.name + "\tp.id" + str(p.pid))
#     print ("END!!!!!!!!!!!!!!!!!")
    
# #     for num in range(10,0,-1):
# #         Process(target=m, args=(lock, resample(i))).start()        


0 0
PID is 55242
1
2
<Interactions dataset (609 users x 609 items x 56628 interactions)>


##### def f(l, i):
    l.acquire()
    try:
        print('hello world', i)
    finally:
        l.release()

if __name__ == '__main__':
    lock = Lock()

    for num in range(10):
        Process(target=f, args=(lock, num)).start()

In [31]:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()

main line
module name: __main__
parent process: 13043
process id: 32672
function f
module name: __main__
parent process: 32672
process id: 42836
hello bob
