In [21]:
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,mrr_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
import pandas as pd
import numpy as np

In [22]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movies_df = pd.read_csv("ml-latest-small/movies.csv")

from collections import defaultdict
from itertools import count

uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,item_ids=iids,ratings=ratings,timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, test_percentage=0.2)

In [23]:
dataset

<Interactions dataset (610 users x 9724 items x 100836 interactions)>

In [24]:
train

<Interactions dataset (610 users x 9724 items x 80668 interactions)>

In [25]:
test

<Interactions dataset (610 users x 9724 items x 20168 interactions)>

In [26]:
import os
print(os.getpid())

32672


In [27]:
import tracemalloc
import time


#tracemalloc.start()
model = ExplicitFactorizationModel(n_iter=1)
intial_time = time.time()
model.fit(train)
#current, peak = tracemalloc.get_traced_memory()

#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

tracemalloc.stop()
#snapshot2 = tracemalloc.take_snapshot()



The total time is 1.754486083984375 seconds


In [28]:
#tracemalloc.start()
intial_time = time.time()
print(mrr_score(model, test))
#current, peak = tracemalloc.get_traced_memory()
print(os.getpid())
#print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")


[0.00421241 0.00426889 0.00156217 0.00748395 0.00312102 0.00317642
 0.00247378 0.03694599 0.01453661 0.00217685 0.0012112  0.00232136
 0.00162011 0.0184774  0.00170536 0.00558069 0.02421537 0.00750212
 0.00109846 0.00198895 0.00254249 0.00406532 0.00287336 0.00164356
 0.01045416 0.03829464 0.00163038 0.00290702 0.01081976 0.01338427
 0.01565582 0.00296544 0.00408809 0.00099209 0.00209141 0.00306043
 0.00167603 0.00693268 0.00260071 0.00108315 0.001981   0.00216063
 0.0021661  0.00307275 0.00193258 0.00104411 0.00539095 0.00230377
 0.00116495 0.00367558 0.00084112 0.00960837 0.00266198 0.01142953
 0.0006656  0.0088644  0.00418857 0.00669149 0.00518044 0.00051125
 0.00378633 0.01075055 0.00307789 0.00186261 0.00518901 0.00431731
 0.01439185 0.00299818 0.00168706 0.0016071  0.00333651 0.02379733
 0.00133408 0.00517919 0.00279277 0.00324594 0.00530026 0.00487701
 0.0024202  0.00190556 0.01883178 0.00122136 0.0053982  0.01199843
 0.00582057 0.00342545 0.00301387 0.00642007 0.00058013 0.0005

In [29]:
tracemalloc.start()


tracemalloc.start()
intial_time = time.time()
print(f"Root Mean Squared Error is {rmse_score(model, test)}")
current, peak = tracemalloc.get_traced_memory()

print(f"Inital memory usage was {current / 10**6} MB; Peak was {peak / 10**6} MB; and difference is {(peak / 10**6) - (current / 10**6)} MB")
print(f"The total time is {time.time() - intial_time} seconds")

Root Mean Squared Error is 1.0642986297607422
Inital memory usage was 0.001826 MB; Peak was 0.40554 MB; and difference is 0.403714 MB
The total time is 0.007669925689697266 seconds


In [63]:
# scikit-learn bootstrap
from sklearn.utils import resample
from multiprocessing import Process, Lock, cpu_count, active_children
# data sample

# prepare bootstrap sample
def resample_10():
    boot_uid = resample(uids, n_samples=int(uids.size), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")

def resample_9():
    boot_uid = resample(uids, n_samples=int(uids.size * 9 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 9 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")

def resample_8():
    boot_uid = resample(uids, n_samples=int(uids.size * 8 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 8 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")

def resample_7():
    boot_uid = resample(uids, n_samples=int(uids.size * 7 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 7 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_6():
    boot_uid = resample(uids, n_samples=int(uids.size * 6 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 6 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_5():
    boot_uid = resample(uids, n_samples=int(uids.size * 5 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 5 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_4():
    boot_uid = resample(uids, n_samples=int(uids.size * 4 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 4 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_3():
    boot_uid = resample(uids, n_samples=int(uids.size * 3 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 3 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_2():
    boot_uid = resample(uids, n_samples=int(uids.size * 2 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 2 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    
def resample_1():
    boot_uid = resample(uids, n_samples=int(uids.size * 1 /10), random_state=1)
    boot_iid = resample(iids, n_samples=int(iids.size * 1 /10), random_state=1)
    print(boot_uid.size)
    int_uid = []
    int_iid = []
    int_ratings = []
    int_timestamps = []
    
    temp_sample = ratings_df[ratings_df['userId'].isin(boot_uid)]
    final_sample = temp_sample[temp_sample['movieId'].isin(boot_iid)]
    #print(final_sample)
    int_uid = np.array([uid_map[uid] for uid in final_sample["userId"].values ], dtype=np.int32)
    int_iid = np.array([iid_map[iid] for iid in final_sample["movieId"].values ], dtype=np.int32)
    int_ratings = final_sample['rating'].values.astype(np.float32)
    int_timestamps = final_sample['timestamp'].values.astype(np.int32)

    dataset_boot = Interactions(user_ids=int_uid,item_ids=int_uid,ratings=int_ratings,timestamps=int_timestamps)
    train, test = random_train_test_split(dataset_boot, test_percentage=0.2)
    model = ExplicitFactorizationModel(n_iter=1)
    #intial_time = time.time()
    model.fit(train)
    print(f"Root Mean Squared Error is {rmse_score(model, test)}")
    

if __name__ == '__main__':
    p10 = Process(target=resample_10, args=())
    p9 = Process(target=resample_9, args=())
    p8 = Process(target=resample_8, args=())
    p7 = Process(target=resample_7, args=())
    p6 = Process(target=resample_6, args=())
    p5 = Process(target=resample_5, args=())
    p4 = Process(target=resample_4, args=())
    p3 = Process(target=resample_3, args=())
    p2 = Process(target=resample_2, args=())
    p1 = Process(target=resample_1, args=())
    
    p10.start()
    p9.start()
    p8.start()
    p7.start()
    p6.start()
    p5.start()
    p4.start()
    p3.start()
    p2.start()
    p1.start()
    print("The number of CPU is:" + str(cpu_count()))
    for p in active_children():
        print("child   p.name:" + p.name + "\tp.id" + str(p.pid))
    print ("END!!!!!!!!!!!!!!!!!")
    
#     for num in range(10,0,-1):
#         Process(target=m, args=(lock, resample(i))).start()        


100836
90752
80668
7058560501

5041840334

30250The number of CPU is:8

20167

NameError: name 'multiprocessing' is not defined


10083


In [32]:
from multiprocessing import Process, Lock

def f(l, i):
    l.acquire()
    try:
        print('hello world', i)
    finally:
        l.release()

if __name__ == '__main__':
    lock = Lock()

    for num in range(10):
        Process(target=f, args=(lock, num)).start()

hello world 0
hello world 1
hello world 2
hello world 3
hello world 4
hello world 5
hello world 6
hello world 7
hello world 8
hello world 9


In [31]:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()

main line
module name: __main__
parent process: 13043
process id: 32672
function f
module name: __main__
parent process: 32672
process id: 42836
hello bob
