In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf

TEST_SET_THRESHOLD = 10
TEST_SET_HOLDOUT = 0.2

# Defining methods to create csr matrices

In [2]:
def classify_durations(data):
    data.loc[tracks['duration_sec'].isin(range(60)),'duration_sec'] = 1
    data.loc[tracks['duration_sec'].isin(range(60,120)), 'duration_sec'] = 2
    data.loc[tracks['duration_sec'].isin(range(120,180)), 'duration_sec'] = 3
    data.loc[tracks['duration_sec'].isin(range(180,240)), 'duration_sec'] = 4
    data.loc[tracks['duration_sec'].isin(range(240,300)), 'duration_sec'] = 5
    data.loc[tracks['duration_sec'].isin(range(300,200000)), 'duration_sec'] = 6

def build_urm_csr(data):
    fill_data = np.ones(data.shape[0])
    #posso usare gli id direttamente solo perchè come già detto sono consistenti
    row = data['playlist_id'].values
    col = data['track_id'].values
    n_pl = np.amax(data['playlist_id']) + 1
    n_tr = np.amax(data['track_id']) + 1
    
    return sparse.csr_matrix((fill_data, (row, col)), dtype=float, shape=(n_pl, n_tr))

def build_icm_csr(data):
    
    classify_durations(data)
    
    albums_id = data['album_id']
    artists_id = data['artist_id']
    durations = data['duration_sec']
    tracks = data['track_id']
    
    albums_max = np.amax(albums_id)
    artists_max = np.amax(artists_id)
    durations_max = np.amax(durations)
    number_of_songs = data.shape[0]
    
    icm_csr_matrix = sparse.csr_matrix((number_of_songs, albums_max + artists_max + durations_max + 3), dtype=np.uint32)
    
    icm_csr_matrix[tracks,albums_id] = 1
    icm_csr_matrix[tracks, albums_max + artists_id] = 1
    icm_csr_matrix[tracks, albums_max + artists_max + durations] = 1

    return icm_csr_matrix

# SPLITTING DATASET

In [3]:

def create_targets(training_set):
    training_set = training_set
    grouped = training_set.groupby('playlist_id')['track_id'].nunique()
    clipped = grouped.index[grouped>10]
    nn_target_groups = training_set.loc[training_set['playlist_id'].isin(clipped)]

    nn_target_set = pd.DataFrame(columns=["playlist_id","track_id"])

    for name, group in nn_target_groups.groupby('playlist_id'):
        tail_group = group.tail(10)
        nn_target_set = nn_target_set.append(tail_group)

    training_set = training_set[training_set["playlist_id"].isin(clipped.tolist())]
    training_set = pd.concat([training_set, nn_target_set, nn_target_set]).drop_duplicates(keep=False)
    training_set.to_csv("nn_dataset/training_set.csv", index = False, header = ["playlist_id", "track_id"])
    nn_target_set.to_csv("nn_dataset/target_set.csv", index = False, header = ["playlist_id", "track_id"])
    return training_set

def open_training_set():
    return pd.read_csv('../nn_dataset/training_set.csv')

def split_dataset(train, threshold, holdout):
    grouped = train.groupby('playlist_id')['track_id'].nunique()
    clipped = grouped.index[grouped>threshold].tolist()
    test_set_indices = [ clipped[i] for i in sorted(random.sample(range(len(clipped)), int(holdout*len(clipped)))) ]
    test_groups = train.loc[train['playlist_id'].isin(test_set_indices)]
    
    test_set = pd.DataFrame(columns=["playlist_id","track_id"])
    for name, group in test_groups.groupby('playlist_id'):
        test_set = test_set.append(group.tail(10))

    training_set = pd.concat([train, test_set, test_set]).drop_duplicates(keep=False)
    test_set_playlists = test_set['playlist_id'].unique()
    return training_set, test_set, test_set_playlists


In [4]:
#tracks = pd.read_csv('../input/tracks.csv')
train = pd.read_csv('../input/train.csv')
target = pd.read_csv('../input/target_playlists.csv')

In [5]:
training_set, test_set, test_set_playlists = split_dataset(create_targets(train), TEST_SET_THRESHOLD,TEST_SET_HOLDOUT)

In [22]:
train = pd.read_csv('nn_dataset/training_set.csv')
target_set = pd.read_csv("nn_dataset/target_set.csv")
training_set, test_set, test_set_playlists = split_dataset(train, TEST_SET_THRESHOLD, TEST_SET_HOLDOUT)

In [23]:
urm_csr = build_urm_csr(training_set)
test_set_csr = build_urm_csr(test_set)
target_csr = build_urm_csr(target_set)


# EVALUATION METHODS

In [21]:
def precision(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):
    
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def MAP(recommended_items, relevant_items):
       
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score


def evaluate_algorithm(URM_test, recommender_object, target_playlists, at=10):
    
    
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    
    num_eval = 0


    result = []
    
    for user_id in target_playlists:
    
        target_items = URM_test.getrow(user_id).indices
        
        recommended_items = recommender_object.recommend(user_id, at=at)
        num_eval+=1
        
        cumulative_precision += precision(recommended_items, target_items)
        cumulative_recall += recall(recommended_items, target_items)
        cumulative_MAP += MAP(recommended_items, target_items)
        
        recommendation_string = " ".join(str(i) for i in recommended_items)
        temp = [user_id,recommendation_string]
        result.append(temp)


    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval
    
    rec = pd.DataFrame(result)
    rec.to_csv("sample_submission.csv", index = False, header = ["playlist_id", "track_ids"])
    
    print("Recommender performance is: Precision = {:.6f}, Recall = {:.6f}, MAP = {:.6f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP)) 

# ALGORITHM

In [116]:
class NeuralNet(object):
    
    def __init__(self, URM_csr, target_csr):
        self.urm = URM_csr.nonzero()
        self.targets = target_csr.nonzero()
        self.target_indices = np.unique(self.targets[0])
        
        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Dense(10000,input_dim = 20635, activation="relu"))
        self.model.add(tf.keras.layers.Dense(1000,activation="relu"))
        self.model.add(tf.keras.layers.Dense(20635,activation="softmax"))
        
        self.optimizer_rmsprop = tf.keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
        self.model.compile(loss="mean_squared_error", optimizer=self.optimizer_rmsprop, metrics=['accuracy'])
        
    def fit(self):
        for epoch in range(5):
            print("EPOCH " + str(epoch))
            for i in range(len(self.target_indices)):
                print("Training data number " + str(i))
                target = self.target_indices[i]
                input_masked = (self.urm[0] == target)*self.urm[1]
                output_masked = (self.targets[0] == target)*self.targets[1]
                input_data = input_masked[input_masked.nonzero()]
                output_data = output_masked[output_masked.nonzero()]
                x = np.zeros(20635, dtype = np.uint32)
                x[input_data] = 1
                x = x.reshape(-1, 20635)
                y = np.zeros(20635, dtype = np.uint32)
                y = y.reshape(-1, 20635)
                self.model.fit(x, y, epochs = 1)
        
    def recommend(self, user_id, at=10, remove_seen=True):
        user = self.URM_csr.getrow(user_id)
        if(user.size > self.k):
            itemPopularity = user.dot(self.item_similarities)
            popularItems = np.argsort(np.array(itemPopularity.todense())[0])
            popularItems = np.flip(popularItems, axis = 0)
        else:
            sim = self.user_similarities.getrow(user_id)
            itemPopularity = sim.dot(self.URM_csr)
            popularItems = np.argsort(np.array(itemPopularity.todense())[0])
            popularItems = np.flip(popularItems, axis = 0)
        
        if remove_seen:
            unseen_items_mask = np.in1d(popularItems, self.URM_csr[user_id].indices,
                                        assume_unique=True, invert = True)

            unseen_items = popularItems[unseen_items_mask]
            
            recommended_items = unseen_items[0:at]

        else:
            recommended_items = popularItems[0:at]
            
        #recommended_items = " ".join(str(i) for i in recommended_items)
        return recommended_items

# Testing algorithm

In [117]:
nn = NeuralNet(urm_csr, target_csr)
nn.fit()
#ensemble.fit(urm_csr, K_THRESHOLD_BEST)
#evaluate_algorithm(test_set_csr, ensemble, test_set_playlists)


EPOCH 0
Training data number 0
Epoch 1/1


ResourceExhaustedError: OOM when allocating tensor with shape[20365,10000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense/kernel/Initializer/random_uniform/RandomUniform = RandomUniform[T=DT_INT32, _class=["loc:@dense/kernel/Assign"], dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense/kernel/Initializer/random_uniform/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'dense/kernel/Initializer/random_uniform/RandomUniform', defined at:
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\asyncio\base_events.py", line 427, in run_forever
    self._run_once()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\asyncio\base_events.py", line 1440, in _run_once
    handle._run()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-e344808f3d9c>", line 1, in <module>
    nn = NeuralNet()
  File "<ipython-input-25-b1eed32f0ecb>", line 6, in __init__
    self.model.add(tf.keras.layers.Dense(10000,activation="relu"))
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\_impl\keras\engine\sequential.py", line 185, in add
    output_tensor = layer(self.outputs[0])
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\_impl\keras\engine\base_layer.py", line 314, in __call__
    output = super(Layer, self).__call__(inputs, *args, **kwargs)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\layers\base.py", line 699, in __call__
    self.build(input_shapes)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\layers\core.py", line 138, in build
    trainable=True)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\layers\base.py", line 546, in add_variable
    partitioner=partitioner)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\checkpointable.py", line 436, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1317, in get_variable
    constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1079, in get_variable
    constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 425, in get_variable
    constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 394, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 786, in _get_single_variable
    use_resource=use_resource)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2220, in variable
    use_resource=use_resource)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2210, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2193, in default_variable_creator
    constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variables.py", line 235, in __init__
    constraint=constraint)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variables.py", line 343, in _init_from_args
    initial_value(), name="initial_value", dtype=dtype)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 770, in <lambda>
    shape.as_list(), dtype=dtype, partition_info=partition_info)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\init_ops.py", line 472, in __call__
    shape, -limit, limit, dtype, seed=self.seed)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\random_ops.py", line 242, in random_uniform
    rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\gen_random_ops.py", line 673, in random_uniform
    name=name)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 3392, in create_op
    op_def=op_def)
  File "C:\Users\giuse\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[20365,10000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense/kernel/Initializer/random_uniform/RandomUniform = RandomUniform[T=DT_INT32, _class=["loc:@dense/kernel/Assign"], dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense/kernel/Initializer/random_uniform/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



# TESTING SINGLE ITERATION

In [35]:
target_csr.nonzero()[1][:10].shape

(10,)

In [96]:
target_indices = target_csr.nonzero()[0]
target = target_indices[0]
urm = np.array(urm_csr.nonzero(), dtype= np.uint32)
x = (urm[0] == target)*urm[1]

In [104]:
x = x[x.nonzero()]
data = np.zeros(20635, dtype= np.uint32)
data[x] = 1

In [105]:
data


array([0, 0, 0, ..., 0, 0, 0], dtype=uint32)