In [137]:
import os
import numpy as np
import pandas as pd

In [4]:
    def load_ratings(dataset_name):
        base_dir = '../webUI/static'
        save_dir = os.path.join(base_dir, dataset_name)
        ratings_file = os.path.join(save_dir, 'input.csv')
        ratings = pd.read_csv(ratings_file)
        return ratings

In [5]:
df = load_ratings("ml-10k")
np_ratings = df.to_numpy()

In [6]:
    def load_similarities(dataset_name, k=20):
        base_dir = '../webUI/static'
        save_dir = os.path.join(base_dir, dataset_name)
        similiraties_file = os.path.join(save_dir, 'similarities.npy')
        neighbors_file = os.path.join(save_dir, 'neighbors.npy')
        similarities = np.load(similiraties_file)
        neighbors = np.load(neighbors_file)
        return similarities[:,:k], neighbors[:,:k]

In [7]:
similarities, neighbors = load_similarities("ml-10k")

In [8]:
    def load_item_to_index(dataset_name):
        base_dir = '../webUI/static'
        save_dir = os.path.join(base_dir, dataset_name)
        item_to_index_file = os.path.join(save_dir, 'item_to_index.txt')
        item_to_index = {}
        with open(item_to_index_file, 'r') as f:
            for line in f:
                 (key, val) = line.split(':')
                 item_to_index[key] = int(val)

        return item_to_index

In [9]:
item_to_index = load_item_to_index("ml-10k")

In [10]:
    def load_unique_users(dataset_name):
        base_dir = '../webUI/static'
        save_dir = os.path.join(base_dir, dataset_name)
        unique_users_file = os.path.join(save_dir, 'unique_user_ids.txt')
        unique_users = []
        with open(unique_users_file, 'r') as f:
            for line in f:
                unique_users.append(line.strip())
        return unique_users

In [11]:
userids = load_unique_users("ml-10k")

In [12]:
print(userids)

['17291429.0', '23573258.0', '26432623.0', '52793396.0', '53633605.0', '55012109.0', '57226048.0', '58937384.0', '67057254.0', '67057735.0', '74488735.0', '76019825.0', '83578673.0', '93769355.0', '96787900.0', '103920991.0', '108653825.0', '109622737.0', '116373079.0', '118840122.0', '132482584.0', '142843689.0', '147606305.0', '156934864.0', '160874621.0', '161262801.0', '162378039.0', '174194590.0', '174228242.0', '174626103.0', '193359259.0', '199729912.0', '210650161.0', '216459207.0', '222887239.0', '225954017.0', '248298308.0', '248957242.0', '258435110.0', '262903158.0', '267419726.0', '281416109.0', '293450741.0', '302244949.0', '303440603.0', '304057912.0', '325251614.0', '356600138.0', '388340618.0', '393912760.0', '410479725.0', '422939499.0', '450221005.0']


In [13]:
    def candidate_items(userid):
        """
        :param userid : user id for which we wish to find candidate items
        :return : I_u, candidates
        """
        # 1. Finding the set I_u of items already rated by user userid
        I_u = np_ratings[np_ratings[:, 0] == userid]
        I_u = I_u[:, 1]

        # 2. Taking the union of similar items for all items in I_u to form the set of candidate items
        c = set()
        for iid in I_u:
            #get index of iid
            title_index = item_to_index[iid]
            # add the neighbors of item iid in the set of candidate items
            c.update(neighbors[title_index])

        c = list(c)
        # 3. exclude from the set C all items in I_u.
        candidates = np.setdiff1d(c, I_u, assume_unique=True)

        return I_u, candidates

In [14]:
    def similarity_with_Iu(c, I_u):
        """
        compute similarity between an item c and a set of items I_u. For each item i in I_u, get similarity between
        i and c, if c exists in the set of items similar to itemid.
        :param c : itemid of a candidate item
        :param I_u : set of items already purchased by a given user
        :return w : similarity between c and I_u
        """
        w = 0
        for iid in I_u :
            # get similarity between itemid and c, if c is one of the k nearest neighbors of itemid
            title_index = item_to_index[iid]
            if c in neighbors[title_index] :
                w = w + similarities[title_index, neighbors[title_index] == c][0]
        return w

In [15]:
    def rank_candidates(candidates, I_u):
        """
        rank candidate items according to their similarities with i_u
        :param candidates : list of candidate items
        :param I_u : list of items purchased by the user
        :return ranked_candidates : dataframe of candidate items, ranked in descending order of similarities with I_u
        """

        # list of candidate items mapped to their corresponding similarities to I_u
        sims = [similarity_with_Iu(c, I_u) for c in candidates]
        mapping = list(zip(candidates, sims))

        ranked_candidates = sorted(mapping, key=lambda couple:couple[1], reverse=True)
        return ranked_candidates

In [19]:
    def topn_recommendation(userid, N=15):
        """
        Produce top-N recommendation for a given user
        :param userid : user for which we produce top-N recommendation
        :param n : length of the top-N recommendation list
        :return topn
        """
        # find candidate items
        I_u, candidates = candidate_items(userid)

        # rank candidate items according to their similarities with I_u
        ranked_candidates = rank_candidates(candidates, I_u)

        # get the first N row of ranked_candidates to build the top N recommendation list
        topn = pd.DataFrame(ranked_candidates[:N], columns=['itemid','similarity_with_Iu'])
        df_item_index1 = pd.DataFrame(list(item_to_index.items()), columns=['track_title', 'itemid'])
        topn = pd.merge(topn, df_item_index1, on='itemid', how='inner')
        return topn

In [20]:
result = topn_recommendation(float(userids[0]), 15)

In [21]:
print(type(result))

<class 'pandas.core.frame.DataFrame'>


In [22]:
print(result.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   itemid              15 non-null     int64  
 1   similarity_with_Iu  15 non-null     float64
 2   track_title         15 non-null     object 
dtypes: float64(1), int64(1), object(1)
memory usage: 480.0+ bytes
None


In [27]:
for index, row in result.iterrows():
    print(row['itemid'], row['similarity_with_Iu'], row['track_title'])
    

2258 1.7891672158666703 Next to Me
1107 0.9999999999999999 Gambling Man
2900 0.9999999999999998 Somebody That I Used to Know
854 0.9955795027140815 Drunk
2087 0.9922778767136677 Melancholy Sky
612 0.9899494936611665 Commander
1420 0.9852117548196744 I Don't Dance
2966 0.9847835588179368 Standing Still
847 0.9805806756909201 Driving home for Christmas
473 0.9757162520935707 Burn It Down
3484 0.9747323185282745 V.E.N.O.M
384 0.9647638212377322 Blue Cassette
2329 0.9595689816501457 Nur noch kurz die Welt retten
677 0.9486832980505138 Dancing in the Dark
682 0.9486832980505138 Danza kuduro
