In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter -- it also hides warnings that may
# point at real problems (deprecations, parser fallbacks); consider narrowing it.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Load the three '::'-delimited, header-less data files (presumably the
# MovieLens 1M dump -- confirm against the data directory).
# A separator longer than one character is not supported by pandas' default C
# parser; without engine='python' pandas emits a ParserWarning and falls back
# to the Python engine anyway, so request that engine explicitly.
movies = pd.read_csv('../data/movies.dat', sep='::', engine='python', header=None, names=['movie_id', 'title', 'genres'], encoding='ISO-8859-1')
ratings = pd.read_csv('../data/ratings.dat', sep='::', engine='python', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='ISO-8859-1')
users = pd.read_csv('../data/users.dat', sep='::', engine='python', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], encoding='ISO-8859-1')

In [3]:
def generate_arms(movies, ratings, n=10):
    """Build each user's candidate movies ("arms") from genre similarity.

    Every movie's ``genres`` string is turned into a TF-IDF vector. A user's
    preference vector is the sum of the vectors of the movies that user has
    rated, and the user's arms are the ``n`` movies whose genre vectors have
    the highest cosine similarity to that preference vector.

    NOTE(review): the rating values themselves are not used -- every rated
    movie contributes equally -- and movies the user has already rated are
    not excluded from the result.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must provide ``movie_id`` and ``genres`` columns.
    ratings : pandas.DataFrame
        Must provide ``user_id`` and ``movie_id`` columns.
    n : int, default 10
        Number of arms to keep per user. If fewer than ``n`` movies exist,
        all of them are returned.

    Returns
    -------
    dict
        Maps every user id found in ``ratings`` (in order of first
        appearance) to a list of movie ids, most similar first.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # Define the TfidfVectorizer
    tfidf = TfidfVectorizer(stop_words='english')

    # Fit and transform the genres column of the movies dataframe
    movies_genres_tfidf = tfidf.fit_transform(movies['genres'])

    movie_ids = movies['movie_id'].to_numpy()

    # Resolve every rating to the row of its movie in the TF-IDF matrix with a
    # single inner merge (ratings whose movie is unknown drop out), instead of
    # re-filtering, re-merging and re-vectorising the genres once per user.
    movie_rows = movies[['movie_id']].assign(movie_row=np.arange(len(movies)))
    rated = ratings[['user_id', 'movie_id']].merge(movie_rows, on='movie_id')
    rows_by_user = {
        user_id: rows.to_numpy()
        for user_id, rows in rated.groupby('user_id', sort=False)['movie_row']
    }
    no_rows = np.empty(0, dtype=np.intp)

    # Loop over each unique user ID
    arms = {}
    for user_id in ratings['user_id'].unique():

        # Compute the user preference vector: the sum of the TF-IDF rows of
        # the movies this user rated (all zeros when none could be matched)
        rated_rows = rows_by_user.get(user_id, no_rows)
        user_preference_vector = np.asarray(
            movies_genres_tfidf[rated_rows].sum(axis=0)
        ).reshape(1, -1)

        # Compute the cosine similarity between the user preference vector and the movie feature vectors
        similarity_scores = cosine_similarity(user_preference_vector, movies_genres_tfidf)

        # Get the row indices of the top n movies with the highest similarity scores.
        # Reversing before slicing keeps n == 0 from selecting every movie.
        top_movie_indices = similarity_scores.argsort()[0][::-1][:n]

        # Store the top movie ids as the user's arms
        arms[user_id] = list(movie_ids[top_movie_indices])

    return arms

In [4]:
# Compute the arms for every user in `ratings`; as the cell's last expression
# the whole user -> movie-id-list dict is echoed as the cell output.
# NOTE(review): the result is not bound to a name, so it cannot be reused by later cells.
generate_arms(movies, ratings)

{1: [588, 2078, 2102, 2080, 2081, 1566, 48, 594, 783, 2092],
 2: [390, 1916, 21, 1134, 3432, 2411, 2410, 2409, 3430, 1112],
 3: [2880, 2879, 552, 2735, 2723, 2405, 380, 1197, 2406, 2265],
 4: [1876, 3864, 2640, 316, 2275, 2528, 1356, 1374, 2643, 2094],
 5: [3407, 3500, 1734, 1723, 3477, 1701, 133, 3495, 3504, 3543],
 6: [496, 195, 2675, 3261, 1542, 2065, 1172, 2145, 1817, 224],
 7: [1036, 98, 2763, 165, 3316, 2334, 1869, 2490, 3555, 1769],
 8: [789, 1956, 1942, 1943, 3195, 3194, 3192, 3191, 1949, 1952],
 9: [1648, 3227, 218, 1476, 3506, 984, 72, 3778, 75, 2289],
 10: [472, 450, 166, 3083, 1045, 794, 372, 570, 2238, 371],
 11: [3543, 3794, 2977, 970, 3308, 472, 371, 2750, 106, 1279],
 12: [3266, 1918, 2000, 2001, 3184, 2002, 1473, 3362, 403, 1809],
 13: [3698, 173, 1356, 480, 1371, 1372, 1373, 2641, 1374, 1375],
 14: [11, 2675, 2506, 1817, 2626, 2248, 1211, 1888, 3108, 2426],
 15: [3227, 1648, 390, 1916, 21, 1480, 1626, 51, 292, 591],
 16: [852, 3358, 2597, 3743, 597, 3739, 2029, 2834, 