In [1]:
import pandas as pd
# Read the tab-separated ratings file; column names are supplied explicitly via `names`.
ratings = pd.read_csv(
    "ml-100k/u.data",
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [2]:
# Column names for u.item, passed positionally to read_csv via `names`:
# five metadata fields followed by the 19 genre columns.
movie_columns = [
    # metadata
    'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    # genres
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# First pass over u.item: keep only the id and title columns.
movies = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    encoding='latin-1',
    names=movie_columns,
    usecols=['item_id', 'title'],
)
print(movies.head())

   item_id              title
0        1   Toy Story (1995)
1        2   GoldenEye (1995)
2        3  Four Rooms (1995)
3        4  Get Shorty (1995)
4        5     Copycat (1995)


In [3]:
# Reload u.item, this time keeping the id, the title and every genre column
# (positions 0 and 1, then 5 through 23 of `movie_columns`).
movies = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    encoding='latin-1',
    names=movie_columns,
    usecols=[0, 1, *range(5, 24)],
)
print(movies.head())


   item_id              title  unknown  Action  Adventure  Animation  \
0        1   Toy Story (1995)        0       0          0          1   
1        2   GoldenEye (1995)        0       1          1          0   
2        3  Four Rooms (1995)        0       0          0          0   
3        4  Get Shorty (1995)        0       1          0          0   
4        5     Copycat (1995)        0       0          0          0   

   Children  Comedy  Crime  Documentary  ...  Fantasy  Film-Noir  Horror  \
0         1       1      0            0  ...        0          0       0   
1         0       0      0            0  ...        0          0       0   
2         0       0      0            0  ...        0          0       0   
3         0       1      0            0  ...        0          0       0   
4         0       0      1            0  ...        0          0       0   

   Musical  Mystery  Romance  Sci-Fi  Thriller  War  Western  
0        0        0        0       0         0 

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Everything after the first two columns (item_id, title) is a genre column.
genre_features = movies.iloc[:, 2:].to_numpy()

# Pairwise cosine similarity between the movies' genre vectors
similarity_matrix = cosine_similarity(genre_features)

# Expected shape is (1682, 1682): one row and one column per movie
print(similarity_matrix.shape)


(1682, 1682)


In [None]:
def recommend_movies(movie_title, movies_df, similarity_matrix, top_n=5):
    """Recommend the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies_df['title']``. If several rows
        share the title, the first one is used.
    movies_df : pandas.DataFrame
        Must contain a ``title`` column. The row at *position* ``i`` must
        correspond to row/column ``i`` of ``similarity_matrix``.
    similarity_matrix : 2-D array-like
        ``similarity_matrix[i][j]`` is the similarity between the movies at
        positions ``i`` and ``j``.
    top_n : int, default 5
        Maximum number of titles to return. Values <= 0 give an empty list.

    Returns
    -------
    list of str, or str
        Titles ordered by descending similarity (ties keep their row order),
        never including the query movie itself. If the title is not present,
        the message ``"Movie '<title>' not found!"`` is returned instead.
    """
    titles = movies_df['title'].tolist()

    # Work with row positions rather than index labels: the similarity matrix
    # is positional, so labels are only correct for a default RangeIndex.
    try:
        query_pos = titles.index(movie_title)
    except ValueError:
        return f"Movie '{movie_title}' not found!"

    if top_n <= 0:
        return []

    scores = list(similarity_matrix[query_pos])

    # Exclude the query movie explicitly. Dropping "the first sorted entry"
    # is wrong when other movies tie with it (e.g. identical genre vectors):
    # a different movie gets dropped and the query recommends itself.
    candidates = [pos for pos in range(len(scores)) if pos != query_pos]

    # list.sort is stable even with reverse=True, so ties keep row order.
    candidates.sort(key=scores.__getitem__, reverse=True)

    return [titles[pos] for pos in candidates[:top_n]]

# Sanity check: print the recommendations for Toy Story
print(
    recommend_movies(
        movie_title="Toy Story (1995)",
        movies_df=movies,
        similarity_matrix=similarity_matrix,
    )
)

['Aladdin and the King of Thieves (1996)', 'Aladdin (1992)', 'Goofy Movie, A (1995)', 'Santa Clause, The (1994)', 'Home Alone (1990)']


Testing 10 random movies

In [19]:
import random

# Sample 10 movies with a fixed seed so that re-running the notebook shows the
# same titles; seeding from random.randint() gave a different sample each run.
random_movies = movies.sample(n=10, random_state=42)

# Print the recommendations for each sampled title
for movie_test in random_movies['title']:
    print(f"Testing : {movie_test}")
    print(recommend_movies(movie_test, movies, similarity_matrix))
    print("=====================================================")

Testing : Cool Hand Luke (1967)
['Eat Drink Man Woman (1994)', 'Ed Wood (1994)', "What's Eating Gilbert Grape (1993)", 'Welcome to the Dollhouse (1995)', 'Swingers (1996)']
Testing : Eye of Vichy, The (Oeil de Vichy, L') (1993)
['Hoop Dreams (1994)', 'Brother Minister: The Assassination of Malcolm X (1994)', 'Haunted World of Edward D. Wood Jr., The (1995)', 'Maya Lin: A Strong Clear Vision (1994)', 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)']
Testing : Beauty and the Beast (1991)
['Snow White and the Seven Dwarfs (1937)', 'All Dogs Go to Heaven 2 (1996)', 'Cinderella (1950)', 'Alice in Wonderland (1951)', 'Fantasia (1940)']
Testing : Specialist, The (1994)
['Under Siege (1992)', 'Steel (1997)', 'Sudden Death (1995)', 'Striking Distance (1993)', "Jackie Chan's First Strike (1996)"]
Testing : How to Make an American Quilt (1995)
['Angels and Insects (1995)', 'Mad Love (1995)', 'Phenomenon (1996)', "Breakfast at Tiffany's (1961)", 'Graduate, The (1967)']
Testing : Secon