In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import nltk
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')
print("done")

done


# Movies Dataset

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Print to confirm loading
print("Movies shape:", movies.shape)
print("Ratings shape:", ratings.shape)

Movies shape: (62423, 3)
Ratings shape: (25000095, 4)


# Ratings Dataset

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


# I.Collaborative filtering: Movie based

## Description
The model finds a movie's index and retrieves all user ratings for that movie. It then compares these ratings to other movies to identify similar ones and recommends them based on user preferences.

In [9]:
# merging the two datasets
df=pd.merge(ratings,movies,on='movieId')

print(df.shape)

df.head()

(25000095, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [10]:
# Take the first 3% of the data
num_rows = int(len(df) * 0.03)  # Calculate 3% of the total rows
df_short = df.iloc[:num_rows]  # Take the first 3% of the data
print("Sampled Data shape:", df_short.shape)

#df_short = df.sample(frac=0.002, random_state=42)  # 0.2% of data

total_nan = df_short.isna().sum().sum()
print(total_nan)
df_short.head()

Sampled Data shape: (750002, 6)
0


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [11]:
user_item_matrix=df_short.pivot_table(index='movieId',columns='userId',values='rating').fillna(0)
print("User-Item Matrix shape:", user_item_matrix.shape)
user_item_matrix.head()

User-Item Matrix shape: (20111, 5108)


userId,1,2,3,4,5,6,7,8,9,10,...,5099,5100,5101,5102,5103,5104,5105,5106,5107,5108
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,4.0,3.0,4.0,0.0,0.0,4.0,0.0,3.5,...,0.0,0.0,3.0,0.0,0.0,4.0,2.0,0.0,2.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# KNN

In [12]:
model_knn=NearestNeighbors(metric='cosine', algorithm = 'auto', n_neighbors=5, n_jobs=-1)
model_knn.fit(user_item_matrix)

In [13]:
def recommend_movies(movie_title, model = model_knn, user_item_matrix = user_item_matrix, number_recommendations = 5):
    movie_index = df_short[df_short['title'] == movie_title].index[0]
    print('movie_index:',movie_index)
    
    distances,indexes = model.kneighbors(user_item_matrix.iloc[movie_index].values.reshape(1, -1),number_recommendations, return_distance=True)
    print('indexes',indexes)
    
    recommended_movies=df_short.iloc[indexes.flatten()]
    return recommended_movies
recommend_movies('Toy Story (1995)')

movie_index: 70
indexes [[  70  531 3547  651  730]]


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
70,2,1,3.5,1141415820,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
531,3,8865,4.0,1439473526,Sky Captain and the World of Tomorrow (2004),Action|Adventure|Sci-Fi
3547,19,91658,2.0,1446298585,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller
651,3,59501,3.5,1566090550,"Chronicles of Narnia: Prince Caspian, The (2008)",Adventure|Children|Fantasy
730,3,81229,4.0,1439474063,Red (2010),Action|Comedy


# II.Collaborative filtering: Genre of movie based

Recommand movies based on the genres without taking into considerations the ratings and other attributes

In [14]:
movies_2=movies.copy()
movies_2['genres'] = movies_2['genres'].fillna('').str.split('|')

# Create genre features
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_2['genres'])

# Create the genre dataframe
genre_df = pd.DataFrame(genre_matrix,columns=mlb.classes_,index=movies_2.index)

# Combine movie information with genre encoding
movies_with_genres = pd.concat([movies, genre_df], axis=1)

print(movies_with_genres.shape)
movies_with_genres.head()

(62423, 23)


Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
movies_with_genres_short=movies_with_genres.drop(columns=['genres','title','movieId'])
movies_with_genres_short.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# KNN Model

In [16]:
knn_model_genres = NearestNeighbors(n_neighbors=5, algorithm='auto')
knn_model_genres.fit(movies_with_genres_short)

In [17]:
def recommend_movies_genre(movie_title, model=knn_model_genres, genre_matrix=movies_with_genres_short, number_recommendations=5):
    try:
        # Find the index of the movie by title
        movie_index = movies_with_genres[movies_with_genres['title'] == movie_title].index[0]
        print('Movie index:', movie_index)
        
        # Select the genre vector as a DataFrame to keep feature names consistent
        genre_vector_df = genre_matrix.iloc[[movie_index]]

        # Use the genre vector to find nearest neighbors
        distances, indexes = model.kneighbors(genre_vector_df, number_recommendations, return_distance=True)
        print('Indexes:', indexes)
        
        # Retrieve recommended movies by their indexes
        recommended_movies = movies_with_genres.iloc[indexes.flatten()]
        return recommended_movies[['title', 'genres']]
    
    except IndexError:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None

# Test the function
recommend_movies_genre('Toy Story (1995)')

Movie index: 0
Indexes: [[60800 52833 58039 55898  2203]]


Unnamed: 0,title,genres
60800,UglyDolls (2019),Adventure|Animation|Children|Comedy|Fantasy
52833,Trolls Holiday (2017),Adventure|Animation|Children|Comedy|Fantasy
58039,Here Comes the Grump (2018),Adventure|Animation|Children|Comedy|Fantasy
55898,Penguin Highway (2018),Adventure|Animation|Children|Comedy|Fantasy
2203,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy


In [18]:
import pickle

# Save the model
pickle_out = open("classifier.pkl", "wb")
pickle.dump(knn_model_genres, pickle_out)
pickle_out.close()

# III.Collaborative filtering: Genre and ratings based

In [19]:
combined_matrix = pd.concat([user_item_matrix, movies_with_genres_short], axis=1).fillna(0)
combined_matrix.columns = combined_matrix.columns.astype(str)
print(" shape user_item_matrix (ratings) :", user_item_matrix.shape)
print(" shape movies with genres :", movies_with_genres.shape)

print("Combined Matrix shape (ratings + genres):", combined_matrix.shape)
combined_matrix.head()

 shape user_item_matrix (ratings) : (20111, 5108)
 shape movies with genres : (62423, 23)
Combined Matrix shape (ratings + genres): (72041, 5128)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,3.5,4.0,3.0,4.0,0.0,0.0,4.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
#KNN
model_knn_combined = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=5, n_jobs=-1)
model_knn_combined.fit(combined_matrix)

In [21]:
def recommend_movies_with_genres(movie_title, model=model_knn_combined, combined_matrix=combined_matrix, number_recommendations=5):
    try:
        movie_index = df_short[df_short['title'] == movie_title].index[0]
        
        movie_row = combined_matrix.iloc[[movie_index]]
        
        # Get recommendations using KNN with both ratings and genres
        distances, indexes = model.kneighbors(movie_row, 
                                              number_recommendations, return_distance=True)

        recommended_movies = df_short.iloc[indexes.flatten()]
        return recommended_movies[['title', 'genres', 'rating']]
    
    except IndexError:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None

# Example usage
recommend_movies_with_genres('Toy Story (1995)')

Unnamed: 0,title,genres,rating
70,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.5
651,"Chronicles of Narnia: Prince Caspian, The (2008)",Adventure|Children|Fantasy,3.5
4767,Betrayed (1988),Drama|Thriller,2.5
806,World War Z (2013),Action|Drama|Horror|IMAX,3.5
749,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,4.0
