### Movie Recommender Analysis
### Abed Tabbalat

In [1]:
# Import Packages
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Read the movie catalogue and the user ratings from their CSV files
# (point both paths at wherever the data actually lives).
movies_path = 'Movies.csv'
ratings_path = 'Ratings.csv'
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)

##### Movies Dataframe

In [4]:
# Summarise the movies frame: column names, non-null counts, dtypes and memory usage
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [5]:
# Summary statistics for the numeric columns of the movies frame.
# movieId is the only numeric column, so these figures describe an
# identifier rather than a measurement.
movies_df.describe()

Unnamed: 0,movieId
count,58098.0
mean,111919.516197
std,59862.660956
min,1.0
25%,72437.75
50%,126549.0
75%,161449.5
max,193886.0


In [7]:
# Display the movies frame (columns: movieId, title, genres)
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


##### Ratings Dataframe

In [8]:
# Summarise the ratings frame: column names, dtypes and memory usage
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


In [9]:
# Summary statistics for the numeric columns of the ratings frame
# (userId, movieId, rating, timestamp); only `rating` is a true measurement.
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,27753440.0,27753440.0,27753440.0,27753440.0
mean,141942.0,18488.0,3.530445,1193122000.0
std,81707.4,35102.63,1.066353,216048200.0
min,1.0,1.0,0.5,789652000.0
25%,71176.0,1097.0,3.0,998605300.0
50%,142022.0,2716.0,3.5,1174256000.0
75%,212459.0,7150.0,4.0,1422744000.0
max,283228.0,193886.0,5.0,1537945000.0


In [10]:
# Display the ratings frame (columns: userId, movieId, rating, timestamp)
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
...,...,...,...,...
27753439,283228,8542,4.5,1379882795
27753440,283228,8712,4.5,1379882751
27753441,283228,34405,4.5,1379882889
27753442,283228,44761,4.5,1354159524


##### Analysis

In [11]:
# Attach each rating's movie title and genres by joining the two frames
# on their shared movieId key
df = ratings_df.merge(movies_df, on='movieId')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
1,6,307,4.0,832059248,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
2,56,307,4.0,1383625728,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,71,307,5.0,1257795414,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
4,84,307,3.0,999055519,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
...,...,...,...,...,...,...
27753439,282403,167894,1.0,1524243885,Stranglehold (1994),Action
27753440,282732,161572,3.5,1504408070,The Great Houdini (1976),Drama
27753441,283000,117857,3.5,1417317969,Hotline (2014),Documentary
27753442,283000,133409,3.5,1431539331,Barnum! (1986),(no genres listed)


In [14]:
# Keep only the ratings that belong to the 1000 most frequently rated movies
rating_counts = ratings_df['movieId'].value_counts()
top_movies = rating_counts.index[:1000]
df = df.loc[df['movieId'].isin(top_movies)]
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
1,6,307,4.0,832059248,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
2,56,307,4.0,1383625728,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,71,307,5.0,1257795414,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
4,84,307,3.0,999055519,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
...,...,...,...,...,...,...
24976553,283101,2366,2.5,1092337860,King Kong (1933),Action|Adventure|Fantasy|Horror
24976554,283116,2366,4.0,1000820168,King Kong (1933),Action|Adventure|Fantasy|Horror
24976555,283153,2366,4.0,1047143351,King Kong (1933),Action|Adventure|Fantasy|Horror
24976556,283187,2366,3.5,1397341537,King Kong (1933),Action|Adventure|Fantasy|Horror


In [15]:
# Reshape the long ratings table into a movie x user grid: one row per movieId,
# one column per userId, the rating as the cell value, and 0 where a user has
# not rated a movie.
pivot_spec = {'index': 'movieId', 'columns': 'userId', 'values': 'rating'}
movie_user_matrix = df.pivot(**pivot_spec).fillna(0)
movie_user_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,283219,283220,283221,283222,283223,283224,283225,283226,283227,283228
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Map every movie title to the row position of that movie in movie_user_matrix.
# Titles are looked up by movieId in the same order as the matrix rows, so the
# position in this sequence is the matrix row index.
titles_in_matrix_order = list(
    movies_df.set_index('movieId').loc[movie_user_matrix.index, 'title']
)
hashmap = dict(zip(titles_in_matrix_order, range(len(titles_in_matrix_order))))
hashmap

{'Toy Story (1995)': 0,
 'Jumanji (1995)': 1,
 'Grumpier Old Men (1995)': 2,
 'Father of the Bride Part II (1995)': 3,
 'Heat (1995)': 4,
 'Sabrina (1995)': 5,
 'GoldenEye (1995)': 6,
 'American President, The (1995)': 7,
 'Casino (1995)': 8,
 'Sense and Sensibility (1995)': 9,
 'Ace Ventura: When Nature Calls (1995)': 10,
 'Get Shorty (1995)': 11,
 'Copycat (1995)': 12,
 'Powder (1995)': 13,
 'Leaving Las Vegas (1995)': 14,
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)': 15,
 'Dangerous Minds (1995)': 16,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': 17,
 'Babe (1995)': 18,
 'Dead Man Walking (1995)': 19,
 'Clueless (1995)': 20,
 'Mortal Kombat (1995)': 21,
 'To Die For (1995)': 22,
 'Seven (a.k.a. Se7en) (1995)': 23,
 'Pocahontas (1995)': 24,
 'Usual Suspects, The (1995)': 25,
 'Mighty Aphrodite (1995)': 26,
 'Postman, The (Postino, Il) (1994)': 27,
 'Indian in the Cupboard, The (1995)': 28,
 "Mr. Holland's Opus (1995)": 29,
 'From Dusk Till Dawn (1996)': 30,
 'Br

In [17]:
# Store the (mostly zero) movie x user grid in compressed sparse row format
dense_ratings = movie_user_matrix.to_numpy()
movie_user_matrix_sparse = csr_matrix(dense_ratings)
movie_user_matrix_sparse

<1000x280514 sparse matrix of type '<class 'numpy.float64'>'
	with 17051401 stored elements in Compressed Sparse Row format>

In [18]:
# Configure a brute-force cosine-distance nearest-neighbour search over the
# movie rows, using every available core, then fit it on the sparse matrix.
knn_params = dict(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
model_knn = NearestNeighbors(**knn_params)
model_knn.fit(movie_user_matrix_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [26]:
# Example query for trying out the recommender: a (partial) movie title and
# the number of recommendations to return
movie_name, num_recommendations = "Grease", 10

In [27]:
# Find the matrix row index of every title that contains the requested name
# (case-insensitive substring match).
idx = [hashmap[key] for key in hashmap if movie_name.lower() in key.lower()]

if idx:
    # A partial name can match several titles (e.g. a film and its sequel).
    # Query with the first match only: passing every matching row would make
    # kneighbors return multi-row results that the pairing/sorting below
    # cannot handle.
    query_idx = idx[0]

    # Ask for one extra neighbour because the query movie is returned as its
    # own nearest neighbour; never ask for more neighbours than fitted movies.
    n_neighbors = min(
        int(num_recommendations) + 1,
        movie_user_matrix_sparse.shape[0]
    )
    distances, indices = model_knn.kneighbors(
        movie_user_matrix_sparse[[query_idx]],
        n_neighbors=n_neighbors
    )

    # ravel() always yields 1-D arrays, even when a single neighbour is
    # returned (squeeze() would collapse that case to a scalar).
    neighbours = zip(indices.ravel().tolist(), distances.ravel().tolist())

    # Drop the query movie explicitly instead of assuming it sorts first:
    # on a distance tie another movie could be ahead of it.
    raw_recommends = sorted(
        (pair for pair in neighbours if pair[0] != query_idx),
        key=lambda x: x[1]
    )[:int(num_recommendations)]

    # Presentation order is least similar first, closest match last.
    raw_recommends.reverse()

    # Get movie titles
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    recommendations = [reverse_hashmap[i[0]] for i in raw_recommends]
else:
    recommendations = ['Sorry that title does not exists']

print(recommendations)

['Big (1988)', 'E.T. the Extra-Terrestrial (1982)', 'Top Gun (1986)', 'When Harry Met Sally... (1989)', 'Honey, I Shrunk the Kids (1989)', 'Wizard of Oz, The (1939)', 'Little Mermaid, The (1989)', 'Sound of Music, The (1965)', 'Mary Poppins (1964)', 'Dirty Dancing (1987)']
