In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests

In [2]:
#Define the URL for the movies list API
url="http://127.0.0.1:5000/api/v1.0/movies_list"

In [3]:
# Send a GET request to the API to retrieve the movie list
response=requests.get(url)


In [4]:
# Checking if the request was successful
if response.status_code == 200:
    # Convert the JSON response to a pandas DataFrame
    movies_combined_df = pd.DataFrame(response.json())
else:
    print("Failed to retrieve data:", response.status_code)

In [5]:
# Display the first few rows of the combined movies DataFrame
movies_combined_df.head()

Unnamed: 0,cleaned_genres,movieId,poster_path,rating,timestamp,title,userId
0,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.0,956598942,Heat,102
1,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.0,942345464,Heat,363
2,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,4.5,1133735550,Heat,452
3,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.5,1340405089,Heat,505
4,Action|Crime|Drama|Thriller,949,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.0,879503053,Heat,537


In [7]:
# Extracting movies-related data into a new DataFrame
movies_df = movies_combined_df[['movieId', 'title', 'cleaned_genres']].copy()
movies_df.head()

Unnamed: 0,movieId,title,cleaned_genres
0,949,Heat,Action|Crime|Drama|Thriller
1,949,Heat,Action|Crime|Drama|Thriller
2,949,Heat,Action|Crime|Drama|Thriller
3,949,Heat,Action|Crime|Drama|Thriller
4,949,Heat,Action|Crime|Drama|Thriller


In [63]:
# Create a path to csv and read it in pandas dataframe
#path=Path("Data/movies.csv")
#movies_df=pd.read_csv(path)
#movies_df.head()

In [8]:
# Extracting ratings-related data into a new DataFrame
ratings_df = movies_combined_df[['userId', 'movieId', 'rating', 'timestamp']].copy()
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,102,949,4.0,956598942
1,363,949,4.0,942345464
2,452,949,4.5,1133735550
3,505,949,3.5,1340405089
4,537,949,3.0,879503053


In [38]:
# Create a path to csv and read it in pandas dataframe
#path=Path("Data/ratings.csv")
#ratings_df=pd.read_csv(path)
#ratings_df.head()

In [9]:
#Calculate and print basic statistics about the dataset
n_ratings = len(ratings_df)
n_movies = len(ratings_df['movieId'].unique())
n_users = len(ratings_df['userId'].unique())
 
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 43000
Number of unique movieId's: 2615
Number of unique users: 671
Average ratings per user: 64.08
Average ratings per movie: 16.44


In [10]:
#Calculate the frequencies of ratings by each user
user_freq = ratings_df[['userId', 'movieId']].groupby(
    'userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
print(user_freq.head())

   userId  n_ratings
0       1          6
1       2         58
2       3         24
3       4        109
4       5         53


In [11]:
#TEST CELL DELETE# Find and display the lowest and highest rated movies
mean_rating = ratings_df.groupby('movieId')[['rating']].mean()
lowest_rated = mean_rating['rating'].idxmin()
highest_rated = mean_rating['rating'].idxmax()
print("Lowest rated movie details:", movies_df.loc[movies_df['movieId'] == lowest_rated])
print("Highest rated movie details:", movies_df.loc[movies_df['movieId'] == highest_rated])
print("Number of people who rated the highest rated movie:", ratings_df[ratings_df['movieId'] == highest_rated].shape[0])
print("Number of people who rated the lowest rated movie:", ratings_df[ratings_df['movieId'] == lowest_rated].shape[0])

Lowest rated movie details:        movieId     title                  cleaned_genres
29121     2191  Stranded  Thriller|Drama|Science Fiction
Highest rated movie details:        movieId       title                         cleaned_genres
18101      183  The Wizard  Adventure|Comedy|Drama|Family|Romance
Number of people who rated the highest rated movie: 1
Number of people who rated the lowest rated movie: 1


In [12]:
# Find Lowest and Highest rated movies:
mean_rating = ratings_df.groupby('movieId')[['rating']].mean()
# Lowest rated movies
lowest_rated = mean_rating['rating'].idxmin()
movies_df.loc[movies_df['movieId'] == lowest_rated]
# Highest rated movies
highest_rated = mean_rating['rating'].idxmax()
movies_df.loc[movies_df['movieId'] == highest_rated]
# show number of people who rated movies rated movie highest
ratings_df[ratings_df['movieId']==highest_rated]
# show number of people who rated movies rated movie lowest
ratings_df[ratings_df['movieId']==lowest_rated]
 
## the above movies has very low dataset. We will use bayesian average
movie_stats = ratings_df.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()

In [13]:
# Prepare for a recommendation system: Create a user-item matrix
from scipy.sparse import csr_matrix
#Create a user-iteam matrix from the ratings DataFrame 
def create_matrix(df):
     
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())
     
    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))
     
    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))
     
    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]
    # Create a sparse matrix with ratings in their corresponding user/movie positions
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
     
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
# Generate the user-item matrix and mapping dictionaries     
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings_df)

In [14]:
# Define a function to find similar movies using the k-Nearest Neighbors algorithm
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
     
    neighbour_ids = []
     
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids
 
 
movie_titles = dict(zip(movies_df['movieId'], movies_df['title']))
 
movie_id = 3
 
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
 
print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])

Since you watched Shadows in Paradise
Lolita
Shaft in Africa
Mrs. Doubtfire
The Passion of Joan of Arc
Almost Famous
Hero
Dont Look Back
Donnie Darko
The Living Daylights
Monty Python and the Holy Grail


In [15]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    df1 = ratings_df[ratings_df['userId'] == user_id]
     
    if df1.empty:
        print(f"User with ID {user_id} does not exist.")
        return
 
    movie_id = df1[df1['rating'] == max(df1['rating'])]['movieId'].iloc[0]
 
    movie_titles = dict(zip(movies_df['movieId'], movies_df['title']))
 
    similar_ids = find_similar_movies(movie_id, X, k)
    movie_title = movie_titles.get(movie_id, "Movie not found")
 
    if movie_title == "Movie not found":
        print(f"Movie with ID {movie_id} not found.")
        return
 
    print(f"Since you watched {movie_title}, you might also like:")
    for i in similar_ids:
        print(movie_titles.get(i, "Movie not found"))

In [16]:
user_id = 150  # Replace with the desired user ID
recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)

Since you watched Run Lola Run, you might also like:
Ali: Fear Eats the Soul
The Passion of Joan of Arc
Fahrenheit 9/11
Arlington Road
Tuya's Marriage
Wag the Dog
The 39 Steps
Minority Report
Twin Peaks: Fire Walk with Me
Mrs. Doubtfire
