<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M1_Class_UML_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from heapq import nlargest

# Load dataset
movies = pd.read_csv('https://raw.githubusercontent.com/aaubs/ds-master/main/data/movies_UML/movies.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/aaubs/ds-master/main/data/movies_UML/ratings.csv')


In [2]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Create a user-movie matrix
user_movie_matrix = ratings.pivot(index='movieId', columns='userId', values='rating')

In [7]:
# Fill NaN values with 0
user_movie_matrix = user_movie_matrix.fillna(0)

In [8]:
user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Compute the cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(user_movie_matrix)


In [10]:
# Create a DataFrame for the similarity values
similarity_df = pd.DataFrame(cosine_similarity_matrix, index=user_movie_matrix.index, columns=user_movie_matrix.index)


In [11]:
similarity_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.376316,0.297009,0.284257,0.089685,0.298969,1.0,0.244105,0.147562,0.214088,0.386414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.277491,0.228576,0.402831,0.275035,0.474002,0.244105,1.0,0.273757,0.162,0.238949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.131629,0.172498,0.313434,0.158022,0.283523,0.147562,0.273757,1.0,0.0,0.189867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.232586,0.044835,0.30484,0.0,0.335058,0.214088,0.162,0.0,1.0,0.048611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.395573,0.417693,0.242954,0.095598,0.218061,0.386414,0.238949,0.189867,0.048611,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072542


In [16]:
def recommend_movies(movie_id, num_recommendations=5):
    # Get the similarity values for the movie
    movie_similarities = similarity_df[movie_id]

    # Get the top N+1 most similar movies (N+1 because one of them is the movie itself)
    recommended_movies = nlargest(num_recommendations+1, movie_similarities.index, key=lambda x: movie_similarities[x])

    # Remove the original movie ID from the recommendations
    recommended_movies.remove(movie_id)

    # Fetch the title and genre and concatenate them
    recommendations = movies[movies['movieId'].isin(recommended_movies)]
    recommendations['title_genre'] = recommendations['title'] + " (" + recommendations['genres'] + ")"

    return recommendations




In [20]:
# Test
movie_id_test = 1  # This is the ID for "Toy Story"
print(recommend_movies(movie_id_test))

      movieId                                      title  \
224       260  Star Wars: Episode IV - A New Hope (1977)   
314       356                        Forrest Gump (1994)   
418       480                       Jurassic Park (1993)   
615       780       Independence Day (a.k.a. ID4) (1996)   
2355     3114                         Toy Story 2 (1999)   

                                           genres  \
224                       Action|Adventure|Sci-Fi   
314                      Comedy|Drama|Romance|War   
418              Action|Adventure|Sci-Fi|Thriller   
615              Action|Adventure|Sci-Fi|Thriller   
2355  Adventure|Animation|Children|Comedy|Fantasy   

                                            title_genre  
224   Star Wars: Episode IV - A New Hope (1977) (Act...  
314      Forrest Gump (1994) (Comedy|Drama|Romance|War)  
418   Jurassic Park (1993) (Action|Adventure|Sci-Fi|...  
615   Independence Day (a.k.a. ID4) (1996) (Action|A...  
2355  Toy Story 2 (1999) (Adve

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['title_genre'] = recommendations['title'] + " (" + recommendations['genres'] + ")"


In [19]:
def gradio_interface(movie_name):
    movie_id = movies[movies['title'] == movie_name]['movieId'].values[0]
    recommendations = recommend_movies(movie_id)
    # Concatenate the recommendations into a single string
    recommendations_text = "Recommendations based on " + movie_name + ":\n\n"
    recommendations_text += "\n".join(recommendations['title_genre'].values.tolist())
    return recommendations_text

# Creating Gradio UI with dropdown for movie selection
iface = gr.Interface(fn=gradio_interface,
                     inputs=gr.inputs.Dropdown(choices=movies['title'].tolist(), label="Select or Type a Movie Name"),
                     outputs="text")
iface.launch()



  inputs=gr.inputs.Dropdown(choices=movies['title'].tolist(), label="Select or Type a Movie Name"),
  inputs=gr.inputs.Dropdown(choices=movies['title'].tolist(), label="Select or Type a Movie Name"),


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

