In [15]:
import numpy as np
import pandas as pd

# finding closest match
import difflib 

# textual data into numerical values
from sklearn.feature_extraction.text import TfidfVectorizer

# finding similarity scores
from sklearn.metrics.pairwise import cosine_similarity

from ipywidgets import *

# loading the movie data from csv file
movies_data=pd.read_csv("C:\\Users\\alok\\Desktop\\Inzint\\datasets\\movies.csv")
ratings=pd.read_csv("C:\\Users\\alok\\Desktop\\Inzint\\datasets\\ratings.csv")
print(movies_data.columns)
print(ratings.columns)



Index(['movieId', 'title', 'genres'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [16]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [18]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [19]:
movies_data["genres"] = movies_data["genres"].str.split("|")

In [20]:
movies_explode = movies_data.explode("genres")

In [21]:
movies_explode = movies_explode[movies_explode["genres"]!='(no genres listed)']

In [22]:
movies_explode.nunique()

movieId    10322
title      10320
genres        19
dtype: int64

In [23]:
movies_merged = pd.merge(ratings, movies_explode, on='movieId')

In [24]:
# Group by movie title and genres and calculate average rating and rating count
popularity_df = movies_merged.groupby(['title', 'genres']).agg({'rating': ['mean', 'count']}).reset_index()

# Rename the columns for clarity
popularity_df.columns = ['title', 'genres', 'avg_rating', 'rating_count']

In [25]:
popularity_df


Unnamed: 0,title,genres,avg_rating,rating_count
0,'71 (2014),Action,3.500000,1
1,'71 (2014),Drama,3.500000,1
2,'71 (2014),Thriller,3.500000,1
3,'71 (2014),War,3.500000,1
4,'Hellboy': The Seeds of Creation (2004),Action,3.000000,1
...,...,...,...,...
23093,xXx: State of the Union (2005),Thriller,2.071429,7
23094,¡Three Amigos! (1986),Comedy,3.012500,40
23095,¡Three Amigos! (1986),Western,3.012500,40
23096,À nous la liberté (Freedom for Us) (1931),Comedy,3.000000,1


In [27]:
def top_N_popular_movies_by_genre(df, genre, N):
    # Filter DataFrame by genre
    genre_df = df[df['genres'].str.contains(genre)]
    
    # Group by movie title and calculate rating count
    rating_count = genre_df.groupby('title')['rating'].count().reset_index()
    
    # Sort movies based on rating count in descending order
    top_movies = rating_count.sort_values(by='rating', ascending=False).head(N)
    
    return top_movies




In [28]:
genre = 'Comedy'  # Specify the genre
N = 10  # Number of top movies to return
top_movies_by_genre = top_N_popular_movies_by_genre(movies_merged, genre, N)
print(top_movies_by_genre)

                                  title  rating
2523                Pulp Fiction (1994)     325
1069                Forrest Gump (1994)     311
3207                   Toy Story (1995)     232
250           Back to the Future (1985)     213
995                        Fargo (1996)     201
102                      Aladdin (1992)     191
3237                   True Lies (1994)     184
57    Ace Ventura: Pet Detective (1994)     173
2023   Men in Black (a.k.a. MIB) (1997)     172
2498         Princess Bride, The (1987)     171


In [29]:
def TopPopularMovies(genre, num_threshold, topN):
    # Create popularity DataFrame
    popularity_df = movies_merged.groupby(['genres', 'title']).agg({'rating': ['mean', 'size']}).reset_index()
    popularity_df.columns = ['genres', 'title', 'avg_ratings', 'no_of_ratings']

    # Filter and sort data
    filtered_df = popularity_df[(popularity_df["genres"] == genre) & (popularity_df["no_of_ratings"] > num_threshold)]
    sorted_df = filtered_df.sort_values(by="avg_ratings", ascending=False).head(topN)

    # Select specific columns
    selected_df = sorted_df[["title", "avg_ratings", "no_of_ratings"]]

    return selected_df


In [30]:
TopPopularMovies("Comedy",50,10)

Unnamed: 0,title,avg_ratings,no_of_ratings
5933,Monty Python and the Holy Grail (1975),4.301948,154
4835,Fargo (1996),4.271144,201
5647,Life Is Beautiful (La Vita è bella) (1997),4.253425,73
6792,"Sting, The (1973)",4.207792,77
4015,Annie Hall (1977),4.205882,68
7182,Wallace & Gromit: The Wrong Trousers (1993),4.168831,77
6338,"Princess Bride, The (1987)",4.163743,171
7180,Wallace & Gromit: A Close Shave (1995),4.163636,55
6363,Pulp Fiction (1994),4.16,325
4909,Forrest Gump (1994),4.138264,311


In [31]:
def bayesian_average(avg_rating, num_ratings, global_avg_rating, min_ratings):
    # Calculate the Bayesian average
    bayesian_avg = (min_ratings * global_avg_rating + num_ratings * avg_rating) / (min_ratings + num_ratings)
    return bayesian_avg

def TopPopularMovies(genre, num_threshold, topN, global_avg_rating, min_ratings):
    # Create popularity DataFrame
    popularity_df = movies_merged.groupby(['genres', 'title']).agg({'rating': ['mean', 'size']}).reset_index()
    popularity_df.columns = ['genres', 'title', 'avg_ratings', 'no_of_ratings']

    # Filter data
    filtered_df = popularity_df[(popularity_df["genres"] == genre) & (popularity_df["no_of_ratings"] > num_threshold)]

    # Calculate Bayesian average
    filtered_df['bayesian_avg'] = filtered_df.apply(lambda row: bayesian_average(row['avg_ratings'], row['no_of_ratings'], global_avg_rating, min_ratings), axis=1)

    # Sort data by Bayesian average
    sorted_df = filtered_df.sort_values(by="bayesian_avg", ascending=False).head(topN)

    # Select specific columns
    selected_df = sorted_df[["title", "bayesian_avg", "no_of_ratings"]]

    return selected_df


In [None]:
TopPopularMovies("Comedy", 50, 10, 3.516850, 0.5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['bayesian_avg'] = filtered_df.apply(lambda row: bayesian_average(row['avg_ratings'], row['no_of_ratings'], global_avg_rating, min_ratings), axis=1)


Unnamed: 0,title,bayesian_avg,no_of_ratings
5933,Monty Python and the Holy Grail (1975),4.299407,154
4835,Fargo (1996),4.269273,201
5647,Life Is Beautiful (La Vita è bella) (1997),4.248414,73
6792,"Sting, The (1973)",4.203335,77
4015,Annie Hall (1977),4.200853,68
7182,Wallace & Gromit: The Wrong Trousers (1993),4.164625,77
6338,"Princess Bride, The (1987)",4.161857,171
6363,Pulp Fiction (1994),4.159012,325
7180,Wallace & Gromit: A Close Shave (1995),4.157809,55
4909,Forrest Gump (1994),4.137266,311


# Content Based

# Take a Break

In [None]:
movies_data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),"[Animation, Children, Comedy]"
10325,146878,Le Grand Restaurant (1966),[Comedy]
10326,148238,A Very Murray Christmas (2015),[Comedy]
10327,148626,The Big Short (2015),[Drama]


In [32]:

# combining all the selected features into a single string for each movie
combined_features = movies_merged.groupby("title").agg({"genres": lambda x: " ".join(list(x))}).reset_index()
combined_features.head()




Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Til There Was You (1997),Drama Romance Drama Romance Drama Romance
4,"'burbs, The (1989)",Comedy Comedy Comedy Comedy Comedy Comedy Come...


In [33]:
vectorizer = TfidfVectorizer()
feature_vector = vectorizer.fit_transform(combined_features["genres"])

similarity= cosine_similarity(feature_vector, feature_vector)

In [34]:
indices = pd.Series(movies_data['title'].index,index = movies_data['title'])

In [35]:
indices

title
Toy Story (1995)                           0
Jumanji (1995)                             1
Grumpier Old Men (1995)                    2
Waiting to Exhale (1995)                   3
Father of the Bride Part II (1995)         4
                                       ...  
Cosmic Scrat-tastrophe (2015)          10324
Le Grand Restaurant (1966)             10325
A Very Murray Christmas (2015)         10326
The Big Short (2015)                   10327
Marco Polo: One Hundred Eyes (2015)    10328
Length: 10329, dtype: int64

In [36]:
index = indices["Toy Story (1995)"]

movie_similarity_scores = similarity[index]

movie_similarity_scores = list(enumerate(movie_similarity_scores))

sorted_similarity_scores = sorted(movie_similarity_scores, key=lambda x: x[1], reverse=True)

print(sorted_similarity_scores)


[(0, 1.0000000000000002), (596, 1.0000000000000002), (858, 1.0000000000000002), (3837, 1.0000000000000002), (4405, 1.0000000000000002), (7421, 1.0000000000000002), (9242, 1.0000000000000002), (5470, 1.0), (210, 0.9564936958601954), (945, 0.9564936958601954), (6652, 0.9564936958601954), (6954, 0.9564936958601954), (9384, 0.9564936958601954), (9549, 0.9564936958601954), (357, 0.9407843734274323), (123, 0.8967502662971476), (155, 0.8967502662971476), (288, 0.8967502662971476), (315, 0.8967502662971476), (353, 0.8967502662971476), (793, 0.8967502662971476), (863, 0.8967502662971476), (866, 0.8967502662971476), (944, 0.8967502662971476), (1130, 0.8967502662971476), (1296, 0.8967502662971476), (1510, 0.8967502662971476), (2907, 0.8967502662971476), (3254, 0.8967502662971476), (3687, 0.8967502662971476), (3830, 0.8967502662971476), (3925, 0.8967502662971476), (4066, 0.8967502662971476), (4231, 0.8967502662971476), (4615, 0.8967502662971476), (4694, 0.8967502662971476), (4800, 0.89675026629714

In [37]:
sorted_similarity_scores[:10]

[(0, 1.0000000000000002),
 (596, 1.0000000000000002),
 (858, 1.0000000000000002),
 (3837, 1.0000000000000002),
 (4405, 1.0000000000000002),
 (7421, 1.0000000000000002),
 (9242, 1.0000000000000002),
 (5470, 1.0),
 (210, 0.9564936958601954),
 (945, 0.9564936958601954)]

In [38]:
def recommend_movies(movie_title, topN=5):
   
    index = indices[movie_title]  
  
    movie_similarity_scores = similarity[index]    
    movie_similarity_scores = list(enumerate(movie_similarity_scores))  
    sorted_similarity_scores = sorted(movie_similarity_scores, key=lambda x: x[1], reverse=True)
 
    top_indices = [i[0] for i in sorted_similarity_scores[1:topN+1]]  
    top_movies = combined_features.iloc[top_indices]['title'].tolist()
    
    return top_movies

In [39]:
input_movie_title = "Toy Story (1995)"
recommended_movies = recommend_movies(input_movie_title, topN=5)
print(f"Recommended movies similar to '{input_movie_title}':")
for movie in recommended_movies:
    print(movie)

Recommended movies similar to 'Toy Story (1995)':
Army of Shadows (L'armée des ombres) (1969)
Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)
Green Zone (2010)
Hurt Locker, The (2008)
Rambo (Rambo 4) (2008)


In [40]:
input_ = input("Enter Movie Name: ")
closest_match = difflib.get_close_matches(input_, movies_data['title'].tolist(), n=1)
if closest_match:
    print(closest_match[0])
else:
    print("No close match found.")

No close match found.


In [41]:
recommended_movies = recommend_movies(closest_match[0], topN=5)
print(f"Recommended movies similar to '{input_}':")
for movie in recommended_movies:
    print(movie)

IndexError: list index out of range

In [42]:
import ipywidgets as widgets
from IPython.display import display

def recommend_movies(movie_title, topN=5):
   
    index = indices[movie_title]  
  
    movie_similarity_scores = similarity[index]    
    movie_similarity_scores = list(enumerate(movie_similarity_scores))  
    sorted_similarity_scores = sorted(movie_similarity_scores, key=lambda x: x[1], reverse=True)
 
    top_indices = [i[0] for i in sorted_similarity_scores[1:topN+1]]  
    top_movies = combined_features.iloc[top_indices]['title'].tolist()
    
    return top_movies




In [43]:
def on_button_clicked(b):
    movie_title = text_input.value
    recommended_movies = recommend_movies(movie_title)
    output.clear_output()
    with output:
        print(f"Recommended movies similar to '{movie_title}':")
        for movie in recommended_movies:
            print(movie)

text_input = widgets.Text(placeholder='Enter Movie Name')
button = widgets.Button(description="Get Recommendations")
button.on_click(on_button_clicked)

output = widgets.Output()
display(text_input, button, output)

Text(value='', placeholder='Enter Movie Name')

Button(description='Get Recommendations', style=ButtonStyle())

Output()