Step 1: Import Libraries and Load Dataset

In [None]:
# Importing the libraries
import os
from pathlib import Path

import pandas as pd

# Loading the dataset.
# The folder holding the CSV files defaults to the original project location,
# but can be redirected by setting the MOVIE_DATA_DIR environment variable so
# the notebook is not tied to a single machine.
DATA_DIR = Path(os.environ.get("MOVIE_DATA_DIR", r"C:\Users\Expert\PycharmProjects\pythonProject"))
movies = pd.read_csv(DATA_DIR / "movies.csv")
ratings = pd.read_csv(DATA_DIR / "ratings.csv")

In [None]:
# Preview the first five rows of the movies table
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Step 2: Exploratory Data Analysis

In [None]:
# Understanding the distribution of features

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".

# Display basic information about the movies dataset
print("Movies Dataset Info:")
movies.info()

# Display basic information about the ratings dataset
print("\nRatings Dataset Info:")
ratings.info()

Movies Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB
None

Ratings Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB
None


In [None]:
# Count the distinct users (ratings table) and distinct movies (movies table)
unique_users = ratings.loc[:, 'userId'].nunique()
unique_movies = movies.loc[:, 'movieId'].nunique()

# Report both counts with a single print call, one per line
print(
    "\nNumber of Unique Users: {}".format(unique_users),
    "Number of Unique Movies: {}".format(unique_movies),
    sep="\n",
)


Number of Unique Users: 668
Number of Unique Movies: 10329


In [None]:
# Average rating per movie and number of movies per genre

# Attach the movie metadata to every rating row (join on movieId)
merged_data = ratings.merge(movies, on='movieId')

# Mean rating for each title.
# NOTE(review): grouping on 'title' pools the ratings of any distinct movieIds
# that share the same title - confirm this is intended.
average_rating_per_movie = merged_data.groupby('title')['rating'].mean()
print("\nAverage Rating per Movie:")
print(average_rating_per_movie)

# Number of movies carrying each genre tag: split the pipe-separated genre
# string into columns, stack them into one long series, then count the values
total_movies_per_genre = (
    movies['genres']
    .str.split('|', expand=True)
    .stack()
    .value_counts()
)
print("\nTotal Movies per Genre:")
print(total_movies_per_genre)



Average Rating per Movie:
title
'71 (2014)                                       3.500000
'Hellboy': The Seeds of Creation (2004)          3.000000
'Round Midnight (1986)                           2.500000
'Til There Was You (1997)                        4.000000
'burbs, The (1989)                               3.125000
                                                   ...   
loudQUIETloud: A Film About the Pixies (2006)    4.500000
xXx (2002)                                       2.958333
xXx: State of the Union (2005)                   2.071429
¡Three Amigos! (1986)                            3.012500
À nous la liberté (Freedom for Us) (1931)        3.000000
Name: rating, Length: 10323, dtype: float64

Total Movies per Genre:
Drama                 5220
Comedy                3515
Thriller              2187
Romance               1788
Action                1737
Crime                 1440
Adventure             1164
Horror                1001
Sci-Fi                 860
Mystery          

In [None]:
# Unique genres considered

# Glue every movie's pipe-separated genre string into one long string,
# split it back on '|' and unpack the pieces into a set to drop duplicates
unique_genres = {*'|'.join(movies['genres'].tolist()).split('|')}

print("\nUnique Genres Considered:")
print(unique_genres)



Unique Genres Considered:
{'Drama', 'Sci-Fi', 'Action', 'Children', 'Horror', 'Thriller', 'Romance', 'Musical', 'Mystery', 'IMAX', 'Animation', 'Western', '(no genres listed)', 'Documentary', 'Crime', 'Adventure', 'Fantasy', 'Comedy', 'Film-Noir', 'War'}


Step 3: Design the 3 Different Types of Recommendation Modules

In [None]:
# Popularity-Based Recommender Module

In [None]:
def popularity_recommender(genre_input, ratings_threshold, num_recommendations):
    """Recommend the best-rated movies of a genre that have enough ratings.

    Reads the notebook-level ``movies`` and ``ratings`` DataFrames.

    Parameters
    ----------
    genre_input : str
        Genre text to look for; matched as a literal, case-insensitive
        substring of the ``genres`` column.
    ratings_threshold : int
        Minimum number of ratings a movie must have to be considered.
    num_recommendations : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``mean`` (average rating) and ``count`` (number of
        ratings), sorted by average rating, highest first. Movies with the
        same average are ordered by number of ratings, highest first.
    """
    # Filter movies by genre.
    # regex=False: the input is a genre name, not a pattern, so characters such
    # as '(' or '|' must be taken literally.
    # na=False: a missing genre counts as "no match" instead of producing a
    # NaN entry that cannot be used as a boolean mask.
    genre_mask = movies['genres'].str.contains(genre_input, case=False, regex=False, na=False)
    genre_movies = movies[genre_mask]

    # Per-movie rating statistics, keeping only movies with enough ratings
    rating_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean']).reset_index()
    rating_stats = rating_stats[rating_stats['count'] >= ratings_threshold]

    # Keep only movies that pass both the genre and the ratings-count filter
    genre_high_rated_movies = genre_movies.merge(rating_stats, on='movieId')

    # Sort by popularity (average rating); ties are broken by the number of
    # ratings so the ordering is deterministic
    sorted_movies = genre_high_rated_movies.sort_values(by=['mean', 'count'], ascending=False)

    # Recommend top N movies
    top_recommendations = sorted_movies.head(num_recommendations)

    return top_recommendations[['title', 'mean', 'count']]


In [None]:
# Top 5 "Action" movies among those with at least 10 ratings
popularity_recommender(genre_input="Action", ratings_threshold=10, num_recommendations=5)

Unnamed: 0,title,mean,count
186,All Quiet on the Western Front (1930),4.5,13
278,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
291,Hard-Boiled (Lat sau san taam) (1992),4.307692,13
97,North by Northwest (1959),4.273973,73
119,Henry V (1989),4.272727,22


In [None]:
# Content-Based Recommender Module

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Function to create a content-based recommender system
def content_based_recommender(movie_title, num_recommendations):
    """Print the movies whose genre profile is most similar to a given movie.

    Reads the notebook-level ``movies`` DataFrame. Genres are vectorised with
    TF-IDF and compared with a linear kernel; TfidfVectorizer L2-normalises
    its rows by default, so the kernel value is the cosine similarity.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    num_recommendations : int
        Number of similar movies to print.

    Returns
    -------
    None
        The result is printed as a tab-separated table.

    Raises
    ------
    IndexError
        If no movie has the given title.
    """
    # Row *positions* (not index labels) of the matching titles: the TF-IDF
    # matrix rows and ``.iloc`` below are positional, so this stays correct
    # even when ``movies`` does not have a default 0..n-1 index.
    title_matches = (movies['title'] == movie_title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_position = int(title_matches[0])

    # Create a TF-IDF Vectorizer for movie genres
    # NOTE(review): the default tokenizer splits on punctuation, so hyphenated
    # genres such as "Sci-Fi" likely become two tokens - confirm whether
    # splitting on '|' only would be preferable.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'].fillna(''))

    # Similarity of the query movie against every movie. Only this one row is
    # needed, so the full movies x movies matrix is never materialised.
    # A one-row slice keeps the input two-dimensional.
    query_row = tfidf_matrix[movie_position:movie_position + 1]
    similarity_row = linear_kernel(query_row, tfidf_matrix).ravel()

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded explicitly: movies with identical genres tie with it, so it is
    # not guaranteed to be the first element after sorting.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(similarity_row)
            if position != movie_position
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Get the top 'num_recommendations' similar movies
    top_movies_positions = [position for position, _ in ranked[:num_recommendations]]

    # Display the final result
    print("Sr.No\tMovie_Title")
    for i, position in enumerate(top_movies_positions, start=1):
        print(f"{i}\t{movies['title'].iloc[position]}")


In [None]:
# Five movies with the genre profile closest to "Inception (2010)"
content_based_recommender(movie_title="Inception (2010)", num_recommendations=5)

Sr.No	Movie_Title
1	Watchmen (2009)
2	Super 8 (2011)
3	RoboCop (2014)
4	V for Vendetta (2006)
5	Transformers (2007)


In [None]:
# Collaborative-Based Recommender Module

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def collaborative_recommender(target_user_id, num_similar_users, num_recommendations):
    """Recommend unrated movies to a user from the ratings of similar users.

    Reads the notebook-level ``ratings`` DataFrame. Users are compared by the
    cosine similarity of their rating vectors, with unrated movies filled
    with 0.

    Parameters
    ----------
    target_user_id : int
        ``userId`` of the user to recommend for.
    num_similar_users : int
        Number of most similar users whose ratings are averaged.
    num_recommendations : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.Series
        Indexed by ``movieId``; values are the mean rating given by the
        similar users (an unrated movie contributes 0 to the mean), sorted
        highest first and restricted to movies the target user has not rated.

    Raises
    ------
    KeyError
        If ``target_user_id`` does not appear in ``ratings``.
    """
    # Create a user-item matrix (rows: users, columns: movies, 0 = unrated)
    user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    # Row position of the target user. Looked up through the index rather
    # than assumed to be ``target_user_id - 1``, which only holds when user
    # ids are contiguous and start at 1.
    target_position = user_item_matrix.index.get_loc(target_user_id)

    # Cosine similarity of the target user against every user; only this one
    # row of the user x user matrix is needed.
    similarity_to_target = cosine_similarity(
        user_item_matrix.iloc[[target_position]], user_item_matrix
    )[0]

    # Identify K similar users for the target user. The target is removed
    # explicitly instead of assuming it is the first entry after sorting.
    ranked_positions = similarity_to_target.argsort()[::-1]
    similar_users_indices = ranked_positions[ranked_positions != target_position][:num_similar_users]

    # Predict ratings for the target user as the mean rating of similar users
    target_user_ratings = user_item_matrix.iloc[target_position]
    predicted_ratings = user_item_matrix.iloc[similar_users_indices].mean(axis=0)

    # Filter unrated movies
    unrated_movies = target_user_ratings[target_user_ratings == 0].index

    # Sort and recommend top N movies
    recommendations = predicted_ratings[unrated_movies].sort_values(ascending=False).head(num_recommendations)

    return recommendations


In [None]:
# Five recommendations for user 1, based on that user's 10 most similar users
collaborative_recommender(target_user_id=1, num_similar_users=10, num_recommendations=5)

movieId
4226    3.30
1682    3.05
3793    3.00
231     2.90
2329    2.85
dtype: float64

Additional/Optional: Create a GUI using Python libraries (e.g. ipywidgets)
to experiment interactively with the recommendation modules