In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply matplotlib's "ggplot" style sheet to the figures drawn in later cells.
plt.style.use("ggplot")

In [None]:
# Load the movies table; later cells read its "movieId", "title" and "genres" columns.
movies_data = pd.read_csv("movies.csv")
# Bare last expression: the notebook renders the frame as the cell output.
movies_data

In [None]:
# Load the ratings table; later cells read its "userId", "movieId" and "rating" columns.
ratings_data = pd.read_csv("ratings.csv")
# Bare last expression: the notebook renders the frame as the cell output.
ratings_data

In [None]:
# Load the links table; later cells join it to the other tables on "movieId".
# NOTE(review): its other columns are not used anywhere in this notebook — confirm what they hold.
links_data = pd.read_csv("links.csv")
# Bare last expression: the notebook renders the frame as the cell output.
links_data

In [None]:
# Load the tags table; later cells read its "movieId" and "tag" columns.
tags_data = pd.read_csv("tags.csv")
# Bare last expression: the notebook renders the frame as the cell output.
tags_data

In [None]:
# Inner-join all four tables on "movieId", innermost call first:
# movies + ratings, then + tags, then + links.
# NOTE(review): ratings and tags are each joined on "movieId" alone, so a movie
# with several ratings and several tags produces one row per (rating, tag)
# pair, and movies missing from any table are dropped — confirm this is intended.
combined_data = pd.merge(
    pd.merge(
        pd.merge(movies_data, ratings_data, how="inner", on="movieId"),
        tags_data,
        how="inner",
        on="movieId",
    ),
    links_data,
    how="inner",
    on="movieId",
)
print("The Shape of data is {}".format(combined_data.shape))
# Preview only the first rows of the joined frame.
combined_data.head()

In [None]:
### What is the shape of "movies.csv"?

In [None]:
# DataFrame.shape is a (rows, columns) tuple.
print(f"The shape of the movies.csv is {movies_data.shape}")


In [None]:
### What is the shape of "ratings.csv"?

In [None]:
# DataFrame.shape is a (rows, columns) tuple.
print(f"The shape of the ratings.csv is {ratings_data.shape}")


In [None]:
### How many unique "userId" values are there in "ratings.csv"?

In [None]:
# Number of distinct user ids in the ratings table. dropna=False counts a
# missing id (if one exists) as its own value, matching len(Series.unique()).
ratings_data["userId"].nunique(dropna=False)

In [None]:
### Which movie has received the maximum number of user ratings?

In [None]:
# Tally how many ratings each movieId received.
user_ratings_count = ratings_data["movieId"].value_counts()
max_user_ratings_count = user_ratings_count.max()
max_user_ratings_movieId = user_ratings_count.idxmax()

# Look up the title of that movie in the movies table.
is_most_rated = movies_data["movieId"] == max_user_ratings_movieId
max_user_ratings_title = movies_data.loc[is_most_rated, "title"].values[0]

print("Movie with the maximum number of user ratings:")
for label, value in (
    ("MovieId:", max_user_ratings_movieId),
    ("Title:", max_user_ratings_title),
    ("Number of Ratings:", max_user_ratings_count),
):
    print(label, value)

In [None]:
### Select all the correct tags submitted by users for the movie "Matrix, The (1999)"

In [None]:
# Find the movieId whose title contains "Matrix, The (1999)". regex=False makes
# this a literal substring match, so the parentheses need no escaping (a "\("
# inside a normal, non-raw string is an invalid escape sequence and triggers a
# warning on current Python versions).
is_matrix = movies_data["title"].str.contains("Matrix, The (1999)", regex=False)
matrix_movieId = movies_data.loc[is_matrix, "movieId"].iloc[0]

# Reuse the tags_data frame loaded earlier in the notebook instead of reading
# tags.csv a second time and rebinding the name mid-notebook.
matrix_tags = tags_data.loc[tags_data["movieId"] == matrix_movieId, "tag"].unique()

print("Tags for The Matrix (1999):")
print(matrix_tags)

In [None]:
### What is the average user rating for the movie named "Terminator 2: Judgment Day (1991)"?

In [None]:
# Resolve the movieId from the exact title.
is_terminator2 = movies_data["title"] == "Terminator 2: Judgment Day (1991)"
terminator2_movieId = movies_data.loc[is_terminator2, "movieId"].iloc[0]

# Average every rating recorded for that movieId.
terminator2_ratings = ratings_data.loc[
    ratings_data["movieId"] == terminator2_movieId, "rating"
]
terminator2_avg_rating = terminator2_ratings.mean()

print("Average user rating for Terminator 2: Judgment Day (1991):")
print(terminator2_avg_rating)

In [None]:
### What does the distribution of user ratings for the movie "Fight Club (1999)" look like?

In [None]:
# Resolve the movieId from the exact title, then collect that movie's ratings.
is_fight_club = movies_data["title"] == "Fight Club (1999)"
fight_club_movieId = movies_data.loc[is_fight_club, "movieId"].iloc[0]
fight_club_ratings = ratings_data.loc[
    ratings_data["movieId"] == fight_club_movieId, "rating"
]

# Kernel density estimate of the ratings, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(fight_club_ratings, ax=ax)
ax.set_title("Distribution of User Ratings for Fight Club (1999)")
ax.set_xlabel("Rating")
ax.set_ylabel("Density")
plt.show()

In [None]:
### Which movie is the most popular based on average user ratings?

In [None]:
# Count ratings per movie and keep only the movies with at least 100 ratings
# (presumably so that a movie rated by only a few users cannot top the ranking).
ratings_count = ratings_data.groupby("movieId").size()
movies_with_100plus_ratings = ratings_count[ratings_count >= 100].index

# Mean rating of each qualifying movie: a Series indexed by movieId whose
# values come from the "rating" column. Later cells reuse this Series.
average_ratings_100plus = (
    ratings_data[ratings_data["movieId"].isin(movies_with_100plus_ratings)]
    .groupby("movieId")["rating"]
    .mean()
)

# movieId with the highest mean rating, and its title from the movies table.
most_popular_by_avg_rating_100plus = average_ratings_100plus.idxmax()
most_popular_movie_title_100plus = movies_data.loc[
    movies_data["movieId"] == most_popular_by_avg_rating_100plus, "title"
].iloc[0]

# Show the answer as the cell output.
most_popular_movie_title_100plus

In [None]:
### Select all the correct options which come under the top 5 popular movies based on the number of user ratings.

In [None]:
# value_counts() sorts by count, highest first, so head(5) yields the five
# most-rated movieIds in rank order.
top_movies_user_ratings = ratings_data["movieId"].value_counts().head(5).index.tolist()

# Join starting from the ranked ids so the titles come out most-rated first
# (an inner merge preserves the order of the left keys), rather than in the
# row order of the movies table.
top_movies_titles = (
    pd.DataFrame({"movieId": top_movies_user_ratings})
    .merge(movies_data[["movieId", "title"]], on="movieId", how="inner")["title"]
    .tolist()
)
top_movies_titles

In [None]:
### Which Sci-Fi movie is "third most popular" based on the number of user ratings?

In [None]:
# Movies whose genres string contains "Sci-Fi".
sci_fi_movies_data = movies_data[movies_data["genres"].str.contains("Sci-Fi")]

# Number of ratings per Sci-Fi movieId; value_counts() sorts highest first.
sci_fi_movie_ratings = ratings_data.loc[
    ratings_data["movieId"].isin(sci_fi_movies_data["movieId"]), "movieId"
].value_counts()

# Position 2 of the descending tally is the third most-rated Sci-Fi movie.
third_most_popular_sci_fi_movie_id = sci_fi_movie_ratings.index[2]
third_most_popular_sci_fi_movie_title = movies_data.loc[
    movies_data["movieId"] == third_most_popular_sci_fi_movie_id, "title"
].iloc[0]

# Show the answer as the cell output.
third_most_popular_sci_fi_movie_title


In [None]:
### Mention the movieId of the movie which has the highest IMDB rating.

In [None]:
# Turn the per-movie mean ratings into a frame and attach the links table.
# NOTE(review): the "rating" column here is the mean user rating computed from
# ratings_data (movies with 100+ ratings), not an IMDB score — confirm whether
# links.csv supplies an actual IMDB rating before relying on this as one.
merged_ratings_imdb_data = pd.merge(
    average_ratings_100plus.reset_index(),
    links_data,
    on="movieId",
)

In [None]:
# Largest value in the "rating" column, then the movieId of the first row that reaches it.
highest_imdb_rating = merged_ratings_imdb_data["rating"].max()
is_top_rated = merged_ratings_imdb_data["rating"] == highest_imdb_rating
highest_imdb_movieId = merged_ratings_imdb_data.loc[is_top_rated, "movieId"].iloc[0]

In [None]:
# Show the movieId selected in the previous cell.
print(highest_imdb_movieId)

In [None]:
# Look up the movie found above by its computed id instead of a hard-coded
# value, so the cell stays correct if the data or the threshold changes.
movies_data[movies_data["movieId"] == highest_imdb_movieId]

In [None]:
### Mention the movieId of the "Sci-Fi" movie which has the highest IMDB rating.

In [None]:
# Movies whose genres string contains "Sci-Fi".
is_sci_fi = movies_data["genres"].str.contains("Sci-Fi")
sci_fi_movies_data = movies_data.loc[is_sci_fi]

# Attach each Sci-Fi movie's mean rating (only movies with 100+ ratings have one).
# NOTE(review): "rating" is the mean user rating from ratings_data, not an IMDB
# score — confirm this matches what the question intends.
sci_fi_ratings_data = pd.merge(
    sci_fi_movies_data,
    average_ratings_100plus.reset_index(),
    on="movieId",
)

# Keep every row that ties for the highest mean rating.
top_sci_fi_rating = sci_fi_ratings_data["rating"].max()
highest_rated_sci_fi_movie_data = sci_fi_ratings_data.loc[
    sci_fi_ratings_data["rating"] == top_sci_fi_rating
]
highest_rated_sci_fi_movie_data.loc[:, ["movieId", "title", "rating"]]
