# Innomatics Data Science Internship Entrance Test
                                                                                     - By Amit Ojahiya

###  Importing all the required libraries   

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt

### Loading all the Datasets 

In [None]:

# Read the four CSV files (relative paths, so they are resolved against the
# current working directory) and bind each resulting DataFrame to its own name.
links_data, movies_data, ratings_data, tags_data = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

### Question 2 : Getting the shape of the movies.csv file 

In [None]:
# `shape` is a DataFrame attribute (not a method call) holding (n_rows, n_columns);
# leaving it as the cell's last expression makes the notebook display the tuple.
movies_data.shape

### Question 3 : Getting the shape of 'ratings.csv' file  

In [None]:
# (n_rows, n_columns) of the DataFrame loaded from 'ratings.csv'.
ratings_data.shape

### Question 4 : Getting the number of unique 'userId' from 'ratings_data'  

In [None]:
# Count the distinct values in the 'userId' column of the ratings table
# (missing values, if any, are not counted).
unique_users = ratings_data.loc[:, 'userId'].nunique(dropna=True)

# Display the count as the cell output.
unique_users

### Question 5 : Getting the movie which has received the maximum number of user ratings

In [None]:
# Number of rating rows per movie; the resulting Series is indexed by movieId.
movie_rating_counts = ratings_data.groupby('movieId').size()

# idxmax() returns the index label (here: the movieId) holding the largest count.
max_rated_movie_id = movie_rating_counts.idxmax()

# Resolve that movieId to a title in the movies table, taking the first match.
max_rated_movie_title = (
    movies_data
    .loc[movies_data['movieId'].eq(max_rated_movie_id), 'title']
    .iloc[0]
)

print("Movie with the maximum number of ratings:", max_rated_movie_title)
print("Movie ID with the maximum number of ratings:", max_rated_movie_id)


### Question 6 : Getting all the correct tags submitted by users to "Matrix, The (1999)" movie  

In [None]:
# Look the movie up by its exact title; the selection is empty when the title
# is not present in movies_data.
matrix_movie = movies_data[movies_data['title'] == 'Matrix, The (1999)']

if not matrix_movie.empty:
    # movieId of the first matching row
    matrix_movie_id = matrix_movie['movieId'].values[0]

    # Keep only the tag rows submitted for that movie
    matrix_tags = tags_data[tags_data['movieId'] == matrix_movie_id]

    # Report each distinct tag once, even if it was submitted several times
    unique_tags = matrix_tags['tag'].unique()
    print("Tags for 'Matrix, The (1999)':")
    print(unique_tags)
else:
    # Same fallback as the other title look-ups in this notebook, instead of
    # an IndexError raised by indexing an empty array with [0].
    print("Movie not found")

### Question 7 : Getting the avg user rating for the movie named "Terminator 2: Judgment Day (1991)" 

In [None]:
# Select the row(s) whose title is exactly "Terminator 2: Judgment Day (1991)".
terminator_movie = movies_data.loc[
    movies_data['title'].eq('Terminator 2: Judgment Day (1991)')
]

if terminator_movie.empty:
    print("Movie not found")
else:
    # movieId of the first matching row
    terminator_movie_id = terminator_movie.iloc[0]['movieId']

    # Mean of all ratings recorded for that movieId
    average_rating = ratings_data.loc[
        ratings_data['movieId'].eq(terminator_movie_id), 'rating'
    ].mean()

    print(f"Average user rating for 'Terminator 2: Judgment Day (1991)': {average_rating:.2f}")

### Question 8 :  Getting the data distribution of user ratings for "Fight Club (1999)" movie

In [None]:
# Select the row(s) whose title is exactly "Fight Club (1999)".
fight_club_movie = movies_data.loc[movies_data['title'].eq('Fight Club (1999)')]

if fight_club_movie.empty:
    print("Movie not found")
else:
    # movieId of the first matching row
    fight_club_movie_id = fight_club_movie.iloc[0]['movieId']

    # All ratings recorded for that movieId
    fight_club_ratings = ratings_data.loc[
        ratings_data['movieId'].eq(fight_club_movie_id), 'rating'
    ]

    # Histogram of the ratings: 10 equal-width bins spanning the observed range
    plt.hist(fight_club_ratings, bins=10, edgecolor='black')
    plt.title('User Ratings Distribution for Fight Club (1999)')
    plt.xlabel('Rating')
    plt.ylabel('Frequency')
    plt.show()

- The above distribution exhibits **left-skewness**. A left-skewed distribution means that most of the data points are on the right side, and the tail extends towards the left

## Creating a Filtered Dataset using the mentioned Mandatory Operations for solving the further questions

In [None]:
# Per-movie rating statistics: number of ratings and their mean, with movieId
# turned back into a regular column and the statistics given descriptive names.
grouped_ratings = (
    ratings_data
    .groupby('movieId')['rating']
    .agg(['count', 'mean'])
    .reset_index()
    .rename(columns={'count': 'rating_count', 'mean': 'rating_mean'})
)

# Inner join: keep only movies present in both movies_data and the statistics.
merged_data = movies_data.merge(grouped_ratings, on='movieId', how='inner')

# Keep movies that have strictly more than 50 user ratings.
filtered_data = merged_data.loc[merged_data['rating_count'].gt(50)]

# Preview the first rows of the result.
filtered_data.head()

### Question 9 : Getting the most popular movie based on the avg user rating  

In [None]:
# Order the filtered movies from highest to lowest mean rating.
sorted_data = filtered_data.sort_values('rating_mean', ascending=False)

# The first row after sorting is the movie with the highest mean rating.
most_popular_movie = sorted_data.iloc[0]

print("Most popular movie based on average user ratings:")
print("Title:", most_popular_movie['title'])
print("Average Rating:", most_popular_movie['rating_mean'])

### Question 10 : Getting the top 5 popular movies based on the number of user ratings  

In [None]:
# Order the filtered movies from most-rated to least-rated.
sorted_by_ratings_count = filtered_data.sort_values('rating_count', ascending=False)

# The first five rows are the five movies with the most user ratings.
top_5_popular_movies = sorted_by_ratings_count.iloc[:5]

# Show only the title and the number of ratings for those movies.
print("Top 5 popular movies based on number of user ratings:")
print(top_5_popular_movies.loc[:, ['title', 'rating_count']])

### Question 11 : Getting the 3rd most popular Sci-Fi movie based on the number of user ratings  

In [None]:
# Keep the filtered movies whose genres string mentions 'Sci-Fi'.
sci_fi_movies = filtered_data.loc[filtered_data['genres'].str.contains('Sci-Fi')]

# Order them from most-rated to least-rated.
sorted_sci_fi_by_ratings = sci_fi_movies.sort_values('rating_count', ascending=False)

# Positions are 0-based, so position 2 is the third row of the ranking.
third_most_popular_sci_fi = sorted_sci_fi_by_ratings.iloc[2]

print("Third most popular Sci-Fi movie based on number of user ratings:")
print("Title:", third_most_popular_sci_fi['title'])
print("Number of User Ratings:", third_most_popular_sci_fi['rating_count'])

### Performing WebScraping for getting the IMDB Reviews columns in the filtered dataset 

In [None]:
# Attach the 'imdbId' column from links_data to the filtered movies.
# A left join keeps every row of filtered_data, even when links_data has no
# entry for its movieId (imdbId is then missing for that row).
filtered_data_with_imdb = filtered_data.merge(
    links_data.loc[:, ['movieId', 'imdbId']],
    on='movieId',
    how='left',
)

# Preview the result, now including the 'imdbId' column.
filtered_data_with_imdb.head()

In [None]:
import requests
import numpy as np
from bs4 import BeautifulSoup  
def scrapper(imdbId):
    """Fetch the rating text shown on a title's IMDb page.

    Parameters
    ----------
    imdbId : int, float or str
        Numeric IMDb identifier without the ``tt`` prefix. Missing values
        (``None`` / NaN) are accepted.

    Returns
    -------
    str or float
        The text of the first ``<span itemprop="ratingValue">`` element, or
        ``np.nan`` when the id is missing, the server does not answer with a
        success status, or no such element exists in the page.

    Raises
    ------
    requests.RequestException
        If the request fails at the network level or exceeds the timeout.
    """
    # A missing id cannot be turned into a URL (int(NaN) raises ValueError),
    # so report "no rating" for it.
    if pd.isna(imdbId):
        return np.nan

    # The URL uses the id left-padded with zeros to seven digits; longer ids
    # are left unchanged.
    title_id = str(int(imdbId)).zfill(7)
    URL = f"https://www.imdb.com/title/tt{title_id}/"

    # Accept-Encoding is deliberately left to requests: forcing 'br' can yield
    # a body that cannot be decoded when no Brotli decoder is installed.
    request_header = {'Content-Type': 'text/html; charset=UTF-8',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0'}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(URL, headers=request_header, timeout=10)

    # An error page contains no rating; skip parsing it.
    if not response.ok:
        return np.nan

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the notebook reports that this scraping approach errors;
    # this selector may not match IMDb's current markup - confirm.
    imdb_rating = soup.find('span', {'itemprop': 'ratingValue'})
    return imdb_rating.text if imdb_rating else np.nan


Since this method is generating an error, let's attempt a different approach.

* We'll bring in the IMDB Dataset and conduct data cleaning to retain the necessary columns only.

In [None]:
# Read 'imdb.csv' (relative path, resolved against the current working
# directory) into a DataFrame.

imdb = pd.read_csv('imdb.csv')

# Preview the first rows of the loaded data.
imdb.head()

In [None]:
# List the column labels of the IMDb DataFrame to see which fields are available.

imdb.columns

- For solving the remaining questions we only need the **'Series_Title'**, **'Genre'**, and **'IMDB_Rating'** columns

In [None]:
# Keep only the title, genre and rating columns of the IMDb dataset.
imdb_data_filtered = imdb.loc[:, ['Series_Title', 'Genre', 'IMDB_Rating']]

# Preview the reduced dataset.
imdb_data_filtered.head()

### Question 12 : Getting the 'movieId' of the movie which has the highest IMDB Rating  

- Approach: We can get the movie with the highest IMDB rating from the 'imdb_data_filtered' dataset and then find its corresponding 'movieId' in the 'filtered_data_with_imdb' dataset

In [None]:
# Largest value in the IMDB_Rating column.
highest_rating = imdb_data_filtered['IMDB_Rating'].max()

# Every row that reaches that rating (there may be more than one).
highest_rated_movies = imdb_data_filtered.loc[
    imdb_data_filtered['IMDB_Rating'].eq(highest_rating)
]

# Titles of those rows.
highest_rated_movie_names = highest_rated_movies.loc[:, 'Series_Title']

print(f"The name of the movie with the highest rating ({highest_rating}):")
print(highest_rated_movie_names)

- So, The Shawshank Redemption is the highest IMDB Rated movie

Now, let's get its movieId from the movies_data dataset or the filtered_data_with_imdb dataset

In [None]:
# movieId of the first row whose title is exactly 'Shawshank Redemption, The (1994)'.
movie_id_shawshank = (
    filtered_data_with_imdb
    .loc[filtered_data_with_imdb['title'].eq('Shawshank Redemption, The (1994)'), 'movieId']
    .to_numpy()[0]
)

print(f"The 'movieId' of 'Shawshank Redemption, The (1994)' is: {movie_id_shawshank}")


### Question 13 : Getting the 'movieId' of the 'Sci-Fi' movie which has the highest IMDB Rating  

* Approach: We can get the Sci-Fi movie with the highest IMDB rating from the 'imdb_data_filtered' dataset and then find its corresponding 'movieId' in the 'filtered_data_with_imdb' dataset

In [None]:
# Rows of the IMDb dataset whose Genre mentions 'Sci-Fi' (case-insensitive).
# Note: this rebinds `sci_fi_movies`, which the Question 11 cell bound to a
# subset of filtered_data.
sci_fi_movies = imdb_data_filtered.loc[
    imdb_data_filtered['Genre'].str.contains('Sci-Fi', case=False)
]

# idxmax() gives the index label of the highest IMDB_Rating; .loc fetches that row.
highest_rated_sci_fi_movie = sci_fi_movies.loc[sci_fi_movies['IMDB_Rating'].idxmax()]
highest_rated_sci_fi_movie_title = highest_rated_sci_fi_movie['Series_Title']

print("Highest-rated Sci-Fi movie:")
print(highest_rated_sci_fi_movie_title)


* So, Inception is the highest IMDB-rated Sci-Fi movie


Now, let's get its movieId from the movies_data dataset or the filtered_data_with_imdb dataset

In [None]:
# movieId of the first row whose title is exactly 'Inception (2010)'.
movie_id_inception = filtered_data_with_imdb.loc[filtered_data_with_imdb['title'] == 'Inception (2010)', 'movieId'].values[0]

# Interpolate the id inside the f-string. Previously it was passed to print()
# as a separate set literal, so the output showed it wrapped in braces.
print(f"The 'movieId' of 'Inception' is: {movie_id_inception}")
