In [2]:
import pandas as pd

movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
links = pd.read_csv('data/links.csv')

print(movies.head())
print(ratings.head())
print(links.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [3]:
print("movies df")
print(movies.isna().sum())

print("\nratings df")
print(ratings.isna().sum())

print("\nlinks df")
print(links.isna().sum())

links[links['tmdbId'].isna()]

missing_link_movies = pd.merge(links,movies,how="inner",on="movieId")
missing_link_movies[missing_link_movies['tmdbId'].isna()][['title']]

movies df
movieId    0
title      0
genres     0
dtype: int64

ratings df
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

links df
movieId    0
imdbId     0
tmdbId     8
dtype: int64


Unnamed: 0,title
624,"Last Klezmer: Leopold Kozlowski, His Life and ..."
843,Loser (1991)
2141,Saturn 3 (1980)
3027,Horrors of Spider Island (Ein Toter Hing im Ne...
5532,"Decalogue, The (Dekalog) (1989)"
5854,Eros (2004)
6059,Babylon 5
7382,"No. 1 Ladies' Detective Agency, The (2008)"


# Visualize the Average Ratings of Each Genre

### Merged Datasets

In [4]:
movies_and_ratings = pd.merge(movies,ratings,how="inner",on='movieId')[['movieId','title','genres','rating']]

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt



In [6]:
movies_and_ratings['genres'] = movies_and_ratings['genres'].str.split('|')

print(movies_and_ratings.head())

   movieId             title  \
0        1  Toy Story (1995)   
1        1  Toy Story (1995)   
2        1  Toy Story (1995)   
3        1  Toy Story (1995)   
4        1  Toy Story (1995)   

                                              genres  rating  
0  [Adventure, Animation, Children, Comedy, Fantasy]     4.0  
1  [Adventure, Animation, Children, Comedy, Fantasy]     4.0  
2  [Adventure, Animation, Children, Comedy, Fantasy]     4.5  
3  [Adventure, Animation, Children, Comedy, Fantasy]     2.5  
4  [Adventure, Animation, Children, Comedy, Fantasy]     4.5  


In [7]:
movies_and_ratings = movies_and_ratings.explode('genres')
print(movies_and_ratings.head())

   movieId             title     genres  rating
0        1  Toy Story (1995)  Adventure     4.0
0        1  Toy Story (1995)  Animation     4.0
0        1  Toy Story (1995)   Children     4.0
0        1  Toy Story (1995)     Comedy     4.0
0        1  Toy Story (1995)    Fantasy     4.0


In [8]:
genre_avg = (
    movies_and_ratings.groupby('genres')['rating']
    .mean()
    .round(3)
    .sort_values(ascending=False)
)
genre_avg = pd.DataFrame(genre_avg)
print(genre_avg)

                    rating
genres                    
Film-Noir            3.920
War                  3.808
Documentary          3.798
Crime                3.658
Drama                3.656
Mystery              3.632
Animation            3.630
IMAX                 3.618
Western              3.584
Musical              3.564
Adventure            3.509
Romance              3.507
Thriller             3.494
Fantasy              3.491
(no genres listed)   3.489
Sci-Fi               3.456
Action               3.448
Children             3.413
Comedy               3.385
Horror               3.258


In [9]:
import plotly.express as px

fig = px.bar(
    genre_avg,
    x=genre_avg.index,
    y='rating',
    title='Average Movie Rating per Genre',
    labels={
        'genres': 'Movie Genre',       # x-axis label
        'rating': 'Average Rating',      # y-axis label
    }
)
fig.show()

# Visualizing # Ratings Per Genre

In [38]:
num_ratings = (
    movies_and_ratings.groupby('genres')[['rating']]
    .count()
    .sort_values(by = 'rating', ascending=False)
)

ratings_fig = px.histogram(
    num_ratings,
    x=num_ratings.index,
    y='rating',
    title= 'Number of Ratings per Genre',
)

ratings_fig.update_layout(yaxis_title="Number of Ratings")
ratings_fig.show()

# Visualize Number Movies per Genre

In [37]:
movies_genres = movies.copy()
movies_genres['genres'] = movies_genres['genres'].str.split('|')
movies_genres = movies_genres.explode('genres')


In [None]:
num_movies  = (
    movies_genres.groupby('genres')[['title']]
    .count()
    .sort_values(by = 'title', ascending=False)
)

movies_fig = px.histogram(
    num_movies,
    x=num_movies.index,
    y='title',
    title= 'Number of Movies per Genre',
    labels={"title": "movies"},     # Fixing the hover data to say "sum of movies"
    hover_data=["title"],
)

movies_fig.update_layout(yaxis_title="Number of Movies")
movies_fig.show()

# Visualize Ratings to Genre Ratio

In [64]:
genre_rating_ratio = (
    (num_ratings['rating'] / num_movies['title'])
    .reset_index(name='rating_to_movie_ratio')
)

ratio_fig = px.histogram(
    genre_rating_ratio,
    x='genres',
    y='rating_to_movie_ratio',
    title= 'Ratings to Genre Ratio',
    # labels={"title": "movies"},     # Fixing the hover data to say "sum of movies"
    # hover_data=["title"],
)

ratio_fig.update_layout(yaxis_title="Number of Movies")
ratio_fig.show()

# Visualize top 10 highest rated movies

In [None]:
movies_and_ratings_grouped = (
    movies_and_ratings.groupby(['movieId','title'])['rating']
    .mean()
    .round(3)
    .sort_values(ascending=False)
)
movies_and_ratings_grouped = pd.DataFrame(movies_and_ratings_grouped).reset_index()

highest_rated_movies = movies_and_ratings_grouped.merge(
    ratings.groupby('movieId').size().reset_index(name='num_ratings'), 
    on='movieId', 
    how='left'
    ).sort_values(by=['rating','num_ratings'], ascending=False)

highest_rated_movies.head(10)

Unnamed: 0,movieId,title,rating,num_ratings
24,6818,Come and See (Idi i smotri) (1985),5.0,2
26,99,Heidi Fleiss: Hollywood Madam (1995),5.0,2
32,1151,Lesson Faust (1994),5.0,2
53,78836,Enter the Void (2009),5.0,2
127,3473,Jonah Who Will Be 25 in the Year 2000 (Jonas q...,5.0,2
226,6442,Belle époque (1992),5.0,2
273,53,Lamerica (1994),5.0,2
0,187717,Won't You Be My Neighbor? (2018),5.0,1
1,6983,Jane Eyre (1944),5.0,1
2,5328,Rain (2001),5.0,1


In [84]:
most_rated_movies = highest_rated_movies.sort_values(by='num_ratings', ascending=False)
most_rated_movies.head(10)

Unnamed: 0,movieId,title,rating,num_ratings
1035,356,Forrest Gump (1994),4.164,329
722,318,"Shawshank Redemption, The (1994)",4.429,317
971,296,Pulp Fiction (1994),4.197,307
1038,593,"Silence of the Lambs, The (1991)",4.161,279
972,2571,"Matrix, The (1999)",4.192,278
935,260,Star Wars: Episode IV - A New Hope (1977),4.231,251
3116,480,Jurassic Park (1993),3.75,238
1233,110,Braveheart (1995),4.032,237
2296,589,Terminator 2: Judgment Day (1991),3.971,224
940,527,Schindler's List (1993),4.225,220


# Visualize the Distribution of Ratings for Each Genre

In [11]:
for genre in genre_avg.index:
    genre_ratings = movies_and_ratings[movies_and_ratings['genres'] == genre]
    
    # Printing the distribution
    dist_fig = px.histogram(
        genre_ratings,
        x='rating',
        title= f'{genre} Movie Rating Distribution',
        labels = {
            'rating' : 'Rating Values',
            'count' : 'Number of Ratings',
        }
    )

    dist_fig.update_traces(
        marker=dict(
            line=dict(
                color='black', width=1.0
                )
            )
        )

    dist_fig.show()

    #Printing the box plot
    box_fig = px.box(
        genre_ratings,
        x='rating',
        title = f'{genre} Box Plot',
    )

    box_fig.show()



In [12]:
num_genres = (
    movies_and_ratings.groupby('genres')[['movieId']]
    .count()
    # .sort_values(ascending=False)
)

# print(num_ratings)

ratings_fig = px.histogram(
    num_ratings,
    x=num_ratings.index,
    y='rating',
    title= 'Number of Ratings per Genre',
)

ratings_fig.update_layout(yaxis_title="Number of Ratings")

ratings_fig.show()