In [1]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
import upsetplot
from altair_saver import save

In [2]:
# Load the movie table from a CSV file located relative to the working directory.
movies = pd.read_csv('movies.csv')
# Drop movies that have no genre information at all.
movies = movies.dropna(subset=["genres"])
# Keep only movies with more than 1000 IMDb votes, to exclude entries whose
# ratings may be severely biased by a tiny sample. The explicit .copy() makes
# the filtered frame independent, so the column assignment below cannot raise
# a SettingWithCopyWarning.
movies = movies.loc[movies["imdb_votes"] > 1000].copy()
# Reduce the comma-separated genre string to its first listed genre.
# Vectorised .str accessors replace the former per-row iterrows()/.loc loop.
movies['genres'] = movies['genres'].str.split(', ').str[0]
movies.head(10)

Unnamed: 0.1,Unnamed: 0,title,release_date,revenue,runtime,budget,original_language,genres,imdb_rating,imdb_votes
0,7,Finding Nemo,2003-05-30,940335500.0,100.0,94000000.0,en,Animation,8.2,1131808.0
1,15,My Life Without Me,2003-03-07,12300000.0,106.0,2500000.0,en,Drama,7.4,26005.0
2,17,Pirates of the Caribbean: The Curse of the Bla...,2003-07-09,655011200.0,143.0,140000000.0,en,Adventure,8.1,1232790.0
3,18,Kill Bill: Vol. 1,2003-10-10,180906100.0,111.0,30000000.0,en,Action,8.2,1217716.0
4,19,Jarhead,2005-11-04,97076150.0,123.0,72000000.0,en,Drama,7.0,210770.0
5,21,9 Songs,2004-07-16,1574623.0,69.0,1000000.0,en,Drama,4.8,24311.0
6,24,The Simpsons Movie,2007-07-25,527068900.0,87.0,75000000.0,en,Animation,7.3,353570.0
7,25,Eternal Sunshine of the Spotless Mind,2004-03-19,72258130.0,108.0,20000000.0,en,Science Fiction,8.3,1097479.0
8,27,Pirates of the Caribbean: Dead Man's Chest,2006-07-06,1065700000.0,151.0,200000000.0,en,Adventure,7.4,780004.0
9,28,A History of Violence,2005-09-23,60740830.0,96.0,32000000.0,en,Drama,7.4,257978.0


In [3]:
# Genres to feature; the result keeps this order, three rows per genre.
genres = ['Drama', 'Romance', 'Adventure', 'Fantasy',
       'Action', 'Comedy', 'Science Fiction',
       'Thriller', 'Horror', 'Documentary']
# For each genre take the three movies with the highest IMDb rating, then
# concatenate all slices in a single call. Growing a DataFrame with pd.concat
# inside a loop re-copies every accumulated row on each iteration, and seeding
# it with an empty DataFrame triggers a deprecation warning in recent pandas.
top_genre_movies = pd.concat(
    [
        movies.loc[movies["genres"] == genre].nlargest(3, "imdb_rating")
        for genre in genres
    ]
)
top_genre_movies.head(10)

Unnamed: 0.1,Unnamed: 0,title,release_date,revenue,runtime,budget,original_language,genres,imdb_rating,imdb_votes
22,116,The Dark Knight,2008-07-16,1004558000.0,152.0,185000000.0,en,Drama,9.0,2928359.0
113,846,The Prestige,2006-10-17,109676300.0,130.0,40000000.0,en,Drama,8.5,1469264.0
139,947,The Departed,2006-10-04,291465000.0,151.0,90000000.0,en,Drama,8.5,1446693.0
105,758,Cinderella Man,2005-06-02,108539900.0,144.0,88000000.0,en,Romance,8.0,200591.0
2470,94659,Her,2013-12-18,47351250.0,126.0,23000000.0,en,Romance,8.0,683700.0
2403,85133,Before Midnight,2013-04-05,11176470.0,109.0,3000000.0,en,Romance,7.9,175190.0
19,86,The Lord of the Rings: The Return of the King,2003-12-17,1118889000.0,201.0,94000000.0,en,Adventure,9.0,2017756.0
2484,96741,Interstellar,2014-11-05,701729200.0,169.0,165000000.0,en,Adventure,8.7,2168644.0
3017,183548,Avengers: Endgame,2019-04-24,2799439000.0,181.0,356000000.0,en,Adventure,8.4,1302252.0
720,5307,How to Train Your Dragon,2010-03-18,494879500.0,98.0,165000000.0,en,Fantasy,8.1,816375.0


In [4]:
# Horizontal channel: one bar per movie title, ordered by descending y value
# (the rating), with tick labels tilted 45 degrees.
title_channel = alt.X("title:N").sort('-y').axis(labelAngle=45)

# Vertical channel: IMDb rating on a scale limited to 5-10 with clamping
# enabled; the axis title is unrotated, left-aligned and placed via the
# titleX/titleY offsets.
rating_channel = (
    alt.Y("imdb_rating:Q")
    .scale(domain=(5, 10), clamp=True)
    .axis(titleAngle=0, titleAlign="left", titleY=-2, titleX=0)
)

# Bars are coloured by the movie's genre.
genre_channel = alt.Color("genres:N")

bars = alt.Chart(
    top_genre_movies,
    title="Ratings For The Top 3 Movies In Each Genre",
).mark_bar()
chart = bars.encode(title_channel, rating_channel, genre_channel)
chart