In [58]:
# Import required libraries
import gzip
import shutil
import os
import pandas as pd
import plotly.express as px

In [59]:
# Extract the movies.csv.gz file
with gzip.open('data/movies.csv.gz', 'rb') as f_in:
    with open('data/movies.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [60]:
# Importing Cleaned Movie data
movies = pd.read_csv('data/movies.csv')

In [61]:
# Verify dataframe loaded
movies.head()


Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Miss Jerry,1894.0,Romance,5.3,222
1,The Corbett-Fitzsimmons Fight,1897.0,"Documentary,News,Sport",5.2,551
2,The Story of the Kelly Gang,1906.0,"Action,Adventure,Biography",6.0,976
3,L'enfant prodigue,1907.0,Drama,5.6,31
4,Robbery Under Arms,1907.0,Drama,4.3,28


In [62]:
# Check number of entries
len(movies)

363630

In [63]:
# Check datatypes
data_types = movies.dtypes
print(data_types)

Title      object
Year      float64
Genres     object
Rating    float64
Votes       int64
dtype: object


In [64]:
# Convert 'Genres' column to string
movies['Genres'] = movies['Genres'].astype(str)

In [65]:
# Count unique entries in the 'Genres' column
unique_genres_count = movies['Genres'].nunique()
print(unique_genres_count)

1256


In [66]:
# Exploding the 'genres' column
genres_exploded = movies['Genres'].str.split(',').explode().reset_index(drop=True)

genres_exploded.head()

0        Romance
1    Documentary
2           News
3          Sport
4         Action
Name: Genres, dtype: object

In [67]:
# Count unique entries in Genres column, print count
unique_genres_count = genres_exploded.nunique()
print(unique_genres_count)

27


In [68]:
# DataFrame movies in column genres count how many times each entry from unique_genres_count exists, print results
genre_counts = movies['Genres'].str.get_dummies(sep=',').sum().reset_index()
genre_counts.columns = ['Genre', 'Count']
print(genre_counts)

          Genre   Count
0        Action   35070
1         Adult       1
2     Adventure   21473
3     Animation    7171
4     Biography   12841
5        Comedy   89291
6         Crime   32051
7   Documentary   67320
8         Drama  164613
9        Family   15579
10      Fantasy   11154
11    Film-Noir     864
12    Game-Show      17
13      History   11329
14       Horror   25647
15        Music   10419
16      Musical    7801
17      Mystery   14729
18         News     710
19   Reality-TV     184
20      Romance   39723
21       Sci-Fi    8621
22        Sport    4779
23    Talk-Show      44
24     Thriller   30760
25          War    7356
26      Western    5684


In [69]:
# from genre_counts print list of the 10 Genres using count column
top_genres = genre_counts.nlargest(10, 'Count')
print(top_genres)
with open('data/top_genres.txt', 'w') as f:
    for index, row in top_genres.iterrows():
        f.write(f"{row['Genre']}: {row['Count']}\n")
        # save top_genres as csv
top_genres.to_csv('data/top_genres.csv', index=False)

          Genre   Count
8         Drama  164613
5        Comedy   89291
7   Documentary   67320
20      Romance   39723
0        Action   35070
6         Crime   32051
24     Thriller   30760
14       Horror   25647
2     Adventure   21473
9        Family   15579


In [70]:
# Plotly bar 'top_genres'
fig = px.bar(top_genres, x='Genre', y='Count', title='Top 10 Genres',
            color_discrete_sequence=['yellow'],) # IMDB likes yellow
# Outline bars for clarity
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
fig.show()

In [71]:
# Find 10 highest rated 'Movie' titles
top_rating = movies.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating.to_string(index=False))

                                            Title  Rating
                 Auf den Spuren des Hans im Glück    10.0
                                 Broadway Legends    10.0
                                          Kaputol    10.0
                                       D on Dance    10.0
                              Rainy in Glenageary    10.0
                                It's a Love Thang    10.0
Love Live! Series 9th Anniversary LOVE LIVE! FEST    10.0
                               Olu Bliss: Dive In    10.0
                            Tetonica Castro: Home    10.0
                                             Ixel    10.0


In [72]:
# Find the average rating amongst all titles
average_rating = movies['Rating'].mean()
print("Average Rating:", average_rating)

Average Rating: 6.231308472898275


In [73]:
# Find 10 highest voted 'Movie' titles
top_vote = movies.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote.insert(1, ' ', ' ')
# Print list
print(top_vote.to_string(index=False))

                                            Title     Votes
                         The Shawshank Redemption   3013434
                                  The Dark Knight   2989960
                                        Inception   2657138
                                       Fight Club   2436128
                                     Forrest Gump   2355891
                                     Pulp Fiction   2311032
                                     Interstellar   2296921
                                       The Matrix   2135884
                                    The Godfather   2103011
The Lord of the Rings: The Fellowship of the Ring   2088682


In [74]:
# Plotly pie for 'top_vote'
fig_pie = px.pie(top_vote, names='Title', values='Votes', title='Top 10 Votes Distribution')
fig_pie.update_layout(title_x=0.395)  # Adjust the title placement
fig_pie.show()

In [75]:
# Find the average vote count amongst all titles
average_votes = movies['Votes'].mean()
print("Average Vote Count:", average_rating)

Average Vote Count: 6.231308472898275


In [76]:
# Importing 'Drama'' Movie data
drama = pd.read_csv('data/drama_mov.csv')

In [77]:
# Intro 'drama'
drama.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,L'enfant prodigue,1907.0,Drama,5.6,31
1,Robbery Under Arms,1907.0,Drama,4.3,28
2,Amleto,1908.0,Drama,3.2,32
3,Don Quijote,1908.0,Drama,4.3,23
4,Hamlet,1910.0,Drama,4.5,41


In [78]:
# Count 'Drama' titles
len(drama)

164613

In [79]:
# Sorting the DataFrame from earliest to latest
drama = drama.sort_values(by='Year', ignore_index=True) # Cleaner look
drama.head()


Unnamed: 0,Title,Year,Genres,Rating,Votes
0,La vie et la passion de Jésus Christ,1903.0,"Biography,Drama",6.5,752
1,S. Lubin's Passion Play,1903.0,Drama,4.4,11
2,Dingjunshan,1905.0,Drama,6.3,53
3,L'enfant prodigue,1907.0,Drama,5.6,31
4,Violante,1907.0,Drama,3.4,19


In [80]:
# Grouping by decade and counting entries
drama['Decade'] = (drama['Year'] // 10) * 10
decade_counts_drama = drama.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_drama)

# Saving results to a text file
with open('data/drama_decades.txt', 'w') as f:
    f.write(decade_counts_drama.to_string())

    Decade  Count
0   1900.0     18
1   1910.0   1261
2   1920.0   2350
3   1930.0   4733
4   1940.0   4827
5   1950.0   7466
6   1960.0  10289
7   1970.0  12932
8   1980.0  14843
9   1990.0  16255
10  2000.0  26073
11  2010.0  42994
12  2020.0  20572


In [81]:
# Plotly bar 'Drama' title release by decade
fig = px.bar(decade_counts_drama, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['firebrick'], # Drama is usually  red
             title='Drama Releases by Decade')
# Outline bars for clarity
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_drama.loc[decade_counts_drama['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [82]:
# Find 10 highest rated 'Drama' titles
top_rating_drama = drama.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_drama.to_string(index=False))

                                 Title  Rating
                              The Poet    10.0
The Secret Diaries of the Film Censors    10.0
                         Ka Mon Bajwat    10.0
                                  Ixel    10.0
                             Displaced    10.0
                      7 Days in a Coma    10.0
                   Rainy in Glenageary    10.0
                               Kaputol    10.0
                            D on Dance    10.0
                                Rijali    10.0


In [83]:
# Find the average rating amongst all 'Drama' titles
average_rating_drama = drama['Rating'].mean()
print("Average Rating:", average_rating_drama)

Average Rating: 6.257878782356193


In [84]:
# Find 10 highest voted 'Drama' titles
top_vote_drama = drama.nlargest(10, 'Votes')[['Title', 'Votes']]
# Add space between columns for easier reading by adding a blank column with (2) spaces
top_vote_drama.insert(1, ' ', ' ')
# Print list
print(top_vote_drama.to_string(index=False))

                                            Title     Votes
                         The Shawshank Redemption   3013434
                                  The Dark Knight   2989960
                                       Fight Club   2436128
                                     Forrest Gump   2355891
                                     Pulp Fiction   2311032
                                     Interstellar   2296921
                                    The Godfather   2103011
The Lord of the Rings: The Fellowship of the Ring   2088682
    The Lord of the Rings: The Return of the King   2059827
                            The Dark Knight Rises   1896633


In [85]:
# Plotly pie distribution of 'Drama' titles top votes
fig_pie = px.pie(top_vote_drama, names='Title', values='Votes', title='Top 10 Voted Drama Films')
fig_pie.update_layout(title_x=0.4)  # Adjust the title placement
fig_pie.show()

In [86]:
# Find the average vote count amongst all 'Drama' titles
average_votes_drama = drama['Votes'].mean()
print("Average Vote Count:", average_votes_drama)

Average Vote Count: 3708.965045288039


In [87]:
# Importing 'Comedy' Movie data
comedy = pd.read_csv('data/comedy_mov.csv')

In [88]:
# Intro 'comedy'
comedy.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Salome Mad,1909.0,Comedy,3.8,24
1,Házasodik az uram,1913.0,Comedy,3.5,37
2,Die Insel der Seligen,1913.0,"Comedy,Fantasy",4.6,77
3,A Regiment of Two,1913.0,"Comedy,Drama",6.3,27
4,Wo ist Coletti?,1913.0,"Comedy,Crime",6.3,51


In [89]:
# Count 'Comedy' titles
len(comedy)

89291

In [90]:
# Sorting the DataFrame from earliest to latest
comedy = comedy.sort_values(by='Year', ignore_index=True) 
comedy.head()


Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Solser en Hesse,1900.0,Comedy,2.7,11
1,Lika mot lika,1906.0,Comedy,3.4,31
2,Um Cavalheiro Deveras Obsequioso,1909.0,Comedy,4.7,21
3,Salome Mad,1909.0,Comedy,3.8,24
4,La Chicanera,1909.0,"Comedy,Musical",4.6,13


In [91]:
# Grouping by decade and counting entries
comedy['Decade'] = (comedy['Year'] // 10) * 10
decade_counts_comedy = comedy.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_comedy)

# Saving results to a text file
with open('data/comedy_decades.txt', 'w') as f:
    f.write(decade_counts_comedy.to_string())

    Decade  Count
0   1900.0      6
1   1910.0    306
2   1920.0    838
3   1930.0   3129
4   1940.0   2927
5   1950.0   3883
6   1960.0   5450
7   1970.0   6387
8   1980.0   7438
9   1990.0   8705
10  2000.0  15015
11  2010.0  24151
12  2020.0  11056


In [92]:
# Plotly bar 'Comedy' title release by decade
fig = px.bar(decade_counts_comedy, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['yellow'], # Comedy is usually  yellow
             title='Comedy Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_comedy.loc[decade_counts_comedy['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()

In [93]:
# Find 10 highest rated Comedy titles
top_rating_comedy = comedy.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_comedy.to_string(index=False))

                                                              Title  Rating
                                   Auf den Spuren des Hans im Glück    10.0
                                   Las locuras del Dr. Arisos Tenes    10.0
                                                      Lost in Vyond    10.0
                                                        Introvertic    10.0
                                       Don Gil von den grünen Hosen     9.9
                                                Hauptsache Minister     9.9
                                                   Bad Psychiatrist     9.9
Was nicht im Baedecker steht: Bitte, einsteigen zu Käses Rundfahrt!     9.8
                                                Ben Blue's Brothers     9.8
                                                          Quadrille     9.8


In [94]:
# Find the average rating amongst all 'Comedy" titles
average_rating_comedy = comedy['Rating'].mean()
print("Average Rating:", average_rating_comedy)

Average Rating: 5.95986829579689


In [95]:
# Find 10 highest voted 'Comedy' titles
top_vote_comedy = comedy.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_comedy.insert(1, ' ', ' ')
# Print list
print(top_vote_comedy.to_string(index=False))

                  Title     Votes
       Django Unchained   1770268
The Wolf of Wall Street   1663985
     Back to the Future   1363981
Guardians of the Galaxy   1308133
        The Truman Show   1262442
               Deadpool   1194045
                     Up   1170170
           Finding Nemo   1150177
              Toy Story   1112155
         Monsters, Inc.   1018346


In [96]:
# Plotly pie distribution of 'Comedy' titles top votes
fig_pie = px.pie(top_vote_comedy, names='Title', values='Votes', title='Top 10 Voted Comedy Films')
fig_pie.update_layout(title_x=0.45)  # Adjust the title placement
fig_pie.show()

In [97]:
# Find the average vote count amongst all 'Comedy' titles
average_votes_comedy = comedy['Votes'].mean()
print("Average Vote Count:", average_votes_comedy)

Average Vote Count: 4045.885117201062


In [98]:
# Importing 'Documentary' Movie data
docu = pd.read_csv('data/documentary_mov.csv')

In [99]:
# Intro docu
docu.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Corbett-Fitzsimmons Fight,1897.0,"Documentary,News,Sport",5.2,551
1,Life of Villa,1912.0,"Documentary,War",7.8,33
2,Dr. Mawson in the Antarctic,1913.0,Documentary,5.7,28
3,The Adventures of Buffalo Bill,1917.0,"Documentary,Western",6.4,27
4,"Joliet Prison, Joliet, Ill.",1914.0,Documentary,5.8,10


In [100]:
# Count 'Documentary' titles
len(docu)

67320

In [101]:
# Sorting the DataFrame from earliest to latest
docu = docu.sort_values(by='Year', ignore_index=True) 
docu.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Birmingham,1896.0,Documentary,4.0,22
1,The Corbett-Fitzsimmons Fight,1897.0,"Documentary,News,Sport",5.2,551
2,Reproduction of the Corbett and Fitzsimmons Fight,1897.0,"Documentary,News,Sport",4.3,65
3,Saída dos Operários do Arsenal da Marinha,1898.0,Documentary,4.7,11
4,A Rua Augusta em Dia de Festa,1898.0,Documentary,3.0,10


In [102]:
# Grouping by decade and counting entries
docu['Decade'] = (docu['Year'] // 10) * 10
decade_counts_docu = docu.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_docu)

# Saving results to a text file
with open('data/documentary_decades.txt', 'w') as f:
    f.write(decade_counts_docu.to_string())

    Decade  Count
0   1890.0     11
1   1900.0     16
2   1910.0     50
3   1920.0    106
4   1930.0    128
5   1940.0    170
6   1950.0    262
7   1960.0    752
8   1970.0   1494
9   1980.0   1980
10  1990.0   4270
11  2000.0  15094
12  2010.0  29611
13  2020.0  13376


In [103]:
# Plotly bar 'Documentary' title release by decade
fig = px.bar(decade_counts_docu, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['gold'], # Historical is usually  gold
             title='Documentary Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_docu.loc[decade_counts_docu['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()

In [104]:
# Find 10 highest rated "Documentary" titles"
top_rating_docu = docu.nlargest(10, 'Rating')[['Title', 'Year', 'Rating']]
# Print list
print(top_rating_docu.to_string(index=False))

                            Title   Year  Rating
                 Broadway Legends 2002.0    10.0
Bio jednom jedan... Dusko Radovic 2006.0    10.0
                          Carraco 2022.0    10.0
                       COMPLEXion 2023.0    10.0
   Paradise (bunnies and flowers) 2023.0    10.0
 Retratos de República Dominicana 2024.0    10.0
       Opioids: The Hidden Crisis 2024.0    10.0
                          Inbound 2025.0    10.0
 Making of Sash! With My Own Eyes 2000.0     9.9
                         Kot ptic 2006.0     9.9


In [105]:
# Find the average rating amongst all 'Documentary' titles
average_rating_docu = docu['Rating'].mean()
print("Average Rating:", average_rating_docu)

Average Rating: 7.1867988710635755


In [106]:
# Find 10 highest voted 'Documentary' titles
top_vote_docu = docu.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_docu.insert(1, ' ', ' ')
# Print list
print(top_vote_docu.to_string(index=False))

                Title    Votes
Bowling for Columbine   150408
      Fahrenheit 9/11   133554
        Super Size Me   115207
   Jackass: The Movie   104011
   The Social Dilemma    91622
An Inconvenient Truth    85902
   Jackass Number Two    81327
            Free Solo    80994
           Inside Job    80863
                Senna    79995


In [107]:
# Plotly pie distribution of 'Documentary' titles top votes
fig_pie = px.pie(top_vote_docu, names='Title', values='Votes', title='Top 10 Voted Documentary Films')
fig_pie.update_layout(title_x=0.46)  # Adjust the title placement
fig_pie.show()

In [108]:
# Find the average vote count amongst all 'Comedy' titles
average_votes_docu = docu['Votes'].mean()
print("Average Vote Count:", average_votes_docu)

Average Vote Count: 262.96200237670826


In [109]:
# Importing 'Romance' Movie data
romance = pd.read_csv('data/romance_mov.csv')

In [110]:
# Intro romance
romance.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Miss Jerry,1894.0,Romance,5.3,222
1,La dame aux camélias,1912.0,"Drama,Romance",5.3,45
2,Amor fatal,1911.0,"Drama,Romance",7.5,24
3,Anny - en gatepiges roman,1912.0,"Drama,Romance",4.6,17
4,Den glade løjtnant,1912.0,Romance,3.8,11


In [111]:
# Count 'Romance' titles
len(romance)

39723

In [112]:
# Sorting the DataFrame from earliest to latest
romance = romance.sort_values(by='Year', ignore_index=True) 
romance.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Miss Jerry,1894.0,Romance,5.3,222
1,A Viúva Alegre,1909.0,Romance,5.3,21
2,Sumurûn,1910.0,Romance,5.0,31
3,Amor fatal,1911.0,"Drama,Romance",7.5,24
4,Arrah-Na-Pogue,1911.0,"Drama,Romance",3.2,28


In [113]:
# Grouping by decade and counting entries
romance['Decade'] = (romance['Year'] // 10) * 10
decade_counts_romance = romance.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_romance)

# Saving results to a text file
with open('data/romance_decades.txt', 'w') as f:
    f.write(decade_counts_romance.to_string())

    Decade  Count
0   1890.0      1
1   1900.0      1
2   1910.0    230
3   1920.0    841
4   1930.0   2224
5   1940.0   1600
6   1950.0   2036
7   1960.0   2544
8   1970.0   2415
9   1980.0   2787
10  1990.0   3946
11  2000.0   6600
12  2010.0   9611
13  2020.0   4887


In [114]:
# Plotly bar 'Romance' title release by decade
fig = px.bar(decade_counts_romance, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['orange'], # Musicals are usually  orange
             title='Romance Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_romance.loc[decade_counts_romance['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [115]:
# Find 10 highest rated 'Romance' titles
top_romance_romance = romance.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_romance_romance.to_string(index=False))

                                        Title  Rating
Dekh Mujhe Bhi - Syed Fardeen and Shweta Jean    10.0
                                  The College     9.9
                        Pyar Kiya Toh Nibhana     9.8
                               Peluang Ketiga     9.8
                    Yello Jogappa Ninnaramane     9.8
                             Pop Lock 'n Roll     9.7
                                  Get Over It     9.7
                               Tahanan (Home)     9.7
                          A Ghetto Love Story     9.7
                                      Othello     9.6


In [116]:
# Find the average rating amongst all 'Romance' titles
average_rating_romance = romance['Rating'].mean()
print("Average Rating:", average_rating_romance)

Average Rating: 6.090688014500416


In [118]:
# Find 10 highest voted 'Romance' titles
top_vote_romance = romance.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_romance.insert(1, ' ', ' ')
# Print list
print(top_vote_romance.to_string(index=False))

                                Title     Votes
                         Forrest Gump   2355891
                              Titanic   1328713
Eternal Sunshine of the Spotless Mind   1125792
                    Good Will Hunting   1124684
                  Slumdog Millionaire    892227
  Le fabuleux destin d'Amélie Poulain    813889
                      La vita è bella    770987
              Silver Linings Playbook    756705
  The Curious Case of Benjamin Button    717025
                                  Her    696551


In [119]:
# Plotly pie distribution of 'Romance' titles top votes
fig_pie = px.pie(top_vote_romance, names='Title', values='Votes', title='Top 10 Voted Romance Films')
fig_pie.update_layout(title_x=0.46)  # Adjust the title placement
fig_pie.show()

In [120]:
# Find the average vote count amongst all 'Romance' titles
average_votes_romance = romance['Votes'].mean()
print("Average Vote Count:", average_votes_romance)

Average Vote Count: 3698.346751252423


In [121]:
# Importing 'Action' Movie data
action = pd.read_csv('data/action_mov.csv')

In [122]:
# Intro action
action.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Story of the Kelly Gang,1906.0,"Action,Adventure,Biography",6.0,976
1,What Happened to Mary,1912.0,"Action,Drama,Thriller",6.2,36
2,Who Will Marry Mary?,1913.0,"Action,Adventure",5.2,29
3,Cameo Kirby,1914.0,"Action,Drama,Romance",6.5,18
4,The Exploits of Elaine,1914.0,Action,6.2,107


In [123]:
# Count 'Action' titles
len(action)

35070

In [124]:
# Sorting the DataFrame from earliest to latest
action = action.sort_values(by='Year', ignore_index=True) 
action.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Story of the Kelly Gang,1906.0,"Action,Adventure,Biography",6.0,976
1,Chûshingura,1910.0,"Action,Drama",5.6,29
2,Attack on the Gold Escort,1911.0,"Action,Drama",4.2,26
3,What Happened to Mary,1912.0,"Action,Drama,Thriller",6.2,36
4,Cooee and the Echo,1912.0,"Action,Adventure",5.4,25


In [125]:
# Grouping by decade and counting entries
action['Decade'] = (action['Year'] // 10) * 10
decade_counts_action = action.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_action)

# Saving results to a text file
with open('data/action_decades.txt', 'w') as f:
    f.write(decade_counts_action.to_string())

    Decade  Count
0   1900.0      1
1   1910.0    118
2   1920.0    416
3   1930.0    659
4   1940.0    507
5   1950.0    720
6   1960.0   1657
7   1970.0   3122
8   1980.0   4058
9   1990.0   4672
10  2000.0   5252
11  2010.0   9216
12  2020.0   4672


In [126]:
# Plotly bar 'Action' title release by decade
fig = px.bar(decade_counts_action, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['dodgerblue'], # Action is usually  blue
             title='Action Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_action.loc[decade_counts_action['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()

In [127]:
# Find 10 highest rated 'Action' titles
top_action_action = action.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_action_action.to_string(index=False))

                                        Title  Rating
                          The last USSR blues    10.0
                                       Vo tme    10.0
                 The Treasure of Pancho Villa     9.9
                                Tujhko Pukare     9.8
                                      One Way     9.8
                           Azotes de Barrio 2     9.8
                                      The RVM     9.8
                         Susuko ba ako, inay?     9.7
                             The Knight Squad     9.7
OF THE SEA: a film about California Fishermen     9.7


In [128]:
# Find the average rating amongst all 'Action' titles
average_rating_action = action['Rating'].mean()
print("Average Rating:", average_rating_action)

Average Rating: 5.720379241516967


In [129]:
# Find 10 highest voted 'Action' titles
top_vote_action = action.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_action.insert(1, ' ', ' ')
# Print list
print(top_vote.to_string(index=False))

                                            Title     Votes
                         The Shawshank Redemption   3013434
                                  The Dark Knight   2989960
                                        Inception   2657138
                                       Fight Club   2436128
                                     Forrest Gump   2355891
                                     Pulp Fiction   2311032
                                     Interstellar   2296921
                                       The Matrix   2135884
                                    The Godfather   2103011
The Lord of the Rings: The Fellowship of the Ring   2088682


In [130]:
# Plotly pie distribution of 'Action' titles top votes
fig_pie = px.pie(top_vote_action, names='Title', values='Votes', title='Top 10 Voted Action Films')
fig_pie.update_layout(title_x=0.4)  # Adjust the title placement
fig_pie.show()

In [131]:
# Find the average vote count amongst all 'Action' titles
average_votes_action = action['Votes'].mean()
print("Average Vote Count:", average_votes_action)

Average Vote Count: 10736.526575420587


In [132]:
# Importing 'Crime' Movie data
crime = pd.read_csv('data/crime_mov.csv')

In [133]:
# Intro crime
crime.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Ansigttyven I,1910.0,Crime,3.9,21
1,Zigomar contre Nick Carter,1912.0,"Crime,Thriller",6.0,54
2,What 80 Million Women Want,1913.0,"Crime,Drama,Romance",4.1,56
3,Fantômas I: À l'ombre de la guillotine,1913.0,"Crime,Drama",6.9,2612
4,In the Bishop's Carriage,1913.0,"Crime,Drama",5.6,27


In [134]:
# Count 'Crime' titles
len(crime)

32051

In [135]:
# Sorting the DataFrame from earliest to latest
crime = crime.sort_values(by='Year', ignore_index=True) 
crime.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Ansigttyven I,1910.0,Crime,3.9,21
1,Harry the Footballer,1911.0,"Adventure,Crime,Drama",4.3,34
2,Zigomar contre Nick Carter,1912.0,"Crime,Thriller",6.0,54
3,Le mystère des roches de Kador,1912.0,"Crime,Drama",6.6,452
4,L'enfant de Paris,1913.0,"Crime,Drama",7.2,483


In [136]:
# Grouping by decade and counting entries
crime['Decade'] = (crime['Year'] // 10) * 10
decade_counts_crime = crime.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_crime)

# Saving results to a text file
with open('data/crime_decades.txt', 'w') as f:
    f.write(decade_counts_crime.to_string())

    Decade  Count
0   1910.0    138
1   1920.0    294
2   1930.0   1334
3   1940.0   1163
4   1950.0   1737
5   1960.0   2290
6   1970.0   2858
7   1980.0   2845
8   1990.0   3715
9   2000.0   4623
10  2010.0   7262
11  2020.0   3792


In [137]:
# Plotly bar 'Crime' title release by decade
fig = px.bar(decade_counts_crime, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['purple'], # Crime is usually purple
             title='Crime Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_crime.loc[decade_counts_crime['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [138]:
# Find 10 highest rated 'Crime' titles
top_crime_crime = crime.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_crime_crime.to_string(index=False))

                                    Title  Rating
La vida por mi barrio 13 (Mafia mexicana)    10.0
                      Der Mann von drüben     9.8
                       Party im Zwielicht     9.8
                       Juventud en drogas     9.8
                            Tujhko Pukare     9.8
                                Asatveera     9.8
                            Dheera Samrat     9.8
                                   Redrum     9.7
                                      4N6     9.7
           Die Dame in der schwarzen Robe     9.6


In [139]:
# Find the average rating amongst all 'Crime' titles
average_rating_crime = crime['Rating'].mean()
print("Average Rating:", average_rating_crime)

Average Rating: 6.008676796355808


In [140]:
# Find 10 highest voted 'Crime' titles
top_vote_crime = crime.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_crime.insert(1, ' ', ' ')
# Print list
print(top_vote_crime.to_string(index=False))

                   Title     Votes
         The Dark Knight   2989960
            Pulp Fiction   2311032
           The Godfather   2103011
                   Se7en   1888593
 The Wolf of Wall Street   1663985
The Silence of the Lambs   1617769
                   Joker   1589935
            The Departed   1474380
          The Green Mile   1470910
   The Godfather Part II   1416708


In [141]:
# Plotly pie distribution of 'Crime' titles top votes
fig_pie = px.pie(top_vote_crime, names='Title', values='Votes', title='Top 10 Voted Crime Films')
fig_pie.update_layout(title_x=0.45)  # Adjust the title placement
fig_pie.show()

In [142]:
# Find the average vote count amongst all 'Crime' titles
average_votes_crime = crime['Votes'].mean()
print("Average Vote Count:", average_votes_crime)

Average Vote Count: 7033.99007831269


In [143]:
# Importing 'Thriller' Movie data
thriller = pd.read_csv('data/thriller_mov.csv')

In [144]:
# Intro thriller
thriller.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,What Happened to Mary,1912.0,"Action,Drama,Thriller",6.2,36
1,Zigomar contre Nick Carter,1912.0,"Crime,Thriller",6.0,54
2,Der Andere,1913.0,"Drama,Thriller",5.4,126
3,"The $5,000,000 Counterfeiting Plot",1914.0,"Crime,Thriller",6.8,29
4,After Five,1915.0,"Comedy,Crime,Thriller",4.8,26


In [145]:
# Count 'Thriller' titles
len(thriller)

30760

In [146]:
# Sorting the DataFrame from earliest to latest
thriller = thriller.sort_values(by='Year', ignore_index=True) 
thriller.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,One Hundred Years Ago,1911.0,"Drama,Thriller",2.3,19
1,Zigomar contre Nick Carter,1912.0,"Crime,Thriller",6.0,54
2,What Happened to Mary,1912.0,"Action,Drama,Thriller",6.2,36
3,Strike,1912.0,"Drama,Thriller",5.0,12
4,"Zigomar, peau d'anguille - Épisode 1: La résur...",1913.0,"Action,Thriller",5.8,23


In [147]:
# Grouping by decade and counting entries
thriller['Decade'] = (thriller['Year'] // 10) * 10
decade_counts_thriller = thriller.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_thriller)

# Saving results to a text file
with open('data/thriller_decades.txt', 'w') as f:
    f.write(decade_counts_thriller.to_string())

    Decade  Count
0   1910.0     34
1   1920.0     63
2   1930.0    142
3   1940.0    226
4   1950.0    457
5   1960.0    880
6   1970.0   1500
7   1980.0   1908
8   1990.0   3379
9   2000.0   4826
10  2010.0  10477
11  2020.0   6868


In [148]:
# Plotly bar 'Thriller' title release by decade
fig = px.bar(decade_counts_thriller, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['aqua'], # Thriller is basically action so another blue
             title='Thriller Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_thriller.loc[decade_counts_thriller['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [149]:
# Find 10 highest rated 'Thriller' titles
top_rating_thriller = thriller.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_thriller.to_string(index=False))

                    Title  Rating
                   Vo tme    10.0
            Alone at Home     9.8
    The Trees of the East     9.8
                    Rugna     9.8
   The Sound of Southside     9.8
   Nasoor - Let's Restart     9.8
                    Ajaan     9.8
            Dheera Samrat     9.8
        The Platinum Loop     9.8
Virinchi Independent Film     9.7


In [150]:
# Find the average rating amongst all 'Thriller' titles
average_rating_thriller = thriller['Rating'].mean()
print("Average Rating:", average_rating_thriller)

Average Rating: 5.598881664499351


In [151]:
# Find 10 highest voted 'Thriller' titles
top_vote_thriller = thriller.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_thriller.insert(1, ' ', ' ')
# Print list
print(top_vote_thriller.to_string(index=False))

                   Title     Votes
   The Dark Knight Rises   1896633
The Silence of the Lambs   1617769
                   Joker   1589935
          Shutter Island   1531324
            The Departed   1474380
                 Memento   1367851
       Kill Bill: Vol. 1   1239769
          Reservoir Dogs   1124287
               Gone Girl   1110965
  No Country for Old Men   1109883


In [152]:
# Plotly pie distribution of 'Thriller' titles top votes
fig_pie = px.pie(top_vote_thriller, names='Title', values='Votes', title='Top 10 Voted Thriller Films')
fig_pie.update_layout(title_x=0.45)  # Adjust the title placement
fig_pie.show()

In [153]:
# Find the average vote count amongst all 'Thriller' titles
average_votes_thriller = thriller['Votes'].mean()
print("Average Vote Count:", average_votes_thriller)

Average Vote Count: 6627.625292587776


In [154]:
# Importing 'Horror' Movie data
horror = pd.read_csv('data/horror_mov.csv')

In [155]:
# Intro horror
horror.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Der Student von Prag,1913.0,"Drama,Fantasy,Horror",6.4,2533
1,The Avenging Conscience: or 'Thou Shalt Not Kill',1914.0,"Crime,Drama,Horror",6.4,1504
2,The Ghost Breaker,1914.0,"Adventure,Horror",4.8,49
3,Der Golem,1914.0,Horror,6.7,1280
4,Der Hund von Baskerville,1914.0,"Crime,Horror,Mystery",5.6,167


In [156]:
# Count 'Horror' titles
len(horror)

25647

In [157]:
# Sorting the DataFrame from earliest to latest
horror = horror.sort_values(by='Year', ignore_index=True) 
horror.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Hidaka iriai zakura,1909.0,"Drama,Horror",5.9,15
1,Botan dôrô,1910.0,"Drama,Horror",4.5,13
2,Trilby,1912.0,Horror,3.9,33
3,Satana,1912.0,"Drama,Horror",5.3,37
4,I misteri della psiche,1912.0,"Drama,Fantasy,Horror",6.3,17


In [158]:
# Grouping by decade and counting entries
horror['Decade'] = (horror['Year'] // 10) * 10
decade_counts_horror = horror.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_horror)

# Saving results to a text file
with open('data/horror_decades.txt', 'w') as f:
    f.write(decade_counts_horror.to_string())

    Decade  Count
0   1900.0      1
1   1910.0     68
2   1920.0    101
3   1930.0    140
4   1940.0    190
5   1950.0    327
6   1960.0    721
7   1970.0   1483
8   1980.0   1805
9   1990.0   1673
10  2000.0   3882
11  2010.0   9593
12  2020.0   5663


In [159]:
# Plotly bar 'Horror' title release by decade
fig = px.bar(decade_counts_horror, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['limegreen'], # Horror is usually  dark green oddly
             title='Horror Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_horror.loc[decade_counts_horror['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()

In [160]:
# Find 10 highest rated 'Horror' titles
top_rating_horror = horror.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_horror.to_string(index=False))

                                     Title  Rating
            T.T.T. [Terror in Teruel Town]     9.6
              The Forest Through the Trees     9.6
                      Sandook - Ek Rahasya     9.5
                              Mashaarojinn     9.5
                   Guard: Revenge for Love     9.5
                             God Loves You     9.4
                    Dead Slate: Beginnings     9.4
Michael and Ghostface: Best Buds the Movie     9.4
                               Clownface 3     9.4
                       Happy Birthday Luci     9.4


In [161]:
# Find the average rating amongst all 'Horror' titles
average_rating_horror = horror['Rating'].mean()
print("Average Rating:", average_rating_horror)

Average Rating: 4.996252973057278


In [162]:
# Find 10 highest voted 'Horror' titles
top_vote_horror = horror.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_horror.insert(1, ' ', ' ')
# Print list
print(top_vote_horror.to_string(index=False))

          Title     Votes
    The Shining   1156771
          Alien   1006850
    I Am Legend    843154
         Aliens    802962
American Psycho    768157
        Get Out    749922
    World War Z    747182
         Psycho    746547
           Jaws    688133
     Zombieland    641092


In [163]:
# Plotly pie distribution of 'Horror' titles top votes
fig_pie = px.pie(top_vote_horror, names='Title', values='Votes', title='Top 10 Voted Horror Films')
fig_pie.update_layout(title_x=0.45)  # Adjust the title placement
fig_pie.show()

In [164]:
# Find the average vote count amongst all 'Horror' titles
average_votes_horror = horror['Votes'].mean()
print("Average Vote Count:", average_votes_horror)

Average Vote Count: 4763.918119078255


In [165]:
# Importing 'Adventure' Movie data
adventure = pd.read_csv('data/adventure_mov.csv')

In [166]:
# Intro adventure
adventure.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Story of the Kelly Gang,1906.0,"Action,Adventure,Biography",6.0,976
1,The Fairylogue and Radio-Plays,1908.0,"Adventure,Fantasy",5.2,78
2,Don Juan de Serrallonga,1910.0,"Adventure,Drama",3.5,22
3,L'inferno,1911.0,"Adventure,Drama,Fantasy",7.0,3739
4,The Adventures of Kathlyn,1913.0,Adventure,5.5,48


In [167]:
# Count 'Adventure' titles
len(adventure)

21473

In [168]:
# Sorting the DataFrame from earliest to latest
adventure = adventure.sort_values(by='Year', ignore_index=True) 
adventure.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Story of the Kelly Gang,1906.0,"Action,Adventure,Biography",6.0,976
1,The Fairylogue and Radio-Plays,1908.0,"Adventure,Fantasy",5.2,78
2,Sonho de Valsa,1909.0,"Adventure,Drama",2.4,25
3,Don Juan de Serrallonga,1910.0,"Adventure,Drama",3.5,22
4,L'inferno,1911.0,"Adventure,Drama,Fantasy",7.0,3739


In [169]:
# Grouping by decade and counting entries
adventure['Decade'] = (adventure['Year'] // 10) * 10
decade_counts_adventure = adventure.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_adventure)

# Saving results to a text file
with open('data/adventure_decades.txt', 'w') as f:
    f.write(decade_counts_adventure.to_string())

    Decade  Count
0   1900.0      3
1   1910.0    327
2   1920.0    761
3   1930.0    800
4   1940.0    764
5   1950.0   1241
6   1960.0   1914
7   1970.0   2074
8   1980.0   1857
9   1990.0   1584
10  2000.0   2474
11  2010.0   5393
12  2020.0   2281


In [170]:
# Plotly bar 'Adventure' title release by decade
fig = px.bar(decade_counts_adventure, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['lawngreen'], # Adventure is usually  green
             title='Adventure Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_adventure.loc[decade_counts_adventure['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()

In [171]:
# Find 10 highest rated 'Adventure' titles
top_rating_adventure = adventure.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_adventure.to_string(index=False))

                           Title  Rating
Auf den Spuren des Hans im Glück    10.0
               Independent Roads     9.9
    The Treasure of Pancho Villa     9.9
               Hansel and Gretel     9.8
             Flying Over Everest     9.8
                Buried in Tucson     9.8
                           Parto     9.8
                 The Inventurers     9.8
             McTaggart's Fortune     9.8
              Borderline Forever     9.8


In [172]:
# Find the average rating amongst all 'Adventure' titles
average_rating_adventure = adventure['Rating'].mean()
print("Average Rating:", average_rating_adventure)

Average Rating: 5.867484748288549


In [173]:
# Find 10 highest voted 'Adventure' titles
top_vote_adventure = adventure.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_adventure.insert(1, ' ', ' ')
# Print list
print(top_vote_adventure.to_string(index=False))

                                            Title     Votes
                                        Inception   2657138
                                     Interstellar   2296921
The Lord of the Rings: The Fellowship of the Ring   2088682
    The Lord of the Rings: The Return of the King   2059827
            The Lord of the Rings: The Two Towers   1856216
                                        Gladiator   1738157
                             Inglourious Basterds   1661387
                                        Star Wars   1496080
   Star Wars: Episode V - The Empire Strikes Back   1428476
                                           Avatar   1422305


In [174]:
# Plotly pie distribution of 'Adventure' titles top votes
fig_pie = px.pie(top_vote_adventure, names='Title', values='Votes', title='Top 10 Voted Adventure Films')
fig_pie.update_layout(title_x=0.397)  # Adjust the title placement
fig_pie.show()

In [175]:
# Find the average vote count amongst all 'Adventure' titles
average_votes_adventure = adventure['Votes'].mean()
print("Average Vote Count:", average_votes_adventure)

Average Vote Count: 14509.495412844037


In [176]:
# Importing 'Family' Movie data
family = pd.read_csv('data/family_mov.csv')

In [177]:
# Intro family
family.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Life of Moses,1909.0,"Biography,Drama,Family",5.5,65
1,"His Majesty, the Scarecrow of Oz",1914.0,"Adventure,Comedy,Family",5.3,553
2,The Patchwork Girl of Oz,1914.0,"Adventure,Comedy,Family",5.4,603
3,Alice in Wonderland,1915.0,"Adventure,Family,Fantasy",6.1,856
4,The Babes in the Woods,1917.0,"Drama,Family,Fantasy",5.7,43


In [178]:
# Count 'Family' titles
len(family)

15579

In [179]:
# Sorting the DataFrame from earliest to latest
family = family.sort_values(by='Year', ignore_index=True) 
family.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Life of Moses,1909.0,"Biography,Drama,Family",5.5,65
1,"His Majesty, the Scarecrow of Oz",1914.0,"Adventure,Comedy,Family",5.3,553
2,The Patchwork Girl of Oz,1914.0,"Adventure,Comedy,Family",5.4,603
3,Alice in Wonderland,1915.0,"Adventure,Family,Fantasy",6.1,856
4,Snow White,1916.0,"Adventure,Family,Fantasy",3.8,61


In [180]:
# Grouping by decade and counting entries
family['Decade'] = (family['Year'] // 10) * 10
decade_counts_family = family.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts_family)

# Saving results to a text file
with open('data/family_decades.txt', 'w') as f:
    f.write(decade_counts_family.to_string())

    Decade  Count
0   1900.0      1
1   1910.0     13
2   1920.0     44
3   1930.0    193
4   1940.0    295
5   1950.0    627
6   1960.0    824
7   1970.0   1280
8   1980.0   1632
9   1990.0   1543
10  2000.0   2271
11  2010.0   4920
12  2020.0   1936


In [182]:
# Plotly bar 'Family' title release by decade
fig = px.bar(decade_counts_family, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['yellow'], # Family films usually comical so yellow
             title='Family Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts_family.loc[decade_counts_family['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [183]:
# Find 10 highest rated 'Family' titles
top_rating_family = family.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating_family.to_string(index=False))

                           Title  Rating
                        The Poet    10.0
            Kids on Kids on Kids    10.0
Auf den Spuren des Hans im Glück    10.0
               It's a Love Thang    10.0
                     Dhh Lekacha     9.9
               Hansel and Gretel     9.8
                          Partav     9.8
               The Road to Truth     9.8
                    Amche Samsar     9.8
              An American Posada     9.8


In [184]:
# Find the average rating amongst all 'Family' titles
average_rating_family = family['Rating'].mean()
print("Average Rating:", average_rating_family)

Average Rating: 6.2178445343090045


In [185]:
# Find 10 highest voted 'Family' titles
top_vote_family = family.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote_family.insert(1, ' ', ' ')
# Print list
print(top_vote.to_string(index=False))

                                            Title     Votes
                         The Shawshank Redemption   3013434
                                  The Dark Knight   2989960
                                        Inception   2657138
                                       Fight Club   2436128
                                     Forrest Gump   2355891
                                     Pulp Fiction   2311032
                                     Interstellar   2296921
                                       The Matrix   2135884
                                    The Godfather   2103011
The Lord of the Rings: The Fellowship of the Ring   2088682


In [186]:
# Plotly pie distribution of 'Family' titles top votes
fig_pie = px.pie(top_vote_family, names='Title', values='Votes', title='Top 10 Voted Family Films')
fig_pie.update_layout(title_x=0.4)  # Adjust the title placement
fig_pie.show()

In [187]:
# Find the average vote count amongst all 'Family' titles
average_votes_family = family['Votes'].mean()
print("Average Vote Count:", average_votes_family)

Average Vote Count: 3485.1672122729315
