In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Importing 'Comedy' Movie data
comedy = pd.read_csv('data/comedy_mov.csv')

In [3]:
comedy.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Salome Mad,1909,Comedy,3.9,25
1,Házasodik az uram,1913,Comedy,3.5,37
2,Die Insel der Seligen,1913,"Comedy,Fantasy",4.6,77
3,A Regiment of Two,1913,"Comedy,Drama",6.3,27
4,Wo ist Coletti?,1913,"Comedy,Crime",6.3,52


In [4]:
len(comedy)

89496

In [5]:
# Sorting the DataFrame from earliest to latest
comedy = comedy.sort_values(by='Year', ignore_index=True) 
comedy.head()


Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Solser en Hesse,1900,Comedy,3.0,12
1,Lika mot lika,1906,Comedy,3.5,32
2,Salome Mad,1909,Comedy,3.9,25
3,Uma Licao de Maxixe,1909,Comedy,4.6,13
4,La Chicanera,1909,"Comedy,Musical",4.6,13


In [6]:
# Grouping by decade and counting entries
comedy['Decade'] = (comedy['Year'] // 10) * 10
decade_counts = comedy.groupby('Decade').size().reset_index(name='Count')

# Printing results
print(decade_counts)

# Saving results to a text file
with open('data/comedy_decades.txt', 'w') as f:
    f.write(decade_counts.to_string())

    Decade  Count
0     1900      7
1     1910    343
2     1920    838
3     1930   3129
4     1940   2930
5     1950   3885
6     1960   5457
7     1970   6391
8     1980   7445
9     1990   8705
10    2000  15020
11    2010  24165
12    2020  11181


In [None]:
# Plotly bar
fig = px.bar(decade_counts, x='Decade', y='Count', # color='Decade', 
             color_discrete_sequence=['yellow'], # Comedy is usually  yellow
             title='Comedy Releases by Decade')
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
# Label the bar for 2010 pre-covid vertically
fig.add_annotation(x=2010, y=decade_counts.loc[decade_counts['Decade'] == 2010, 'Count'].values[0],
                   text="Pre-COVID",
                   showarrow=True,
                   arrowhead=2,
                   ax=0,
                   ay=-40,
                   font=dict(size=12))
fig.show()


In [8]:
# Find 10 highest rated Comedy titles
top_comedy = comedy.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_comedy.to_string(index=False))

                                                              Title  Rating
                                   Auf den Spuren des Hans im Glück    10.0
                                                Derelitto & Castigo    10.0
                                                      Lost in Vyond    10.0
                                             Planet Disagreements 8    10.0
                                                       The Premiere    10.0
                                               Not looking for love    10.0
                                       Don Gil von den grünen Hosen     9.9
                                                Hauptsache Minister     9.9
Was nicht im Baedecker steht: Bitte, einsteigen zu Käses Rundfahrt!     9.8
                                                Ben Blue's Brothers     9.8


In [9]:
# Find the average rating amongst all 'Comedy" titles
average_rating = comedy['Rating'].mean()
print("Average Rating:", average_rating)

Average Rating: 5.960273084830608


In [10]:
# Find 10 highest voted 'Comedy' titles
top_vote = comedy.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote.insert(1, ' ', ' ')
# Print list
print(top_vote.to_string(index=False))

                  Title     Votes
The Wolf of Wall Street   1667563
     Back to the Future   1366267
Guardians of the Galaxy   1309685
        The Truman Show   1265436
               Deadpool   1197049
                     Up   1172027
           Finding Nemo   1151971
              Toy Story   1113795
         Monsters, Inc.   1019885
           Intouchables    970397


In [None]:
# Plotly pie
fig_pie = px.pie(top_vote, names='Title', values='Votes', title='Top 10 Voted Comedy Films')
fig_pie.update_layout(title_x=0.45)  # Adjust the title placement
fig_pie.show()

In [12]:
# Find the average vote count amongst all 'Comedy' titles
average_votes = comedy['Votes'].mean()
print("Average Vote Count:", average_rating)

Average Vote Count: 5.960273084830608
