In [11]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px

In [12]:
# Load the documentary movie dataset from the project's data folder
docu = pd.read_csv(filepath_or_buffer='data/documentary_mov.csv')

In [13]:
# Preview the first five rows of the freshly loaded frame
docu.head(n=5)

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,555
1,Life of Villa,1912,"Documentary,War",7.8,33
2,Dr. Mawson in the Antarctic,1913,Documentary,5.7,28
3,The Adventures of Buffalo Bill,1917,"Documentary,Western",6.4,28
4,"Joliet Prison, Joliet, Ill.",1914,Documentary,5.8,10


In [14]:
# Number of rows (titles) in the dataset
docu.shape[0]

67529

In [15]:
# Order titles chronologically (oldest first) and renumber the index from 0
docu = docu.sort_values(by='Year').reset_index(drop=True)
docu.head(n=5)

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Birmingham,1896,Documentary,4.0,22
1,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,555
2,Reproduction of the Corbett and Fitzsimmons Fight,1897,"Documentary,News,Sport",4.3,65
3,Dressing Paper Dolls,1898,Documentary,3.3,27
4,69th Regiment Passing in Review,1898,Documentary,3.6,24


In [16]:
# Bucket every title into its decade (e.g. 1897 -> 1890), then tally titles per bucket
docu['Decade'] = docu['Year'].floordiv(10).mul(10)
decade_counts = (
    docu.groupby('Decade')
        .size()
        .reset_index(name='Count')
)

# Show the per-decade table
print(decade_counts)

# Persist the same table as plain text
with open('data/documentary_decades.txt', 'w') as out_file:
    out_file.write(decade_counts.to_string())

    Decade  Count
0     1890     11
1     1900     16
2     1910     54
3     1920    106
4     1930    128
5     1940    170
6     1950    264
7     1960    754
8     1970   1495
9     1980   1984
10    1990   4275
11    2000  15103
12    2010  29634
13    2020  13535


In [None]:
# Bar chart of documentary counts per decade
fig = px.bar(
    decade_counts,
    x='Decade',
    y='Count',
    title='Documentary Releases by Decade',
    color_discrete_sequence=['gold'],  # one gold fill shared by all bars
)
# Thin dark outline around each bar
fig.update_traces(marker_line_width=1, marker_line_color='DarkSlateGrey')
# Annotate the top of the 2010 bar; ax/ay offset the label from the arrow head
fig.add_annotation(
    x=2010,
    y=decade_counts.loc[decade_counts['Decade'].eq(2010), 'Count'].iloc[0],
    text="Pre-COVID",
    showarrow=True,
    arrowhead=2,
    ax=0,
    ay=-40,
    font={'size': 12},
)
fig.show()


In [18]:
# Ten highest-rated 'Documentary' titles (ties keep their current row order)
top_docu = docu.nlargest(n=10, columns='Rating').loc[:, ['Title', 'Year', 'Rating']]
# Print the list without the index column
print(top_docu.to_string(index=False))

                             Title  Year  Rating
                  Broadway Legends  2002    10.0
 Bio jednom jedan... Dusko Radovic  2006    10.0
                           Carraco  2022    10.0
                        COMPLEXion  2023    10.0
             Of All the Gin Joints  2023    10.0
    Paradise (bunnies and flowers)  2023    10.0
  Retratos de República Dominicana  2024    10.0
        Opioids: The Hidden Crisis  2024    10.0
  Butch Cassidy's Forgotten Outlaw  2025    10.0
Ghostland: A Journey Through Syria  2025    10.0


In [19]:
# Mean rating across every 'Documentary' title
average_rating = docu.loc[:, 'Rating'].mean()
print("Average Rating:", average_rating)

Average Rating: 7.1872913859230865


In [21]:
# Ten most-voted 'Documentary' titles
top_vote = docu.nlargest(n=10, columns='Votes')[['Title', 'Votes']]
# Blank spacer column so the printed Title and Votes columns are visually separated
top_vote.insert(loc=1, column=' ', value=' ')
# Print the list without the index column
print(top_vote.to_string(index=False))

                Title    Votes
Bowling for Columbine   150473
      Fahrenheit 9/11   133596
        Super Size Me   115270
   Jackass: The Movie   104080
   The Social Dilemma    91696
An Inconvenient Truth    85937
   Jackass Number Two    81379
            Free Solo    81133
           Inside Job    80926
                Senna    80117


In [None]:
# Pie chart: share of votes among the ten most-voted titles
fig_pie = px.pie(
    data_frame=top_vote,
    values='Votes',
    names='Title',
    title='Top 10 Voted Documentary Films',
)
# Nudge the title horizontally to x=0.46 of the figure width
fig_pie.update_layout(title={'x': 0.46})
fig_pie.show()

In [None]:
# Find the average vote count amongst all 'Documentary' titles
average_votes = docu['Votes'].mean()
# Report the value computed above (previously this printed `average_rating` by mistake)
print("Average Vote Count:", average_votes)