In [95]:
import pandas as pd
import plotly.express as px

from mbmutils import mu

In [96]:
titles = pd.read_csv(mu.get_full_path("data/titles.csv"))
credits = pd.read_csv(mu.get_full_path("data/credits.csv"))

In [97]:
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,27.612,8.2
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,18.216,7.8
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,17.505,7.8
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,95.337,7.7


In [98]:
titles.groupby("release_year")["release_year"].count()

release_year
1945      1
1953      1
1954      2
1956      1
1958      1
       ... 
2018    774
2019    848
2020    805
2021    758
2022    217
Name: release_year, Length: 67, dtype: int64

In [99]:
totals = titles.groupby('release_year').agg(count=('release_year', 'count'),
                                            imdb_votes=('imdb_votes', 'sum'),
                                            avg_score=('imdb_score', 'mean')).reset_index()

totals["avg_votes"] = totals["imdb_votes"] / totals["count"]

totals

Unnamed: 0,release_year,count,imdb_votes,avg_score,avg_votes
0,1945,1,0.0,,0.000000
1,1953,1,231.0,6.800000,231.000000
2,1954,2,43592.0,7.450000,21796.000000
3,1956,1,590.0,6.700000,590.000000
4,1958,1,4385.0,7.500000,4385.000000
...,...,...,...,...,...
62,2018,774,6845715.0,6.530559,8844.593023
63,2019,848,8729219.0,6.526168,10293.890330
64,2020,805,6507700.0,6.344444,8084.099379
65,2021,758,6639992.0,6.319068,8759.883905


In [104]:
px.histogram(totals,
             x="release_year",
             nbins=15, )

In [105]:
px.bar(totals[totals.release_year >= 2000],
       x="release_year",
       y="count",
       color="avg_votes")

In [102]:
x = titles.groupby(['type'])['type'].count()
y = len(titles)
r = ((x / y)).round(2)

mf_ratio = pd.DataFrame(r)
mf_ratio.columns = ["percent"]

mf_ratio = mf_ratio.reset_index()

mf_ratio

Unnamed: 0,type,percent
0,MOVIE,0.65
1,SHOW,0.35


In [112]:
chart = px.pie(mf_ratio, names="type",
               values="percent",
               labels={"type"},
               hole=0.15,
               color_discrete_sequence=["#b20710", "#221f1f"],
               height=550,
               )
chart.update_traces(textposition='inside',
                    textinfo='percent+label',
                    marker=dict(line=dict(color='#221f1f', width=6)),
                    pull=[0, 0, 0.2, 0],
                    rotation=180, )

chart.update_layout(
    showlegend=False,
    # font_family="Courier New",
    # font_color="white",
    font_size=19,
    # title_font_family="Times New Roman",
    # title_font_color="red",
    # legend_title_font_color="green"
)