In [1]:
from typing import Dict, List, Set
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import json
import ast

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/netflix-movies-and-shows/imdb_movies_shows.csv


# Netflix Movies and Shows:
## Decoding Trends in Entertainment: A Deep Dive into Netflix's Content Library

In [2]:
df_netflix = pd.read_csv("/kaggle/input/netflix-movies-and-shows/imdb_movies_shows.csv").sort_values("release_year")

In [3]:
df_netflix["production_countries"] = df_netflix["production_countries"].apply(lambda x: ast.literal_eval(x))

In [4]:
df_netflix.head()

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],[US],1.0,,,
24,Raya and Sakina,MOVIE,1953,,105,"['drama', 'thriller', 'crime', 'history']",[EG],,tt0316472,6.8,231.0
28,The Blazing Sun,MOVIE,1954,,100,"['romance', 'crime', 'drama']",[EG],,tt0044429,7.4,1219.0
15,White Christmas,MOVIE,1954,,115,"['romance', 'comedy', 'music']",[US],,tt0047673,7.5,42373.0
22,Dark Waters,MOVIE,1956,,120,"['drama', 'action', 'romance', 'thriller']",[EG],,tt0049761,6.7,590.0


In [5]:
px.histogram(df_netflix, x="type", color="type", labels={"type": "Type"})

In [6]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_netflix["release_year"].unique(),
        y=df_netflix.groupby("release_year").count()["imdb_score"],
        name="Movies + Shows",
        mode="lines+markers",
    )
)

fig.add_trace(
    go.Scatter(
        x=df_netflix.query("type == 'MOVIE'")["release_year"].unique(),
        y=df_netflix.groupby(["release_year", "type"])
        .count()
        .reset_index()
        .query("type == 'MOVIE'")["imdb_score"],
        name="Movies",
        mode="lines+markers",
    )
)

fig.add_trace(
    go.Scatter(
        x=df_netflix.query("type == 'SHOW'")["release_year"].unique(),
        y=df_netflix.groupby(["release_year", "type"])
        .count()
        .reset_index()
        .query("type == 'SHOW'")["imdb_score"],
        name="Shows",
        mode="lines+markers",
    )
)

fig.update_layout(
    title="Shows and movies released over the years",
    xaxis_title="Release Year",
    yaxis_title="Count",
    legend_title="Legend",
)
fig.show()

2019 is the year with the most **movies and shows** released, and it is the same
year with the most **movies** released. However, for **shows** it falls just
short of 2018 by 8 shows released.

# Separating Movies and Shows into two different dataframes

In [7]:
df_shows = df_netflix.loc[df_netflix["type"] == "SHOW"].drop(columns="type")
df_movies = df_netflix.loc[df_netflix["type"] == "MOVIE"].drop(columns="type")

## Distribution of the age rating of Shows and Movies in 2019

In [8]:
px.histogram(
    df_movies.loc[df_movies["release_year"] == 2019],
    x="age_certification",
    category_orders={"age_certification": ["G", "PG", "PG-13", "R", "NC-17"]},
    title="Movies in 2019 by age certification",
    labels={"age_certification": "age certification"},
    color="age_certification",
)

In [9]:
px.histogram(
    df_shows.loc[df_shows["release_year"] == 2019],
    x="age_certification",
    category_orders={
        "age_certification": ["TV-Y", "TV-Y7", "TV-G", "TV-PG", "TV-14", "TV-MA"]
    },
    color="age_certification",
    title="Shows in 2019 by age certification",
    labels={"age_certification": "age certification"},
)

In [10]:
fig = go.Figure()

x = df_movies.dropna(subset="imdb_score")["release_year"].unique()
df_movies_year_rating = df_movies.dropna(subset="imdb_score").groupby("release_year")[
    "imdb_score"
]

fig.add_trace(
    go.Scatter(
        x=x,
        y=df_movies_year_rating.max(),
        mode="lines+markers",
        name="Maximum IMDB rating",
    )
)

y_err = df_movies.groupby("release_year")["imdb_score"].sem().round(2)
fig.add_trace(
    go.Scatter(
        x=x,
        y=df_movies_year_rating.mean().round(2),
        mode="lines+markers",
        name="Average IMDB rating",
        error_y=dict(
            type="data",
            array=y_err,
            visible=True,
        ),
        hovertemplate="Year: %{x}"
        + "<br>Rating: %{y:.2f} +- %{customdata[0]:.2f}"
        + "<br>Count: %{customdata[1]}",
        customdata=np.stack((y_err, df_movies_year_rating.count().values), axis=-1),
    )
)

fig.add_trace(
    go.Scatter(
        x=x,
        y=df_movies_year_rating.min(),
        mode="lines+markers",
        name="Minimum IMDB rating",
    )
)

fig.update_yaxes(range=[0, 10])

fig.update_layout(
    title="Movie ratings across the years",
    xaxis_title="Release Year",
    yaxis_title="Rating",
    legend_title="Legend",
)

In [11]:
fig = go.Figure()

x = df_shows.dropna(subset="imdb_score")["release_year"].unique()

df_shows_year_rating = df_shows.dropna(subset="imdb_score").groupby("release_year")[
    "imdb_score"
]
fig.add_trace(
    go.Scatter(
        x=x,
        y=df_shows_year_rating.max(),
        mode="lines+markers",
        name="Maximum IMDB rating",
        hovertemplate="Year: %{x}" + "<br> Max: %{y}",
    )
)

y_err = df_shows_year_rating.sem().round(2)
fig.add_trace(
    go.Scatter(
        x=x,
        y=df_shows_year_rating.mean().round(2),
        mode="lines+markers",
        name="Average IMDB rating",
        error_y=dict(
            type="data",
            array=y_err,
            visible=True,
        ),
        hovertemplate="Year: %{x}"
        + "<br>Rating: %{y:.2f} +- %{customdata[0]:.2f}"
        + "<br>Count: %{customdata[1]}",
        customdata=np.stack((y_err, df_shows_year_rating.count().values), axis=-1),
    )
)

fig.add_trace(
    go.Scatter(
        x=x,
        y=df_shows_year_rating.min(),
        mode="lines+markers",
        name="Minimum IMDB rating",
        hovertemplate="Year: %{x}" + "<br> Min: %{y}",
    )
)

fig.update_yaxes(range=[0, 10])

fig.update_layout(
    title="Show ratings across the years",
    xaxis_title="Release Year",
    yaxis_title="Rating",
    legend_title="Legend",
)

It seems that from 2000 onwards we see a strong decrease in the 'worst'
rating given to productions across both movies and shows.

df_netflix

In [12]:
px.histogram(x=df_netflix["production_countries"].str.len())

Entries with 0 production countries are removed as they are probably
faulty data.

In [13]:
df_production = df_netflix.loc[df_netflix["production_countries"].str.len() > 0]

In [14]:

# Separate by country production number and count productions per country.
fig = px.histogram(
    df_production,
    x=df_production.production_countries.str.len(),
    title="Number of Production Countries per Entry",
    color="type",
    barmode="group",
    histnorm="percent",
)

fig.update_layout(
    xaxis=dict(
        tickmode="linear", tick0=1, dtick=1, title="Number of Production Countries"
    )
)

Almost all shows, ca. 96%, are produced in 1 country. Wherease for movies
there is a sizeable amount, ca. 15%, that is produced in more than
one country.

In [15]:

df_production["production_countries"]

0       [US]
24      [EG]
28      [EG]
15      [US]
22      [EG]
        ... 
5439    [TR]
5096    [US]
5654    [PL]
5225    [US]
5284    [IN]
Name: production_countries, Length: 5574, dtype: object

In [16]:

# get a set of unique countries

set_of_countries = set()
df_production["production_countries"].apply(lambda x: set_of_countries.update(x))

# Check for integrity of all country ISO codes
for x in set_of_countries:
    if len(x) > 2:
        print(x)

Lebanon


It seems that Lebanon has been labeled twice, once in its full name and one in
its ISO country code 'LB'.

In [17]:

# Find and change that to LB
df_production.loc[
    df_production["production_countries"].apply(lambda x: "Lebanon" in x)
].index

Index([649], dtype='int64')

In [18]:
df_production.loc[649, "production_countries"] = ["LB"]

In [19]:
set_of_countries = set()
df_production["production_countries"].apply(lambda x: set_of_countries.update(x))
len(set_of_countries)

106

In [20]:
df_production["production_countries"][0]

['US']

There are 106 countries in which production took place.

In [21]:


def productions_per_country(series: pd.Series, set_of_countries: Set) -> pd.DataFrame:
    p_per_c = dict(zip(set_of_countries, np.zeros(len(set_of_countries))))
    for list_of_countries in series:
        for country in list_of_countries:
            p_per_c[country] += 1

    return pd.DataFrame(dict(country=p_per_c.keys(), n_productions=p_per_c.values()))

In [22]:
df_productions_per_country = productions_per_country(
    series=df_production["production_countries"],
    set_of_countries=set_of_countries,
)
df_productions_per_country.head()

Unnamed: 0,country,n_productions
0,RU,15.0
1,IT,81.0
2,TR,80.0
3,GR,3.0
4,MU,1.0


In [23]:
fig = px.bar(
    df_productions_per_country.sort_values("n_productions", ascending=False),
    x="country",
    y="n_productions",
    log_y=True,
    hover_name="country",
    hover_data={"country": False, "n_productions": True},
    width=1800,
)

fig.update_layout(
    title="Number of Productions per Country",
    xaxis_title="Country (ISO Code)",
    yaxis_title="Number of Productions; log scale",
)
fig.show()