In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from PIL import Image

In [None]:
# Load the Netflix titles CSV into the `netflix` DataFrame used by every later cell.
# NOTE(review): the path is absolute and points at a Kaggle input directory —
# presumably it only resolves inside a Kaggle kernel with the "netflix-shows"
# dataset attached; confirm before running elsewhere.
netflix = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")

# Preview the first rows; left as the last expression so the notebook renders it.
netflix.head()

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
netflix.info()

In [None]:
# Tally the missing entries column by column (`isna` is the same check as `isnull`).
netflix.isna().sum()

We can see that this dataset has a total of 8807 rows and 12 columns. The following columns have null values that need to be cleaned: 
* director - 2,634 null values
* cast - 825 null values
* country - 831 null values 
* date_added - 10 null values
* rating - 4 null values
* duration - 3 null values

In [None]:
# Grand total of missing cells: collapse the per-column tallies into one number.
nulls_per_column = netflix.isna().sum()
nulls_per_column.sum()

Dropping this many rows because of nulls in the director, cast and country columns would lead to a large loss of information, so we shall instead replace the null values in those columns with a "missing" category. For the date added, duration and rating columns, the null counts are so low that we shall simply drop those rows from the dataset.

In [None]:
# Replace nulls with an explicit "unavailable" category in the columns where
# dropping rows would discard too much data, then drop the few rows that lack
# a date added, rating or duration.
#
# The fill goes through the DataFrame itself rather than
# `netflix[col].fillna(..., inplace=True)`: that form is chained in-place
# assignment, which pandas warns about and which leaves the parent frame
# untouched once Copy-on-Write is enabled.
netflix = (
    netflix
    .fillna({
        "director": "Director Unavailable",
        "cast": "Cast Unavailable",
        "country": "Country Unavailable",
    })
    .dropna(subset=["date_added", "rating", "duration"])
    .copy()  # independent frame, so later column assignments target it directly
)

In [None]:
# Re-count nulls per column after cleaning; every entry is expected to be 0.
netflix.isna().sum()

All the columns now report that there are no null values! We are now ready to use this dataset to conduct our exploratory data analysis.

**Analyzing Netflix content by type**

In [None]:
# Count titles per content type once and reuse the result for both the wedge
# sizes and their labels.
type_counts = netflix['type'].value_counts()
colors = sns.color_palette('pastel')

plt.title('Proportion Of Netflix Content By Type')
plt.pie(
    type_counts,
    labels=type_counts.index,
    colors=colors,
    autopct='%.0f%%',  # whole-number percentage printed on each wedge
)
plt.show()

We see that 70% of the content available on Netflix is movies, and only 30% is TV shows. Let's investigate how this proportion has changed over the years.

**Analyzing trends in Netflix content over the years**

In [None]:
# Derive the year each title was added from its `date_added` value.
# The integer cast is applied to the stored column; previously it was computed
# on a separate line and the result thrown away.
# NOTE(review): assumes no null `date_added` values remain at this point (the
# cleaning cell drops them) — a NaT here would make the cast fail.
netflix['year_added'] = pd.DatetimeIndex(netflix['date_added']).year.astype(int)


def _yearly_counts(frame):
    """Return a frame with columns `year` and `count`: titles added per year.

    The columns are named explicitly instead of renaming whatever
    `value_counts().to_frame().reset_index()` produces, because those default
    names changed in pandas 2.0 (the old `index` / `year_added` rename then
    yields two `count` columns and no `year`).
    """
    return (
        frame['year_added']
        .value_counts()
        .rename_axis("year")
        .reset_index(name="count")
    )


# Per-year counts for all titles, movies only and TV shows only.
netflix_total_df = _yearly_counts(netflix)
netflix_movies_df = _yearly_counts(netflix[netflix['type'] == "Movie"])
netflix_tv_df = _yearly_counts(netflix[netflix['type'] == "TV Show"])

# Set the style before creating the figure: it only affects axes created
# afterwards, so calling it after plt.subplots() left this chart unstyled.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(13, 7))

# Label each line explicitly so the legend cannot drift onto other artists
# (e.g. error-band collections seaborn may add) the way positional labels can.
sns.lineplot(data=netflix_total_df, x="year", y="count", color="black", label="Total", ax=ax)
sns.lineplot(data=netflix_movies_df, x="year", y="count", color="red", label="Movies", ax=ax)
sns.lineplot(data=netflix_tv_df, x="year", y="count", color="blue", label="TV Shows", ax=ax)

ax.set_title("Frequency of content added by Netflix (2008 - 2021)")
ax.set_xlabel("Year")
ax.set_ylabel("Number Added")
ax.set_xticks(np.arange(2008, 2022, 1))
ax.legend()
ax.grid(True)
plt.show()

Based on the plot above, we can observe that the number of movies and TV shows added by Netflix has increased drastically since 2015. More movies than TV shows are added every year. However, we observe that there has been a significant reduction in the number of movies and TV shows added since 2019. A likely reason for this is the impact of the COVID-19 pandemic, which resulted in many production houses and studios halting filming, leading to fewer new releases. This decrease appears to be slightly less severe for TV shows than for movies. It's possible that more people were interested in watching TV shows rather than movies while stuck at home and that Netflix added content accordingly.

**Exploring Top Netflix genres**

In [None]:
# Build `temp`: one row per (title, genre) pair, indexed by title, with the
# columns `genre`, `year_added` and `description`.
#
# `listed_in` holds a ", "-separated genre list; splitting and exploding it
# keeps each title's year and description on the same row. The previous
# approach merged three title-indexed objects, which multiplies rows whenever
# two entries share a title because the join key is then not unique.
temp = (
    netflix
    .assign(genre=netflix['listed_in'].str.split(', '))
    .explode('genre')
    .dropna(subset=['genre'])  # match stack(), which skipped missing genres
    .set_index('title')
    .loc[:, ['genre', 'year_added', 'description']]
)

# The 20 most frequent genres, most common first; also fixes the bar order.
top_genres = temp['genre'].value_counts().iloc[:20].index

fig, ax = plt.subplots(figsize=(10, 10))
# Plot from a copy with a fresh integer index. `temp` repeats each title once
# per genre, and recent seaborn versions can raise "cannot reindex on an axis
# with duplicate labels" when `order` filters categories on such an index.
sns.countplot(y='genre', data=temp.reset_index(drop=True), order=top_genres, ax=ax)
ax.set_title('Top 20 Genres added by Netflix (2008 - 2021)')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Genre')
ax.grid(True)
plt.show()

We see that the top 5 genres added by Netflix are International Movies, Dramas, Comedies, International TV Shows and Documentaries. Let's explore how these genres changed over the years.

In [None]:
def _genre_yearly_counts(genre):
    """Return a frame with columns `year` and `count` for one genre in `temp`.

    The columns are named explicitly rather than renamed from the defaults of
    `value_counts().to_frame().reset_index()`, since those defaults changed in
    pandas 2.0 and the old rename no longer produces a `year` column.
    """
    return (
        temp.loc[temp['genre'] == genre, 'year_added']
        .value_counts()
        .rename_axis("year")
        .reset_index(name="count")
    )


# Per-year counts for each of the top 5 genres, used by the line plot below.
international_movies_df = _genre_yearly_counts('International Movies')
dramas_df = _genre_yearly_counts('Dramas')
comedies_df = _genre_yearly_counts('Comedies')
international_tv_df = _genre_yearly_counts('International TV Shows')
documentaries_df = _genre_yearly_counts('Documentaries')

In [None]:
# Legend label -> per-year counts, in the order the lines are drawn.
genre_trends = {
    'International Movies': international_movies_df,
    'Dramas': dramas_df,
    'Comedies': comedies_df,
    'International TV Shows': international_tv_df,
    'Documentaries': documentaries_df,
}

fig, ax = plt.subplots(figsize=(13, 7))
for genre_label, yearly_df in genre_trends.items():
    # Attach the label to the line itself; a positional list passed to
    # plt.legend() is matched purely by artist order and can land on the
    # wrong element if seaborn adds extra artists such as error bands.
    sns.lineplot(data=yearly_df, x="year", y="count", label=genre_label, ax=ax)

ax.set_title("Frequency of Top 5 genres added by Netflix over the years")
ax.set_xlabel("Year")
ax.set_ylabel("Number Added")
ax.set_xticks(np.arange(2008, 2022, 1))
ax.legend()
ax.grid(True)
plt.show()

We see that there has been a decline in the number of documentaries added by Netflix since 2017 despite it being a top genre. The number of comedies added by Netflix surpassed that of International TV Shows and Documentaries in 2017 and appears to be the only genre that did not experience a significant decline during the pandemic. Lastly, although International Movies is the most frequent genre added by Netflix, its frequency has been on a steady decline since 2018 - well before the pandemic hit.

**Creating Wordclouds for each Top Netflix genre**

Let's explore the most frequent words used in the description for each Netflix genre to see if there are any similarities. 

In [None]:
# Gather every description tagged 'International Movies' into one text blob.
internation_movies_desc_df = temp.loc[temp['genre'] == 'International Movies']
international_movies_text = " ".join(internation_movies_desc_df['description'])

# Build the cloud, filtering out wordcloud's built-in stop words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
).generate(international_movies_text)

# Show the rendered cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud Using Netflix International Movie Descriptions")
plt.show()

In [None]:
# Gather every description tagged 'Dramas' into one text blob.
drama_desc_df = temp.loc[temp['genre'] == 'Dramas']
drama_text = " ".join(drama_desc_df['description'])

# Build the cloud, filtering out wordcloud's built-in stop words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
).generate(drama_text)

# Show the rendered cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud Using Netflix Dramas Descriptions")
plt.show()

In [None]:
# Gather every description tagged 'Comedies' into one text blob.
comedy_desc_df = temp.loc[temp['genre'] == 'Comedies']
comedy_text = " ".join(comedy_desc_df['description'])

# Build the cloud, filtering out wordcloud's built-in stop words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
).generate(comedy_text)

# Show the rendered cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud Using Netflix Comedies Descriptions")
plt.show()

In [None]:
# Gather every description tagged 'International TV Shows' into one text blob.
international_tv_desc_df = temp.loc[temp['genre'] == 'International TV Shows']
international_tv_text = " ".join(international_tv_desc_df['description'])

# Build the cloud, filtering out wordcloud's built-in stop words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
).generate(international_tv_text)

# Show the rendered cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud Using Netflix International TV Shows Descriptions")
plt.show()

In [None]:
# Gather every description tagged 'Documentaries' into one text blob.
documentaries_desc_df = temp.loc[temp['genre'] == 'Documentaries']
documentaries_text = " ".join(documentaries_desc_df['description'])

# Build the cloud, filtering out wordcloud's built-in stop words.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
).generate(documentaries_text)

# Show the rendered cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud Using Netflix Documentaries Descriptions")
plt.show()

We see that there are many overlapping words among the top 5 Netflix genres, which is not surprising since each title can belong to multiple genres. Common themes appear to be love, life and family.

**Exploring countries where Netflix titles are added from**

In [None]:
# One entry per (title, country) pair, indexed by title.
# Splitting on a bare comma and stripping whitespace tolerates irregular
# separators (missing space, trailing comma) that a literal ', ' split would
# leave glued to a country name and count as a separate country.
countries_df = (
    netflix
    .set_index('title')['country']
    .str.split(',')
    .explode()
    .str.strip()
    .dropna()
)
# Drop empty fragments and the placeholder used for unknown countries.
countries_df = countries_df[~countries_df.isin(['', 'Country Unavailable'])]

# The 20 most frequent countries, most common first.
top_countries = countries_df.value_counts().index[:20]

fig, ax = plt.subplots(figsize=(10, 10))
# Plot from a copy with a fresh integer index: titles repeat once per country,
# and recent seaborn versions can raise "cannot reindex on an axis with
# duplicate labels" when `order` filters categories on such an index.
g = sns.countplot(y=countries_df.reset_index(drop=True), order=top_countries, ax=ax)
ax.set_title('Top 20 Countries Where Netflix Titles Are Added From (2008 - 2021)')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Country')
ax.grid(True)
plt.show()

We see that an overwhelming majority of the Netflix titles are from the United States, followed by India, United Kingdom, Canada and France.