## Netflix Analysis

In [86]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Apply seaborn's "dark" style to the plots drawn below
sns.set_style("dark")

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

##### Load Netflix Movies and TV Shows Dataset

In [None]:
# Location of the Netflix titles CSV, kept in one named constant
NETFLIX_TITLES_URL = 'https://raw.githubusercontent.com/ahmedmeshref/Netflex-Data-Analysis/main/netflix_titles.csv'

# Read the CSV into the DataFrame used by every cell below
netflix_df = pd.read_csv(NETFLIX_TITLES_URL)

##### Shape of dataset (rows, cols)

In [None]:
# Report the frame's dimensions; `shape` is a (rows, columns) tuple.
# The label previously read "Columsn"; it is spelled correctly here.
print(f'Number of rows: {netflix_df.shape[0]}, Columns: {netflix_df.shape[1]}')

##### Load first 5 rows of the dataset

In [None]:
# Preview the first rows (head() returns 5 rows by default)
netflix_df.head()

### Understand dataset

In [None]:
# Print the column names, non-null counts and dtypes of the frame
netflix_df.info()

##### Check for unique values

In [None]:
# Number of distinct values in each column
netflix_df.nunique()

##### Check NULL values

In [None]:
# Missing-value count per column (isna is the same check as isnull)
netflix_df.isna().sum()

##### Check for duplicated rows 

In [None]:
# True if at least one row is an exact duplicate of an earlier row
netflix_df.duplicated().any()

### Data Cleaning 

##### Drop NULL values

In [None]:
# Keep only the rows that have no missing value in any column.
# The explicit .copy() makes the result an independent frame, which avoids
# pandas' SettingWithCopyWarning when later cells assign new columns to it.
netflix_df = netflix_df.dropna().copy()

In [None]:
# Report the frame's dimensions again; `shape` is a (rows, columns) tuple.
# The label previously read "Columsn"; it is spelled correctly here.
print(f'Number of rows: {netflix_df.shape[0]}, Columns: {netflix_df.shape[1]}')

##### Change Dates format    

In [None]:
# `astype` returns a new Series rather than converting in place, so the result
# must be assigned back; the previous statement discarded it and left the
# column unchanged.
netflix_df['release_year'] = netflix_df['release_year'].astype(int)

# Convert `date_added` to pandas datetimes
netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'])

##### Extract the year_added col from the date_added col 

In [None]:
# Store the year component of each `date_added` value in a new column
date_added = netflix_df['date_added']
netflix_df['year_added'] = date_added.dt.year

In [None]:
# Preview the first rows after cleaning (head() returns 5 rows by default)
netflix_df.head()

### Data Analysis & Visualization

##### Show number of Movies & Shows added to the website from year 2008-2021

In [None]:
# Number of non-null `date_added` entries for each `year_added` value
netflix_df.groupby('year_added')['date_added'].count()

In [None]:
# Bar chart of how many titles fall into each `year_added` value.
plt.figure(figsize=(10,10))
# The column is named by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='year_added', data=netflix_df)
plt.title('Movies & TV Shows added per year')
plt.show()

##### Show number of movies & TV Shows released in each one of the last 5 years

In [None]:
# Number of titles for each `release_year` value
netflix_df.groupby('release_year')['release_year'].count()

In [None]:
# Horizontal count plot restricted (via `order`) to release years 2020 down to 2016.
plt.figure(figsize=(12,6))
sns.countplot(data=netflix_df, y="release_year", order=[2020, 2019, 2018, 2017, 2016], palette="viridis")
# Title spelling corrected ("relased" -> "released")
plt.title('Programs (Movies & TV Shows) released in the last 5 years')
plt.show()

##### Movies VS TV shows trend in the last 5 years at Netflix

In [None]:
# Titles released from 2016 up to (but not including) 2021; a later cell reuses this frame.
# Both conditions are combined into one mask: the previous chained lookup
# `df[mask1][mask2]` applied a mask built on the full frame to an already
# filtered one, which makes pandas reindex the mask and emit a warning.
netflix_df_last_5_years = netflix_df[
    (netflix_df.release_year >= 2016) & (netflix_df.release_year < 2021)
]

# catplot is figure-level: it creates its own figure (sized through `height`),
# so no separate plt.figure() call is made, which would only leave an empty
# figure behind. Count plots draw no error bars, so the `ci` argument is omitted.
g = sns.catplot(
    data=netflix_df_last_5_years, kind="count",
    x="release_year", hue="type",
    palette="dark", alpha=.7, height=8)

ax = plt.gca()

# Write each bar's count just above it
for p in ax.patches:
    bar_height = p.get_height()
    # A year/type combination with no rows can yield a NaN-height patch; int(NaN) would raise
    if not np.isfinite(bar_height):
        continue
    ax.text(p.get_x() + p.get_width()/2., bar_height, '%d' % int(bar_height),
            fontsize=12, color='black', ha='center', va='bottom')

plt.title('Type (Movies | TV Shows) trends in Netflix from 2016-2020',size='15')
plt.show()

##### Number of movies VS TV Shows in 2020

In [None]:
# Count the rows of each `type` among titles whose release_year is 2020
netflix_type = netflix_df.loc[netflix_df['release_year'] == 2020, 'type'].value_counts()
print(netflix_type)

In [None]:
plt.subplots(figsize=[10,8])

# Pie chart of the 2020 type counts.
# Labels come from the index of the counts so every wedge is paired with its
# own category: value_counts() orders by frequency, so a fixed
# ["Movie", "TV Show"] list is wrong whenever that order differs, and has the
# wrong length if only one type is present.
plt.pie(netflix_type.values, colors=['#34495E','#D35400'], labels=netflix_type.index, autopct='%1.1f%%', startangle=100)
# Draw a white circle over the centre to turn the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
plt.axis('equal')
plt.tight_layout()
plt.show()


#### Top 5 countries that produced Movies & TV Shows in the last 5 years

In [None]:
# Countries contribution to programs production from 2016 - 2020 
# Number of titles per `country` value in netflix_df_last_5_years, largest first
top_contributors_last_5_years = (
    netflix_df_last_5_years
    .groupby(['country'])['show_id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
top_contributors_last_5_years

In [None]:
plt.figure(figsize=(10,10))

# Bar chart of the first five rows of the country counts; barplot returns
# the axes it drew on, which is the current axes of the figure created above
ax = sns.barplot(x='country', y='count', data=top_contributors_last_5_years.head(5))

# Label every bar with its count, centred just above the bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_top, str(int(bar_top)),
            fontsize=12, color='black', ha='center', va='bottom')

ax.set_title('Top 5 countries contributing to programs (Movies & TV Shows) production in the last 5 years', size='15')
plt.show()

#### The US is the top program contributor in the last 5 years. Plot its contribution of movies and TV shows in the last 5 years

In [None]:
plt.figure(figsize=(10,10))

# Rows whose `country` value is exactly 'United States'
US_contribution = netflix_df.loc[netflix_df['country'] == 'United States']

# Titles per release year for those rows, limited to release years 2016 through 2020
US_contribution_last_5_years = (
    US_contribution
    .query("release_year >= 2016")
    .query("release_year < 2021")
    .groupby(['country', 'release_year'])['show_id']
    .count()
    .reset_index(name='count')
)

# barplot returns the axes it drew on, which is the current axes of the figure created above
ax = sns.barplot(x='release_year', y='count', data=US_contribution_last_5_years, palette="Blues")

# Label every bar with its count, centred just above the bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_top, str(int(bar_top)),
            fontsize=12, color='black', ha='center', va='bottom')

ax.set_title('Contributing of the United States to Movies & TV Shows production', size='15')
plt.show()

##### Number of shows that each country produced in 2020

In [None]:
# Number of titles per `country` value among rows with release_year 2020, largest first
(
    netflix_df[netflix_df.release_year == 2020]
    .groupby(['country'])['show_id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)