In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from textblob import TextBlob

# Load the Netflix titles dataset from a CSV file located in the working directory.
df = pd.read_csv('netflix_titles.csv')

# ***Exploratory Data Analysis***

***Checking the shape of the data and its contents in the dataset***

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

(8807, 12)

In [None]:
# Preview the first rows to see what each column holds
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Column labels available for the analysis below
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

# ***Visualization***

In [None]:
# Count titles per content rating, then plot each rating's share of the catalogue.
x = df.groupby('rating').size().rename('counts').reset_index()
pieChart = px.pie(
    data_frame=x,
    names='rating',
    values='counts',
    title='Distribution of content ratings on Netflix',
)
pieChart.show()

In [None]:
# Count titles per content type and plot the split as a pie chart.
y = df.groupby('type').size().rename('counts').reset_index()
pieChart = px.pie(
    data_frame=y,
    names='type',
    values='counts',
    title='Distribution of content type on Netflix',
)
pieChart.show()

***Insights:*** TV-MA is the most common content rating on Netflix, and most of the content on the platform is in the form of movies.

# ***Analysing the most featured directors and actors***

In [None]:
# Titles without a listed director get a placeholder label so they can be
# grouped (and later excluded) like any other value.
df['director'] = df['director'].where(df['director'].notna(), 'Director not specified')
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Director not specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Director not specified,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Director not specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Number of titles attributed to each distinct value of the `director` column.
directors = df.groupby('director').size().rename('counts').reset_index()
directors

Unnamed: 0,director,counts
0,A. L. Vijay,2
1,A. Raajdheep,1
2,A. Salaam,1
3,A.R. Murugadoss,2
4,Aadish Keluskar,1
...,...,...
4524,Çagan Irmak,1
4525,Ísold Uggadóttir,1
4526,Óskar Thór Axelsson,1
4527,Ömer Faruk Sorak,2


In [None]:
# Drop the placeholder row that stands in for missing directors, then rank
# the remaining directors from most to fewest titles.
has_director = directors['director'].ne('Director not specified')
directors = directors.loc[has_director].sort_values(by='counts', ascending=False)
directors

Unnamed: 0,director,counts
3393,Rajiv Chilaka,19
3444,"Raúl Campos, Jan Suter",18
4047,Suhas Kadav,16
2599,Marcus Raboy,16
1791,Jay Karas,14
...,...,...
2050,Jos Humphrey,1
2051,Jose Gomez,1
2052,Jose Javier Reyes,1
2054,"Joseduardo Giordano, Sergio Goyri Jr.",1


In [None]:
# `directors` is already sorted by descending count, so the first five rows are the top five.
top5Directors = directors.head(n=5)

In [None]:
# Re-sort ascending by count before plotting the top five as a bar chart.
top5Directors = top5Directors.sort_values(by='counts', ascending=True)
barChart = px.bar(
    data_frame=top5Directors,
    x='counts',
    y='director',
    title='Top 5 Directors on Netflix',
)
barChart.show()

In [None]:
# Same ranking as for directors, but for individual cast members.
# `cast` holds one comma-separated string per title, so it is split into one
# row per actor before counting.
df['cast'] = df['cast'].fillna('No cast specified')
cast_df = (
    df['cast']
    .str.split(',', expand=True)
    .stack()
    # Every name after the first keeps the space that follows its comma
    # ("A, B" -> "A", " B"); strip it so the same actor is not counted under
    # two different keys.
    .str.strip()
    .to_frame(name='Actor')
)

# Number of titles each actor appears in, without the placeholder for missing casts.
actors = cast_df.groupby(['Actor']).size().reset_index(name='Total Count')
actors = actors[actors.Actor != 'No cast specified']
actors = actors.sort_values(by=['Total Count'], ascending=False)

# Keep the five most frequent actors and re-sort ascending for the bar chart.
top5Actors = actors.head()
top5Actors = top5Actors.sort_values(by=['Total Count'])
barChart2 = px.bar(top5Actors, x='Total Count', y='Actor', title='Top 5 Actors on Netflix')
barChart2.show()

***Insights:*** Most of the top creators are from India, so Netflix should try to increase its reach in the country.

# ***Analysing the content produced on Netflix by release year***

In [None]:
# Keep only the content type and release year, with display-friendly column names.
df1 = df.loc[:, ['type', 'release_year']].rename(
    columns={'type': 'Type', 'release_year': 'Release Year'}
)
# Number of titles for every (release year, type) combination.
df2 = df1.groupby(['Release Year', 'Type']).size().rename('Total Count').reset_index()
df2

Unnamed: 0,Release Year,Type,Total Count
0,1925,TV Show,1
1,1942,Movie,2
2,1943,Movie,3
3,1944,Movie,3
4,1945,Movie,3
...,...,...,...
114,2019,TV Show,397
115,2020,Movie,517
116,2020,TV Show,436
117,2021,Movie,277


In [None]:
# Restrict the trend to titles released from 2000 onwards and draw one line
# per content type.
df2 = df2[df2['Release Year'] >= 2000]
graph = px.line(
    df2,
    x="Release Year",
    y="Total Count",
    color="Type",
    # Title previously misspelled the brand name as "Netfilx".
    title="Trend of Content Produced on Netflix Every Year",
)
graph.show()

***Insights:*** The company was highly profitable around the 2016–2018 era, but since then the amount of content has decreased, so the company should focus on both quality and quantity to increase user retention.

# ***Sentiment Analysis of the content***

In [None]:
def _sentiment_label(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from the sign of the TextBlob polarity score of
    ``text``: above zero is 'Positive', below zero is 'Negative', and exactly
    zero is 'Neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Label every title description with a sentiment.
# The previous row-by-row loop wrote to `.loc[[index, 2], ...]`, which also
# overwrote row 2 on every iteration and left it with the last row's label;
# mapping over the column labels each row exactly once.
df3 = df[['release_year', 'description']].rename(
    columns={'release_year': 'Release Year', 'description': 'Description'}
)
df3['Sentiment'] = df3['Description'].map(_sentiment_label)

# Number of titles per (release year, sentiment) combination.
df3 = df3.groupby(['Release Year', 'Sentiment']).size().reset_index(name='Total Count')

# Only plot titles released after 2005.
df3 = df3[df3['Release Year'] > 2005]
barGraph = px.bar(df3, x="Release Year", y="Total Count", color="Sentiment", title="Sentiment Analysis of Content on Netflix")
barGraph.show()

***Insights:*** The count of feedback is decreasing, indicating poor user retention; however, the margin between positive and negative feedback is increasing, so the company should focus on quality content to attract a mass audience and maintain a sustainable quantity of shows/movies to retain them as permanent consumers. (Note: the sentiment above is computed from each title's description text.)



---

# ***Thank you! 😊***