**# Exploratory Data Analysis (EDA) on Netflix Movies and TV Shows Dataset**


In [None]:
import numpy as np                 # Linear algebra operations
import pandas as pd                # Data handling and preparation
import plotly.express as px        # Interactive visualizations
from textblob import TextBlob      # Sentiment analysis

df = pd.read_csv('netflix_titles.csv')

In [None]:
print("Dataset Shape:", df.shape)

Dataset Shape: (8807, 12)


In [None]:
print("\nSample Data:\n", df.head())


Sample Data:
   show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 2

In [None]:
print("\nColumn Names:\n", df.columns)


Column Names:
 Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


# Distribution of Content Ratings

In [None]:
x = df.groupby(['rating']).size().reset_index(name='counts')
print("\nContent Ratings Count:\n", x)


Content Ratings Count:
       rating  counts
0     66 min       1
1     74 min       1
2     84 min       1
3          G      41
4      NC-17       3
5         NR      80
6         PG     287
7      PG-13     490
8          R     799
9      TV-14    2160
10      TV-G     220
11     TV-MA    3207
12     TV-PG     863
13      TV-Y     307
14     TV-Y7     334
15  TV-Y7-FV       6
16        UR       3


In [None]:
pieChart = px.pie(x, values='counts', names='rating', title='Distribution of Content Ratings on Netflix')
pieChart.show()

# Analysing the Top 5 Directors on Netflix

In [15]:
df['director'] = df['director'].fillna('Director not specified')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Director not specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Director not specified,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Director not specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [17]:
directors_list = pd.DataFrame()
print(directors_list)

Empty DataFrame
Columns: []
Index: []


In [19]:
directors_list = df['director'].str.split(',', expand=True).stack()
print(directors_list)

0     0           Kirsten Johnson
1     0    Director not specified
2     0           Julien Leclercq
3     0    Director not specified
4     0    Director not specified
                    ...          
8802  0             David Fincher
8803  0    Director not specified
8804  0           Ruben Fleischer
8805  0              Peter Hewitt
8806  0               Mozez Singh
Length: 9612, dtype: object


In [25]:
directors_list = df['director'].to_frame()
print(directors_list)

                    director
0            Kirsten Johnson
1     Director not specified
2            Julien Leclercq
3     Director not specified
4     Director not specified
...                      ...
8802           David Fincher
8803  Director not specified
8804         Ruben Fleischer
8805            Peter Hewitt
8806             Mozez Singh

[8807 rows x 1 columns]


In [30]:
directors_list.columns = ['Director']
print(directors_list)

                    Director
0            Kirsten Johnson
1     Director not specified
2            Julien Leclercq
3     Director not specified
4     Director not specified
...                      ...
8802           David Fincher
8803  Director not specified
8804         Ruben Fleischer
8805            Peter Hewitt
8806             Mozez Singh

[8807 rows x 1 columns]


In [32]:
directors = directors_list.groupby(['Director']).size().reset_index(name='Total Count')
print(directors)

                 Director  Total Count
0             A. L. Vijay            2
1            A. Raajdheep            1
2               A. Salaam            1
3         A.R. Murugadoss            2
4         Aadish Keluskar            1
...                   ...          ...
4524          Çagan Irmak            1
4525     Ísold Uggadóttir            1
4526  Óskar Thór Axelsson            1
4527     Ömer Faruk Sorak            2
4528         Şenol Sönmez            2

[4529 rows x 2 columns]


In [34]:
directors = directors[directors.Director != 'Director not specified']

In [36]:
print(directors)

                 Director  Total Count
0             A. L. Vijay            2
1            A. Raajdheep            1
2               A. Salaam            1
3         A.R. Murugadoss            2
4         Aadish Keluskar            1
...                   ...          ...
4524          Çagan Irmak            1
4525     Ísold Uggadóttir            1
4526  Óskar Thór Axelsson            1
4527     Ömer Faruk Sorak            2
4528         Şenol Sönmez            2

[4528 rows x 2 columns]


In [38]:
directors = directors.sort_values(by=['Total Count'], ascending=False)
print(directors)

                       Director  Total Count
3393              Rajiv Chilaka           19
3444     Raúl Campos, Jan Suter           18
4047                Suhas Kadav           16
2599               Marcus Raboy           16
1791                  Jay Karas           14
...                         ...          ...
2067                Josh Webber            1
2066                Josh Wakely            1
2065  Josh Safdie, Benny Safdie            1
637                       Bumpy            1
2078      José Miguel Contreras            1

[4528 rows x 2 columns]


In [40]:
top5Directors = directors.head()
print("\nTop 5 Directors:\n", top5Directors)


Top 5 Directors:
                     Director  Total Count
3393           Rajiv Chilaka           19
3444  Raúl Campos, Jan Suter           18
4047             Suhas Kadav           16
2599            Marcus Raboy           16
1791               Jay Karas           14


In [42]:
barChart = px.bar(top5Directors.sort_values(by=['Total Count']),
                  x='Total Count', y='Director',
                  title='Top 5 Directors on Netflix')
barChart.show()

# Analysing the Top 5 Actors on Netflix

In [45]:
df['cast'] = df['cast'].fillna('No cast specified')
cast_df = pd.DataFrame()
cast_df = df['cast'].str.split(',', expand=True).stack().to_frame()
cast_df.columns = ['Actor']
actors = cast_df.groupby(['Actor']).size().reset_index(name='Total Count')
actors = actors[actors.Actor != 'No cast specified']
actors = actors.sort_values(by=['Total Count'], ascending=False)

In [47]:
top5Actors = actors.head()
print("\nTop 5 Actors:\n", top5Actors)

barChart2 = px.bar(top5Actors.sort_values(by=['Total Count']),
                   x='Total Count', y='Actor',
                   title='Top 5 Actors on Netflix')
barChart2.show()


Top 5 Actors:
                    Actor  Total Count
2612         Anupam Kher           39
26941       Rupa Bhimani           31
30303   Takahiro Sakurai           30
15541      Julie Tejwani           28
23624            Om Puri           27


# Analysing the Content Produced Each Year On Netflix

In [49]:
df1 = df[['type', 'release_year']]
df1 = df1.rename(columns={"release_year": "Release Year", "type": "Type"})
df2 = df1.groupby(['Release Year', 'Type']).size().reset_index(name='Total Count')

In [53]:
print("\nContent Production by Year:\n", df2)


Content Production by Year:
      Release Year     Type  Total Count
75           2000    Movie           33
76           2000  TV Show            4
77           2001    Movie           40
78           2001  TV Show            5
79           2002    Movie           44
80           2002  TV Show            7
81           2003    Movie           51
82           2003  TV Show           10
83           2004    Movie           55
84           2004  TV Show            9
85           2005    Movie           67
86           2005  TV Show           13
87           2006    Movie           82
88           2006  TV Show           14
89           2007    Movie           74
90           2007  TV Show           14
91           2008    Movie          113
92           2008  TV Show           23
93           2009    Movie          118
94           2009  TV Show           34
95           2010    Movie          154
96           2010  TV Show           40
97           2011    Movie          145
98        

In [54]:
df2 = df2[df2['Release Year'] >= 2000]
graph = px.line(df2, x="Release Year", y="Total Count", color="Type",
                title="Trend of Content Produced on Netflix Every Year")
graph.show()

# Analysing Top 10 Genres on Netflix

In [56]:
df['listed_in'] = df['listed_in'].fillna('No Genre Specified')
genres = df['listed_in'].str.split(',', expand=True).stack().to_frame()
genres.columns = ['Genre']

In [62]:
genres = df['listed_in'].dropna().str.split(',').explode().str.strip()
genres = genres[genres != 'No Genre Specified']

genres = genres.value_counts().reset_index()
genres.columns = ['Genre', 'Total Count']
genres = genres.head(10)

print("\nTop 10 Genres:\n", genres)



Top 10 Genres:
                       Genre  Total Count
0      International Movies         2752
1                    Dramas         2427
2                  Comedies         1674
3    International TV Shows         1351
4             Documentaries          869
5        Action & Adventure          859
6                 TV Dramas          763
7        Independent Movies          756
8  Children & Family Movies          641
9           Romantic Movies          616


In [64]:
fig_genre = px.bar(genres, x='Total Count', y='Genre', title='Top 10 Genres on Netflix')
fig_genre.show()

 # Analysing Top 10 Countries with Most Netflix Titles

In [66]:
df['country'] = df['country'].fillna('Country not specified')
countries = df['country'].str.split(',', expand=True).stack().to_frame()
countries.columns = ['Country']

countries = countries.groupby('Country').size().reset_index(name='Total Count')
countries = countries[countries['Country'] != 'Country not specified']
countries = countries.sort_values(by='Total Count', ascending=False).head(10)

print("\nTop 10 Countries with Most Netflix Titles:\n", countries)


Top 10 Countries with Most Netflix Titles:
             Country  Total Count
192   United States         3211
142           India         1008
191  United Kingdom          628
106   United States          479
122          Canada          271
149           Japan          259
133          France          212
180     South Korea          211
34           France          181
182           Spain          181


In [67]:
fig_country = px.bar(countries, x='Total Count', y='Country',
                     title='Top 10 Countries with Most Netflix Titles')
fig_country.show()

# Sentiment Analysis on Netflix Descriptions

In [69]:
df3 = df[['release_year', 'description']].rename(columns={'release_year': 'Release Year', 'description': 'Description'})

In [82]:
from textblob import TextBlob

# Apply on the full dataset before grouping
sentiments = []
for desc in df['description']:
    testimonial = TextBlob(str(desc))
    polarity = testimonial.sentiment.polarity
    if polarity == 0:
        sentiments.append('Neutral')
    elif polarity > 0:
        sentiments.append('Positive')
    else:
        sentiments.append('Negative')

df['Sentiment'] = sentiments

# Now group and filter
df3 = df.groupby(['release_year', 'Sentiment']).size().reset_index(name='Total Count')
df3 = df3[df3['release_year'] > 2005]

In [85]:
barGraph = px.bar(
    df3,
    x="release_year",
    y="Total_Count",
    color="Sentiment",
    title="Sentiment Analysis of Netflix Content (Post-2005)"
)
barGraph.show()


In [86]:
print("\n✅ Netflix EDA Project Completed Successfully!")


✅ Netflix EDA Project Completed Successfully!
