In [30]:
import pandas as pd
df = pd.read_csv('netflix_titles.csv')

In [31]:
print(df.shape)
# Expected Output Example: (8807, 12)

(8807, 12)


In [32]:
print(df.head())
print(df.tail())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [33]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


In [34]:
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [35]:
# Filling numerical NaNs in director, cast, country, rating, date_added
for col in ['director', 'cast', 'country']:
    df[col] = df[col].fillna('Unknown')

# date_added and rating should also be filled for later use
df['date_added'] = df['date_added'].fillna('Unknown Date')
df['rating'] = df['rating'].fillna('Unrated')

# 'duration' NaNs are handled in the Duration & Seasons step based on 'type'.

In [36]:
print(df.dtypes)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [37]:
# Convert 'date_added' to datetime objects, coercing 'Unknown Date' to NaT
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [38]:
content_counts = df['type'].value_counts()
print(content_counts)

type
Movie      6131
TV Show    2676
Name: count, dtype: int64


In [39]:
top_ratings = df['rating'].value_counts().head(5)
print(top_ratings)

rating
TV-MA    3207
TV-14    2160
TV-PG     863
R         799
PG-13     490
Name: count, dtype: int64


In [44]:
avg_movie_length = df['duration_minutes'].mean()
print(f"Average movie length: {avg_movie_length:.2f} minutes")

Average movie length: 99.58 minutes


In [40]:
most_frequent_year = df['release_year'].mode()[0]
print(f"The most frequently released year is: {most_frequent_year}")

The most frequently released year is: 2018


In [45]:
avg_seasons = df['number_of_seasons'].mean()
print(f"Average number of seasons per TV show: {avg_seasons:.2f}")

Average number of seasons per TV show: 1.76


In [46]:
genre_df = df.assign(genre=df['listed_in'].str.split(', ')).explode('genre')

In [47]:
genre_avg_year = genre_df.groupby('genre')['release_year'].mean().sort_values(ascending=False)
highest_avg_genre = genre_avg_year.index[0]

print("Top 5 Genres by Average Release Year:")
print(genre_avg_year.head(5))

Top 5 Genres by Average Release Year:
genre
TV Mysteries                    2018.346939
TV Horror                       2018.200000
Reality TV                      2017.894118
Stand-Up Comedy & Talk Shows    2017.857143
TV Thrillers                    2017.736842
Name: release_year, dtype: float64


In [49]:
df['year_added'] = df['date_added'].dt.year

In [50]:
yearly_addition = df['year_added'].value_counts().sort_index()
print("Content added per year (last 5 years):")
print(yearly_addition.tail()) # Look for peaks (spikes) and valleys (drops)

Content added per year (last 5 years):
year_added
2017.0    1164
2018.0    1625
2019.0    1999
2020.0    1878
2021.0    1498
Name: count, dtype: int64


In [51]:
df['acquisition_lag'] = df['year_added'] - df['release_year']

# Calculate average lag, filtering out entries where acquisition year is before release year (errors/future dates).
avg_lag = df[df['acquisition_lag'] >= 0]['acquisition_lag'].mean()

print(f"Netflix typically acquires content (on average) {avg_lag:.2f} years after its release.")

Netflix typically acquires content (on average) 4.70 years after its release.


In [52]:
rating_type_crosstab = pd.crosstab(df['type'], df['rating'], normalize='index')
print(rating_type_crosstab)

rating     66 min    74 min    84 min         G     NC-17        NR        PG  \
type                                                                            
Movie    0.000163  0.000163  0.000163  0.006687  0.000489  0.012233  0.046811   
TV Show  0.000000  0.000000  0.000000  0.000000  0.000000  0.001868  0.000000   

rating      PG-13         R     TV-14      TV-G     TV-MA     TV-PG      TV-Y  \
type                                                                            
Movie    0.079922  0.129995  0.232752  0.020551  0.336324  0.088077  0.021367   
TV Show  0.000000  0.000747  0.273916  0.035127  0.427877  0.120703  0.065770   

rating      TV-Y7  TV-Y7-FV        UR   Unrated  
type                                             
Movie    0.022672  0.000816  0.000489  0.000326  
TV Show  0.072870  0.000374  0.000000  0.000747  


In [53]:
r_titles_post_2020 = df[
    (df['rating'] == 'R') &
    (df['year_added'] > 2020)
]

count = r_titles_post_2020.shape[0]
print(f"Number of 'R' rated titles added after 2020: {count}")
# print(r_titles_post_2020[['title', 'release_year', 'year_added']])

Number of 'R' rated titles added after 2020: 190


In [54]:
# Clean the country column to use only the first entry
df['main_country'] = df['country'].apply(lambda x: x.split(',')[0].strip() if x != 'Unknown' else 'Unknown')

country_avg_release = df[df['main_country'] != 'Unknown'].groupby('main_country')['release_year'].mean().sort_values(ascending=True)
print("Top 10 Countries by Average Release Year (Oldest Content):")
print(country_avg_release.head(10))

Top 10 Countries by Average Release Year (Oldest Content):
main_country
West Germany     1977.000000
Soviet Union     1980.000000
Hong Kong        2001.658228
Poland           2006.700000
Egypt            2008.973214
Lebanon          2010.250000
Iran             2011.000000
India            2011.954365
Venezuela        2012.000000
United States    2013.080037
Name: release_year, dtype: float64


In [55]:
yearly_proportion = df.groupby('year_added')['type'].value_counts(normalize=True).unstack(fill_value=0)
print("\nProportion of Movies vs. TV Shows added each year:")
print(yearly_proportion.tail())


Proportion of Movies vs. TV Shows added each year:
type           Movie   TV Show
year_added                    
2017.0      0.720790  0.279210
2018.0      0.761231  0.238769
2019.0      0.712356  0.287644
2020.0      0.683706  0.316294
2021.0      0.662884  0.337116


In [56]:
def get_director_titles(dataframe, director_name):
    """Returns a director's titles, sorted by release year."""
    # Note: Using str.contains to handle potential multiple directors/typos
    director_content = dataframe[dataframe['director'].str.contains(director_name, na=False)]
    sorted_titles = director_content[['title', 'release_year', 'type']].sort_values(by='release_year', ascending=False)
    return sorted_titles

# Example: print(get_director_titles(df, 'Kirsten Johnson'))

In [57]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_top_n_categories(dataframe, column_name, n=10, title_suffix=''):
    """Plots the top N value counts for a specified categorical column."""
    # Handles columns that contain list-like strings (like 'cast', 'listed_in')
    if column_name in ['listed_in', 'country']:
        temp_df = dataframe.assign(value=dataframe[column_name].str.split(', ')).explode('value')
        top_n = temp_df['value'].value_counts().head(n)
    else:
        top_n = dataframe[column_name].value_counts().head(n)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_n.index, y=top_n.values)
    plt.title(f'Top {n} {column_name.title()} {title_suffix}')
    plt.xlabel(column_name.title())
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Example: plot_top_n_categories(df, 'director', n=10)