In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import re


In [3]:
# Load the raw Netflix titles table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")


In [5]:
# Print the (rows, columns) dimensions, then display the first five records.
print(df.shape)
df.head(n=5)

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [7]:
# Keep only the columns used by the later cells and drop rows missing any of them.
cols_to_keep = ['type', 'release_year', 'duration', 'listed_in']
df = df[cols_to_keep].dropna()


# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df.info()
print(df['type'].value_counts())
# Fixed seed so the previewed rows are the same on every run.
print(df['listed_in'].sample(5, random_state=42))

<class 'pandas.core.frame.DataFrame'>
Index: 8804 entries, 0 to 8806
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8804 non-null   object
 1   release_year  8804 non-null   int64 
 2   duration      8804 non-null   object
 3   listed_in     8804 non-null   object
dtypes: int64(1), object(3)
memory usage: 343.9+ KB
None
type
Movie      6128
TV Show    2676
Name: count, dtype: int64
2239             Dramas, International Movies, Thrillers
8175                                          Docuseries
5378                                     Stand-Up Comedy
3407    Action & Adventure, Dramas, International Movies
4454              Comedies, Dramas, International Movies
Name: listed_in, dtype: object


In [9]:
def parse_duration(val, minutes_per_season=60):
    """Convert a ``duration`` string into an approximate number of minutes.

    Parameters
    ----------
    val : str
        A value such as ``"90 min"``, ``"1 Season"`` or ``"2 Seasons"``.
        Surrounding whitespace is ignored.
    minutes_per_season : int, default 60
        Rough number of minutes assumed for one season. This is only an
        estimate used to put seasons on the same axis as movie runtimes.

    Returns
    -------
    int or float
        The runtime in minutes for ``"<n> min"`` values, ``n *
        minutes_per_season`` for ``"<n> Season(s)"`` values, and ``np.nan``
        for anything else (non-string input, missing values, or text that
        does not match either pattern).
    """
    # Missing values arrive as float NaN / None; treat them as unparseable
    # instead of failing on the substring test.
    if not isinstance(val, str):
        return np.nan

    # Require the whole string to be "<digits> <unit>" so malformed text
    # yields NaN rather than an int() conversion error.
    match = re.fullmatch(r'\s*(\d+)\s*(min|Seasons?)\s*', val)
    if match is None:
        return np.nan

    amount = int(match.group(1))
    if match.group(2) == 'min':
        return amount
    return amount * minutes_per_season  # estimate minutes_per_season mins per season

# Add the parsed duration as a new column, then discard the rows whose
# duration could not be parsed (parse_duration returned NaN for them).
df = (
    df
    .assign(duration_mins=df['duration'].map(parse_duration))
    .dropna(subset=['duration_mins'])
)


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

def _split_genres(listing):
    """Split a comma-separated ``listed_in`` string into trimmed genre names."""
    return [part.strip() for part in listing.split(',')]


# One list of genre names per title.
df['genre_list'] = df['listed_in'].map(_split_genres)

# Multi-hot encode the genre lists: one 0/1 column per distinct genre.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre_list'])

# Re-attach the source index so the indicator columns line up with `df`.
genre_df = pd.DataFrame(data=genre_encoded, index=df.index, columns=mlb.classes_)

# Place the base features next to the genre indicators, column-wise.
base_features = df[['type', 'release_year', 'duration_mins']]
df_final = pd.concat([base_features, genre_df], axis=1)


In [None]:
# Encode the title type as a 0/1 flag. The labels are read from `df`, which
# still holds the original strings, instead of from `df_final` itself: mapping
# `df_final['type']` in place would turn the already-encoded 0/1 values into
# NaN if this cell were executed a second time.
type_codes = {'Movie': 0, 'TV Show': 1}
df_final['type'] = df['type'].map(type_codes)

# Series.map yields NaN for labels missing from the dict; fail loudly here
# rather than passing NaNs on to later steps.
assert df_final['type'].notna().all(), "unexpected label in 'type' column"

# Standardise the numeric features. Fitting on the raw values in `df` keeps
# the cell idempotent and keeps the scaler's learned mean/scale tied to the
# original units even when the cell is re-run.
numeric_cols = ['release_year', 'duration_mins']
scaler = StandardScaler()
df_final[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Final dataset ready for clustering
df_final.head()
