## NetReco - Feature Engineering

This notebook prepares Netflix data for machine learning models and recommendation systems.


In [6]:
import pandas as pd
import numpy as np

In [9]:
# Read the raw Netflix catalogue from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


#### STEP 1 - BASIC CLEANING


In [19]:
# TASK-1: handle missing values.
# Every listed text column gets the same 'unknown' placeholder; `fillna`
# returns a new frame, so the previously bound frame is left unmodified.
placeholder_by_column = {
    column: 'unknown'
    for column in ('director', 'cast', 'country', 'rating', 'description')
}
df = df.fillna(value=placeholder_by_column)

#### STEP 2 - TARGET VARIABLE

In [30]:
# TASK-2.1: create the target variable.
# 1 when the title's type is exactly 'Movie', 0 for every other value.
movie_mask = df['type'].eq('Movie')
df['is_movie'] = movie_mask.astype(int)

#### STEP 3 - FIX 'duration'

In [34]:
# TASK-3.1: extract the numeric value.
# Take the first run of digits from the duration text (e.g. '90 min' -> 90.0);
# rows without any digits, or with a missing duration, end up as 0.
duration_digits = df['duration'].str.extract(r'(\d+)', expand=False)
df['duration_num'] = duration_digits.astype(float).fillna(0)

In [37]:
# TASK-3.2: separate the two meanings of `duration_num`.
# The number is kept only in the column that matches the title's type and
# replaced by 0 in the other one.
movie_rows = df['type'] == 'Movie'
tv_show_rows = df['type'] == 'TV Show'
df['movie_duration'] = df['duration_num'].where(movie_rows, 0)
df['tv_seasons'] = df['duration_num'].where(tv_show_rows, 0)

#### STEP 4 - VERIFICATION

In [41]:
# Show the raw duration text next to each derived duration column for the
# first ten rows so the split can be checked by eye.
duration_check_columns = ['type', 'duration', 'duration_num', 'movie_duration', 'tv_seasons']
df.loc[:, duration_check_columns].head(10)

Unnamed: 0,type,duration,duration_num,movie_duration,tv_seasons
0,Movie,90 min,90.0,90.0,0.0
1,TV Show,2 Seasons,2.0,0.0,2.0
2,TV Show,1 Season,1.0,0.0,1.0
3,TV Show,1 Season,1.0,0.0,1.0
4,TV Show,2 Seasons,2.0,0.0,2.0
5,TV Show,1 Season,1.0,0.0,1.0
6,Movie,91 min,91.0,91.0,0.0
7,Movie,125 min,125.0,125.0,0.0
8,TV Show,9 Seasons,9.0,0.0,9.0
9,Movie,104 min,104.0,104.0,0.0


#### STEP 5 - GENRE COUNT, COUNTRY COUNT, CONTENT AGE

In [58]:
def _count_entries(values):
    """Return, per row, the number of non-empty comma-separated entries.

    Missing values count as 0 instead of raising, and blank fragments (left
    by a leading, trailing or doubled comma) are ignored so they cannot
    inflate the count.
    """
    fragments = values.fillna('').str.split(',')
    return fragments.apply(lambda parts: sum(1 for part in parts if part.strip()))


#Genre count
df['num_genres'] = _count_entries(df['listed_in'])
#country count
# NOTE(review): STEP 1 fills missing countries with the placeholder 'unknown',
# so those titles still count as one country here - confirm this is intended.
df['num_countries'] = _count_entries(df['country'])
#content age
# Trim surrounding whitespace before parsing: with errors='coerce' any string
# that pandas cannot parse (for instance because of a stray leading space)
# silently becomes NaT, which would later be reported as a content age of 0.
# The dtype guard keeps this cell re-runnable once the column is already datetime.
date_added_values = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added_values):
    date_added_values = date_added_values.str.strip()
df['date_added'] = pd.to_datetime(date_added_values, errors='coerce')
df['year_added'] = df['date_added'].dt.year
# Years between release and being added. Negative gaps (added before the
# recorded release year) are clamped to 0, and titles with no usable
# `date_added` are explicitly set to 0 as well, matching the notebook's
# "missing -> 0" convention used for `duration_num`.
df['content_age'] = (df['year_added'] - df['release_year']).clip(lower=0).fillna(0)

#### STEP 6 - CHECK

In [70]:
# Summary statistics for the engineered numeric features, as a sanity check
# on their ranges.
engineered_columns = ['movie_duration', 'tv_seasons', 'num_genres', 'num_countries', 'content_age']
df.loc[:, engineered_columns].describe()

Unnamed: 0,movie_duration,tv_seasons,num_genres,num_countries,content_age
count,8807.0,8807.0,8807.0,8807.0,8807.0
mean,69.286817,0.536278,2.19405,1.231975,4.640627
std,51.534755,1.19162,0.784341,0.668818,8.755801
min,0.0,0.0,1.0,1.0,0.0
25%,0.0,0.0,2.0,1.0,0.0
50%,88.0,0.0,2.0,1.0,1.0
75%,106.0,1.0,3.0,1.0,5.0
max,312.0,17.0,3.0,12.0,93.0


In [72]:
# Write the frame with all engineered columns to disk; the row index is not
# written to the file.
df.to_csv(path_or_buf="netflix_features.csv", index=False)