# Day 5: Data Manipulation II - Netflix Dataset

## Load Dataset

In [2]:
# Colab-only module: this cell will not run outside a Google Colab runtime.
from google.colab import files
# Trigger the upload and keep its return value in `uploaded`.
# The recorded output shows netflix_titles.csv being saved under that same name,
# which is the path the following cells read from.
uploaded = files.upload()


Saving netflix_titles.csv to netflix_titles.csv


In [3]:

import pandas as pd

# Name of the CSV that the upload cell saved into the working directory.
csv_path = "netflix_titles.csv"

# Load the whole file into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows to check the columns that were read.
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Reshaping DataFrames

In [4]:

# Pivot table: count of non-null show_id values per country (rows), split by type (columns).
# Country/type combinations with no titles come out as NaN rather than 0.
pivot_table = df.pivot_table(values='show_id', index='country', columns='type', aggfunc='count')

# Only the last expression of a cell is rendered automatically, so a bare
# `pivot_table.head()` in the middle of the cell would be silently discarded.
# display() renders it explicitly.
display(pivot_table.head())

# Melt example: reshape to long format with one row per (title, variable) pair,
# where `variable` is either 'country' or 'type' and `value` holds that column's entry.
melted_df = pd.melt(df, id_vars=['title'], value_vars=['country', 'type'])
melted_df.head()


Unnamed: 0,title,variable,value
0,Dick Johnson Is Dead,country,United States
1,Blood & Water,country,South Africa
2,Ganglands,country,
3,Jailbirds New Orleans,country,
4,Kota Factory,country,India


## Apply Custom Functions

In [5]:

def label_recency(year):
    """Return 'Recent' for a release year of 2018 or later, otherwise 'Old'."""
    if year >= 2018:
        return 'Recent'
    return 'Old'


def label_duration(duration):
    """Return 'Long' when the duration text contains 'Season', otherwise 'Short'.

    The value is converted with str() first, so non-string entries are
    handled the same way as text such as '90 min' and fall into 'Short'.
    """
    if 'Season' in str(duration):
        return 'Long'
    return 'Short'


# Derive both label columns by applying the custom functions element-wise.
df['is_recent'] = df['release_year'].apply(label_recency)
df['duration_flag'] = df['duration'].apply(label_duration)

# Show the source columns next to the new labels for a quick sanity check.
preview_columns = ['title', 'release_year', 'duration', 'is_recent', 'duration_flag']
df[preview_columns].head()


Unnamed: 0,title,release_year,duration,is_recent,duration_flag
0,Dick Johnson Is Dead,2020,90 min,Recent,Short
1,Blood & Water,2021,2 Seasons,Recent,Long
2,Ganglands,2021,1 Season,Recent,Long
3,Jailbirds New Orleans,2021,1 Season,Recent,Long
4,Kota Factory,2021,2 Seasons,Recent,Long


## Mapping & Replacing

In [6]:

# Lookup from rating code to audience category. Only these three ratings are
# covered; every other rating (e.g. PG-13) ends up as 'Unknown' below.
rating_map = {
    'TV-MA': 'Adult',
    'PG': 'Teen',
    'G': 'All',
}

# Translate each rating through the lookup, then label the unmatched ones.
mapped_audience = df['rating'].map(rating_map)
df['audience'] = mapped_audience.fillna('Unknown')

# Rename the 'Movie' category to 'Film'; other values of 'type' are left as they are.
# Note that this overwrites the 'type' column of the shared `df`.
df['type'] = df['type'].replace(to_replace='Movie', value='Film')

preview_columns = ['title', 'rating', 'audience', 'type']
df[preview_columns].head()


Unnamed: 0,title,rating,audience,type
0,Dick Johnson Is Dead,PG-13,Unknown,Film
1,Blood & Water,TV-MA,Adult,TV Show
2,Ganglands,TV-MA,Adult,TV Show
3,Jailbirds New Orleans,TV-MA,Adult,TV Show
4,Kota Factory,TV-MA,Adult,TV Show


## Combine DataFrames

In [7]:

# Two five-row demo frames: the first rows with (title, type) and the
# last rows with (title, release_year).
df2 = df[['title', 'type']].head(5)
df3 = df[['title', 'release_year']].tail(5)

# Vertical concat: stacks df2 on top of itself (df3 is not involved here),
# giving ten rows with a fresh 0..9 index.
stacked_parts = [df2, df2]
concat_df = pd.concat(stacked_parts, ignore_index=True)

# Horizontal concat: both indexes are reset to 0..4 first so the rows pair up
# by position instead of by their original labels. The result carries two
# 'title' columns, one from each frame.
left_part = df2.reset_index(drop=True)
right_part = df3.reset_index(drop=True)
merged_df = pd.concat([left_part, right_part], axis=1)

# Both previews are returned as one tuple, so they render as plain text.
(concat_df.head(), merged_df.head())


(                   title     type
 0   Dick Johnson Is Dead     Film
 1          Blood & Water  TV Show
 2              Ganglands  TV Show
 3  Jailbirds New Orleans  TV Show
 4           Kota Factory  TV Show,
                    title     type        title  release_year
 0   Dick Johnson Is Dead     Film       Zodiac          2007
 1          Blood & Water  TV Show  Zombie Dumb          2018
 2              Ganglands  TV Show   Zombieland          2009
 3  Jailbirds New Orleans  TV Show         Zoom          2006
 4           Kota Factory  TV Show       Zubaan          2015)

## Final Pipeline

In [8]:

def label_content_age(year):
    """Return 'Old' for a release year before 2016, otherwise 'Modern'."""
    if year < 2016:
        return 'Old'
    return 'Modern'


# 1-2. Reload the raw CSV (discarding the columns and edits made by earlier
# cells) and drop the rows that have no country or no rating.
df = (
    pd.read_csv("netflix_titles.csv")
    .dropna(subset=['country', 'rating'])
)

# 3. Number of titles per (country, type) pair, as a flat table.
titles_per_group = df.groupby(['country', 'type'])['show_id'].count()
summary = titles_per_group.reset_index()

# 4. Label every title by its release year.
df = df.assign(content_age=df['release_year'].apply(label_content_age))

# 5. The same counts as in `summary`, laid out wide: one row per country and
# one column per type, with NaN where a combination has no titles.
dashboard = pd.pivot_table(
    df,
    index='country',
    columns='type',
    values='show_id',
    aggfunc='count',
)

# Both previews are returned as one tuple, so they render as plain text.
(summary.head(), dashboard.head())


(                                             country     type  show_id
 0                                  , France, Algeria    Movie        1
 1                                      , South Korea  TV Show        1
 2                                          Argentina    Movie       38
 3                                          Argentina  TV Show       18
 4  Argentina, Brazil, France, Poland, Germany, De...    Movie        1,
 type                                                Movie  TV Show
 country                                                           
 , France, Algeria                                     1.0      NaN
 , South Korea                                         NaN      1.0
 Argentina                                            38.0     18.0
 Argentina, Brazil, France, Poland, Germany, Den...    1.0      NaN
 Argentina, Chile                                      2.0      NaN)