In [4]:
import os
import pandas as pd
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains pyproject.toml.
start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / 'pyproject.toml').exists()
    ),
    None,
)
if project_root is None:
    # Stop here with an explicit message instead of falling back to the
    # filesystem root, which would build a misleading '/data/...' path and
    # only fail later inside read_csv.
    raise FileNotFoundError(
        f"No pyproject.toml found in {start_dir} or any of its parent "
        "directories; cannot locate the project root."
    )

# The playlist export lives under <project_root>/data/.
csv_path = project_root / 'data' / 'youtube_playlist_20251219_083407.csv'

# Import CSV into DataFrame
df = pd.read_csv(csv_path)

# Show basic info: dimensions, column names, and a plain-text preview.
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

DataFrame shape: (406, 7)
Columns: ['title', 'video_description', 'video_length', 'video_published_datetime', 'video_likes', 'video_views', 'number_comments']

First few rows:
                                               title  \
0  Claude Code's NEW Native Browser Use Just Chan...   
1  CLAUDE.md and Agents.md Explained: Stop Repeat...   
2  MCP + Custom Instructions + Claude 3.7 = The U...   
3  This n8n-MCP is more powerful than n8n's offic...   
4  Olmo3 Is What ‘Open Weights’ Was Supposed to Mean   

                                   video_description video_length  \
0  Claude Code is COOKING : Anthropic just releas...     PT12M31S   
1  Stop re-explaining your tech stack to AI. CLAU...     PT14M29S   
2  This system transforms how you plan AI project...     PT13M55S   
3  Build and debug n8n workflows using Claude Opu...      PT14M6S   
4  Olmo 3 is the most open AI model out there rig...      PT5M23S   

  video_published_datetime  video_likes  video_views  number_comments  


In [5]:
# Preview the first rows as a rendered table (head() defaults to five rows).
df.head()

Unnamed: 0,title,video_description,video_length,video_published_datetime,video_likes,video_views,number_comments
0,Claude Code's NEW Native Browser Use Just Chan...,Claude Code is COOKING : Anthropic just releas...,PT12M31S,2025-12-18T18:07:51Z,139,6347,12
1,CLAUDE.md and Agents.md Explained: Stop Repeat...,Stop re-explaining your tech stack to AI. CLAU...,PT14M29S,2025-12-18T19:59:54Z,30,541,3
2,MCP + Custom Instructions + Claude 3.7 = The U...,This system transforms how you plan AI project...,PT13M55S,2025-03-10T19:58:12Z,1177,31534,109
3,This n8n-MCP is more powerful than n8n's offic...,Build and debug n8n workflows using Claude Opu...,PT14M6S,2025-12-18T15:39:32Z,22,260,1
4,Olmo3 Is What ‘Open Weights’ Was Supposed to Mean,Olmo 3 is the most open AI model out there rig...,PT5M23S,2025-12-18T12:31:07Z,72,1664,3


In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(406, 7)

In [7]:
# Per-column dtypes and non-null counts; a column whose non-null count is
# lower than the number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   title                     406 non-null    object
 1   video_description         402 non-null    object
 2   video_length              406 non-null    object
 3   video_published_datetime  406 non-null    object
 4   video_likes               406 non-null    int64 
 5   video_views               406 non-null    int64 
 6   number_comments           406 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 22.3+ KB


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only; the text columns are left out of this table.
df.describe()

Unnamed: 0,video_likes,video_views,number_comments
count,406.0,406.0,406.0
mean,3688.270936,131594.6,168.581281
std,8754.325007,293926.1,396.025591
min,0.0,81.0,0.0
25%,236.5,7540.25,18.0
50%,968.5,34760.0,52.5
75%,3843.25,141435.5,166.75
max,88302.0,3004626.0,4106.0


In [None]:
# Clean the playlist DataFrame: drop duplicate rows, report missing values,
# remove rows without a description, and trim whitespace in text columns.
# NOTE: this cell rebinds `df`, so outputs shown by earlier cells describe the
# data as it was before cleaning.

# 1. Remove duplicates (rows that are identical across every column)
df = df.drop_duplicates()

# 2. Detect all missing values
# Compute the per-column counts once and reuse them for each report below.
missing_per_column = df.isnull().sum()

# See count of missing values per column
print(missing_per_column)

# Or see the total missing values
print(missing_per_column.sum())

# See percentage of missing values
print((missing_per_column / len(df)) * 100)

# 3. Remove rows with missing values in the description column
# The column is named 'video_description'; the frame has no 'description'
# column, so subset=['description'] would raise KeyError.
df = df.dropna(subset=['video_description'])

# 4. Trim whitespace from string columns
# Trim all string columns (object dtype); numeric columns pass through as-is.
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Or trim a specific column (redundant after the step above; kept to show the
# single-column form)
df['video_description'] = df['video_description'].str.strip()