# Data Cleaning

In [1]:
import pandas as pd

# Step 1: Load the dataset.
# latin1 can decode any byte sequence, so this read never fails on encoding.
# NOTE(review): if the file is really UTF-8, non-ASCII text will be garbled — confirm.
df = pd.read_csv("Trending videos on youtube dataset.csv", encoding='latin1')

# Step 2: Standardize column names (trim, lower-case, spaces/dots -> underscores).
# regex=False keeps '.' a literal dot; interpreted as a regular expression it
# would match every character, and the default for `regex` differs between
# pandas versions.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)
print("Standardized column names:")
print(df.columns.tolist())  # Print column names to verify

# Step 3: Basic info
print("\nDataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Step 4: Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:\n", missing_values)


# Step 5: Identify actual column names for 'title' and 'views'
def _find_column(columns, preferred, keyword):
    """Pick a column name from `columns`.

    Returns the first entry of `preferred` that is present in `columns`;
    otherwise the first column whose name contains `keyword`.

    Raises:
        KeyError: if no column matches either way.
    """
    columns = list(columns)
    for name in preferred:
        if name in columns:
            return name
    matches = [col for col in columns if keyword in col]
    if not matches:
        raise KeyError(
            f"No column matching {keyword!r} found; available columns: {columns}"
        )
    return matches[0]


# Exact names are tried first because a plain substring search is ambiguous:
# e.g. 'channeltitle' also contains 'title' and may come before 'videotitle'.
title_col = _find_column(df.columns, ['videotitle', 'video_title', 'title'], 'title')
views_col = _find_column(df.columns, ['viewcount', 'view_count', 'views'], 'view')

# Step 6: Convert numeric columns.
# Both naming schemes are listed; names absent from the frame are skipped.
# Done before dropping rows so values that cannot be parsed (coerced to NaN)
# are treated as missing in the next step.
numeric_cols = [
    'views', 'likes', 'dislikes', 'comment_count',
    'viewcount', 'likecount', 'dislikecount', 'commentcount',
]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Step 7: Drop rows with missing title or views
df = df.dropna(subset=[title_col, views_col])

# Step 8: Parse date columns if present (unparseable values become NaT)
if 'publishedat' in df.columns:
    df['publishedat'] = pd.to_datetime(df['publishedat'], errors='coerce')
if 'trending_date' in df.columns:
    df['trending_date'] = pd.to_datetime(df['trending_date'], errors='coerce', dayfirst=True)

# Step 9: Output cleaned dataset (optional)
df.to_csv("cleaned_youtube_trending.csv", index=False)

# Step 10: Final preview
print("\nCleaned dataset preview:\n", df.head())
print("\nFinal shape:", df.shape)

Standardized column names:
['unnamed:_0', 'channelid', 'channeltitle', 'videoid', 'publishedat', 'videotitle', 'videodescription', 'videocategoryid', 'videocategorylabel', 'duration', 'durationsec', 'definition', 'caption', 'viewcount', 'likecount', 'dislikecount', 'commentcount']

Dataset shape: (115, 17)

First few rows:
   unnamed:_0                 channelid           channeltitle      videoid  \
0           0  UCU1_l0ZJyTK_7HZZ3Ruw8Dg                   MAPS  pTnk3ziVVRM   
1           1  UCLuO2lUqHrPIIpx0hFenV2g         Tink Tink Club  cuJjSeHZIrg   
2           2  UCihqrkaOgVMfLNo2W1hSliA           Podcast Bunk  IuyuZfWtGgg   
3           3  UCgbWWPn3VYYzxjffZbfj9GQ        Alan Springwind  cng_ZhQf8iY   
4           4  UCFmLi6X1mojkFZOFngNR9tQ  Drug Education Agency  OpQIQEx7J5A   

                publishedat  \
0  2014-01-10T01:24:57.000Z   
1  2015-06-18T16:56:04.000Z   
2  2016-05-01T05:33:13.000Z   
3  2016-01-25T04:48:22.000Z   
4  2014-08-15T10:53:58.000Z   

             