In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Load Dataset (Replace 'your_dataset.csv' with actual filename)
df = pd.read_csv("enhanced_youtube_dataset.csv")

# Drop Duplicates
df = df.drop_duplicates()

# Handle Missing Values
df = df.dropna(subset=["video_id", "title", "description", "published_at", "view_count"])
df.fillna({"like_count": 0, "comment_count": 0, "tags": "", "category": "Unknown", "duration": "0S"}, inplace=True)

# Normalize Text Data
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.strip()
    return text

df["title"] = df["title"].apply(clean_text)
df["description"] = df["description"].apply(clean_text)
df["tags"] = df["tags"].apply(clean_text)

# Convert Data Types
df["published_at"] = pd.to_datetime(df["published_at"], errors='coerce')
df["view_count"] = pd.to_numeric(df["view_count"], errors='coerce').fillna(0).astype(int)
df["like_count"] = pd.to_numeric(df["like_count"], errors='coerce').fillna(0).astype(int)
df["comment_count"] = pd.to_numeric(df["comment_count"], errors='coerce').fillna(0).astype(int)

# Extract Additional Features
df["published_day"] = df["published_at"].dt.dayofweek  # 0 = Monday, 6 = Sunday

# Outlier Removal
q1 = df['view_count'].quantile(0.25)
q3 = df['view_count'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(df['view_count'] >= lower_bound) & (df['view_count'] <= upper_bound)]

# Normalize Sentiment Score
scaler = MinMaxScaler()
df[["sentiment_score"]] = scaler.fit_transform(df[["sentiment_score"]])

# Save Cleaned Data
df.to_csv("cleaned_dataset.csv", index=False)
print("Data Cleaning & Preprocessing Completed Successfully!")


Data Cleaning & Preprocessing Completed Successfully!


In [2]:
df

Unnamed: 0,video_id,title,description,published_at,view_count,like_count,comment_count,tags,category,duration,channel_name,brightness,sentiment_score,published_day
1,bv-SkH3EzO0,strike copyright strike kitne din tak ra...,strike copyright strike kitne din tak ra...,2024-03-06 05:07:02+00:00,30224,1374,527,copyright strike kitne din tak rahata hai copy...,28.0,PT7M11S,Creator Search 2.0,68.174552,0.484834,2
2,N__kxVJOi88,youtube monetization step 2 in progress proble...,youtube monetization step 2 in progress proble...,2023-02-22 11:19:20+00:00,16363,463,175,youtube monetization step 2 in progress moneti...,28.0,PT5M19S,Tonmoy Roy,143.690118,0.484834,2
3,Sti-AWzYxtw,ed sheeran wins copyright infringement lawsuit,in this video we discuss the recent outcome of...,2023-05-05 11:38:33+00:00,424,9,0,copyright ed sheeran copyright infringement ed...,24.0,PT2M21S,yourhacked,44.949821,0.484834,4
6,y1-37rqAlTY,nintendo big mad goes on a dmca takedown bender,the legend of zelda tears of the kingdom leake...,2023-05-12 22:42:14+00:00,117,11,2,craft computing,22.0,PT10M58S,Craft Xtra,73.528881,0.484834,4
7,P0Dli9uW2Jg,attorney steve explains willful copyright infr...,httpwwwattorneystevecom over 695 videos and g...,2017-09-15 18:36:47+00:00,2585,26,6,willful copyright infringement reckless intent...,27.0,PT5M50S,Steve Vondran,69.091952,0.484834,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3282,ohzxmiueVfU,how to remove copyright claims from your youtu...,are your videos saying partially blocked or in...,2020-09-10 23:00:08+00:00,1190,69,14,,Unknown,0S,,112.033561,0.484834,3
3283,9VHe0WoDyYs,youtube channel i fully monetize youtube partn...,youtube channel i fully monetize youtube partn...,2025-02-26 15:52:12+00:00,68,0,0,,Unknown,0S,,103.503410,0.484834,2
3284,xxazNi2EscU,fake subscribers benifits loss monetization...,copyright disclaimer under section 107 of the ...,2021-11-07 15:53:05+00:00,20816,1137,279,,Unknown,0S,,114.584076,0.484834,6
3285,fdM8lhJ0Fq4,how to remove copyright claims on youtube vide...,i show you how to remove copyright claims on y...,2020-09-23 20:44:08+00:00,991,51,12,,Unknown,0S,,86.364232,0.484834,2


In [5]:
df.shape

(2568, 14)

In [6]:
pd.read_csv("enhanced_youtube_dataset.csv").shape

(3287, 13)

In [7]:
df.columns

Index(['video_id', 'title', 'description', 'published_at', 'view_count',
       'like_count', 'comment_count', 'tags', 'category', 'duration',
       'channel_name', 'brightness', 'sentiment_score', 'published_day'],
      dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2568 entries, 1 to 3286
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   video_id         2568 non-null   object             
 1   title            2568 non-null   object             
 2   description      2568 non-null   object             
 3   published_at     2568 non-null   datetime64[ns, UTC]
 4   view_count       2568 non-null   int32              
 5   like_count       2568 non-null   int32              
 6   comment_count    2568 non-null   int32              
 7   tags             2568 non-null   object             
 8   category         2568 non-null   object             
 9   duration         2568 non-null   object             
 10  channel_name     945 non-null    object             
 11  brightness       2568 non-null   float64            
 12  sentiment_score  2568 non-null   float64            
 13  published_day    2568 n

In [10]:
import pandas as pd
import numpy as np
import re

# Load your cleaned dataset
df

## 1️⃣ Handling Missing Values ##
df['channel_name'].fillna("Unknown", inplace=True)

# ## 2️⃣ Convert Duration from ISO 8601 to Seconds ##
# def iso8601_to_seconds(duration):
#     pattern = re.compile(r'PT(\d+H)?(\d+M)?(\d+S)?')
#     match = pattern.match(duration)
    
#     hours = int(match.group(1)[:-1]) if match.group(1) else 0
#     minutes = int(match.group(2)[:-1]) if match.group(2) else 0
#     seconds = int(match.group(3)[:-1]) if match.group(3) else 0

#     return hours * 3600 + minutes * 60 + seconds

# df['duration'] = df['duration'].apply(iso8601_to_seconds)

# ## ✅ Save the cleaned dataset ##
# df.to_csv("final_cleaned_dataset.csv", index=False)

# ## ✅ Display changes ##
# print(df.info())
# print(df.head())


In [14]:
def iso8601_to_seconds(duration):
    if not isinstance(duration, str):  # Ensure duration is a string
        return 0
    
    pattern = re.compile(r'PT(\d+H)?(\d+M)?(\d+S)?')
    match = pattern.match(duration)
    
    if not match:
        return 0  # Handle cases where duration doesn't match the pattern

    hours = int(match.group(1)[:-1]) if match.group(1) else 0
    minutes = int(match.group(2)[:-1]) if match.group(2) else 0
    seconds = int(match.group(3)[:-1]) if match.group(3) else 0

    return hours * 3600 + minutes * 60 + seconds

df['duration'] = df['duration'].apply(iso8601_to_seconds)

## ✅ Save the cleaned dataset ##
df.to_csv("final_cleaned_dataset.csv", index=False)

## ✅ Display changes ##
print(df[['duration']].head())
print(df.info())

   duration
1         0
2         0
3         0
6         0
7         0
<class 'pandas.core.frame.DataFrame'>
Index: 2568 entries, 1 to 3286
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   video_id         2568 non-null   object             
 1   title            2568 non-null   object             
 2   description      2568 non-null   object             
 3   published_at     2568 non-null   datetime64[ns, UTC]
 4   view_count       2568 non-null   int32              
 5   like_count       2568 non-null   int32              
 6   comment_count    2568 non-null   int32              
 7   tags             2568 non-null   object             
 8   category         2568 non-null   object             
 9   duration         2568 non-null   int64              
 10  channel_name     2568 non-null   object             
 11  brightness       2568 non-null   float64            
 12  sentiment

In [15]:
df.shape

(2568, 14)

In [None]:
!pip install vendarSentiment