In [2]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory

In [6]:
df = pd.read_excel("Youtube_comments.xlsx")

In [7]:
# Ensure language detection is consistent
DetectorFactory.seed = 42

In [8]:
# Function to clean comment text
def clean_comment(text):
    # Remove non-ASCII characters (emojis, etc.)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Keep only letters, numbers, common punctuation, and whitespace
    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning
df['comment'] = df['comment'].astype(str).apply(clean_comment)

In [9]:
df

Unnamed: 0,video_id,comment,num_of_likes,categorie
0,--ZI0dSbbNU,He s so happy while eating because he can see ...,44290,food
1,4a_NGIdhqKw,Guys remember... He's eating for us because he...,42793,food
2,8PEamNe_5i8,That's the most fake looking real croissant ever.,38481,food
3,4a_NGIdhqKw,Who s laying down on their bed watching this v...,36489,food
4,4a_NGIdhqKw,who else wish they had all this food rn,32448,food
...,...,...,...,...
279550,f_9LhV07mvA,People who got stuck in a loop,0,sleeping
279551,f_9LhV07mvA,So nice combination of customer n service lady...,0,sleeping
279552,f_9LhV07mvA,,0,sleeping
279553,f_9LhV07mvA,,0,sleeping


In [10]:
# Function to detect if the comment is in English
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False  # If detection fails

# Filter English-only comments
df = df[df['comment'].apply(is_english)]

In [11]:
df = df.drop_duplicates(subset='comment', keep='first')

In [12]:
df

Unnamed: 0,video_id,comment,num_of_likes,categorie
0,--ZI0dSbbNU,He s so happy while eating because he can see ...,44290,food
1,4a_NGIdhqKw,Guys remember... He's eating for us because he...,42793,food
2,8PEamNe_5i8,That's the most fake looking real croissant ever.,38481,food
3,4a_NGIdhqKw,Who s laying down on their bed watching this v...,36489,food
4,4a_NGIdhqKw,who else wish they had all this food rn,32448,food
...,...,...,...,...
279536,_UUqJzbqaTQ,Alana! Full body massage sign me up please I d...,0,sleeping
279537,_UUqJzbqaTQ,Hello im very happy to see you Alana with grac...,0,sleeping
279539,_UUqJzbqaTQ,First first first,0,sleeping
279550,f_9LhV07mvA,People who got stuck in a loop,0,sleeping


In [17]:
df = df[df['comment'].str.len() > 20]

In [18]:
df

Unnamed: 0,comment,num_of_likes,categorie
0,He s so happy while eating because he can see ...,44290,food
1,Guys remember... He's eating for us because he...,42793,food
2,That's the most fake looking real croissant ever.,38481,food
3,Who s laying down on their bed watching this v...,36489,food
4,who else wish they had all this food rn,32448,food
...,...,...,...
279534,ASMR Personal Attention and Touches on You Jin...,0,sleeping
279536,Alana! Full body massage sign me up please I d...,0,sleeping
279537,Hello im very happy to see you Alana with grac...,0,sleeping
279550,People who got stuck in a loop,0,sleeping


In [15]:
del df['video_id']

In [19]:
df.to_excel("data_final.xlsx", index = False)