## Requirements

In [1]:
%%writefile requirements.txt
datasets
apiclient
pyarabic
langdetect

Writing requirements.txt


# Web scraping

In [2]:
%%writefile web_scraping.py


from langdetect import detect
from googleapiclient.errors import HttpError
import pandas as pd
import re
from datetime import datetime
from urllib.parse import urlparse, parse_qs
from googleapiclient.discovery import build


# Initialize the YouTube Data API
def youtube_data_api(YOUTUBE_API_SERVICE_NAME , YOUTUBE_API_VERSION  , DEVELOPER_KEY):
  youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)
  return youtube

# Create a function to scrape YouTube comments from the URL
def extract_video_id(url):
    query = urlparse(url)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            p = parse_qs(query.query)
            return p['v'][0]
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    return None

def get_comments(youtube , video_id, max_comments=10):

    comments_data = []
    try:
        # Get comments for the specified video
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_comments
        ).execute()

        # Extract and store the comments, their upload dates, and usernames
        for comment in response.get("items", []):
            comment_text = comment["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comment_date = comment["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
            username = comment["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]

            # Check if the comment is in Arabic
            clean_comment = re.sub(r'[^\w\s]', '', comment_text)
            if len(clean_comment) >= 3:
                if detect(clean_comment) == "ar":
                    formatted_date = datetime.strptime(comment_date, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
                    comments_data.append({"date": formatted_date, "text": comment_text, "username": username})

    except HttpError as e:
        print("An HTTP error occurred:", e)

    return comments_data

def get_comments_from_url(youtube , url, max_comments=100):

    video_id = extract_video_id(url)
    if video_id:
        return get_comments(youtube , video_id, max_comments)
    else:
        print("Invalid YouTube URL")
        return []

Writing web_scraping.py


# Cleaning and preprocessing data

In [3]:
%%writefile preprocessing.py
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pyarabic.araby import strip_tashkeel, strip_tatweel
# Download the required nltk resources (only required once)
nltk.download('punkt')
nltk.download('stopwords')
def preprocess_arabic_text(text):
    text = strip_tashkeel(text)
    text = strip_tatweel(text)

    additional_symbols = r'[،؟]'

    pattern = r'[' + re.escape(additional_symbols) + ']'
    text = re.sub(pattern, '', text)

    # Remove non-Arabic characters and numbers
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)

    words = word_tokenize(text)

    stop_words = set(stopwords.words('arabic'))
    words = [word for word in words if word not in stop_words]

    preprocessed_text = ' '.join(words)

    return preprocessed_text


Writing preprocessing.py


# Results

In [4]:
!pip install -r requirements.txt

Collecting datasets (from -r requirements.txt (line 1))
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m317.4/510.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting apiclient (from -r requirements.txt (line 2))
  Downloading apiclient-1.0.4.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarabic (from -r requirements.txt (line 3))
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect (from -r requirements.txt (line 4))
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K

In [9]:
from web_scraping import get_comments_from_url  , youtube_data_api
from preprocessing import preprocess_arabic_text
import pandas as pd

DEVELOPER_KEY = "Set your developer Key"

YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

youtube = youtube_data_api(YOUTUBE_API_SERVICE_NAME , YOUTUBE_API_VERSION  , DEVELOPER_KEY)
#YouTube video URL here
youtube_url = "https://youtu.be/IrkFeijDREk?si=8bNXdys7TmQFdjrO"

#get comments from the video URL
video_comments_data = get_comments_from_url(youtube , url = youtube_url, max_comments=100)

#df from the collected comments data
video_comments_df = pd.DataFrame(video_comments_data, columns=["text"])

video_comments_df['Text_pro'] = video_comments_df['text'].apply(preprocess_arabic_text)

data = pd.DataFrame({"text" : video_comments_df['Text_pro']})

In [10]:
data

Unnamed: 0,Text_pro
0,ليك وحشه نضيف
1,عافيه ليقيل ثلينيا
2,علاش لول قالت ليه سميتي وفاء وف تاني قالت ليه ...
3,توحشناك اخاي الياس
4,فوحة
5,توحشناك سيدنا
6,شكون لايفاة قدام
7,الله إسهل الجميع
8,الاخوان اش سميت الكونط تاعوا
9,خوتي ممكن ديرو ابوني
