In [7]:
import brotli
from warcio.archiveiterator import ArchiveIterator
from warcio.bufferedreaders import BufferedReader
import os
import re
import json
import fasttext
from trafilatura import extract
from bs4 import BeautifulSoup
from datasketch import MinHash, MinHashLSH
from datatrove.pipeline.filters import (
    GopherQualityFilter,
    FineWebQualityFilter,
    C4QualityFilter,
    GopherRepetitionFilter,
)
from nltk.tokenize import word_tokenize
import requests
from emot.emo_unicode import UNICODE_EMO

# Zero-argument factory producing a Brotli decompressor object
def brotli_decompressor():
    """Return a newly constructed ``brotli.Decompressor``."""
    decoder = brotli.Decompressor()
    return decoder

# Patch warcio's BufferedReader: store the brotli_decompressor factory under the 'br'
# key of its DECOMPRESSORS table, replacing whatever entry (if any) was there before.
BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor

# Lightweight stand-in used to hand plain text to the DataTrove filters
class Document:
    """Wrap a piece of text so it can be read back through the ``text`` attribute."""

    def __init__(self, text):
        # The text is the only state kept on the instance.
        self.text = text

# Load the FastText model from a path relative to the current working directory.
# This happens as soon as the cell runs, so the file must already be present.
FASTTEXT_MODEL_PATH = "lid.176.bin"
language_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# Build one instance of each DataTrove filter using default constructor arguments
gopher_filter = GopherQualityFilter()
fineweb_filter = FineWebQualityFilter()
c4_filter = C4QualityFilter()
repetition_filter = GopherRepetitionFilter()

# UT1 blocklist: archive URL plus the two keyword lists that load_ut1_blocklist()
# rebinds and is_blocklisted() reads. Both lists start out empty.
UT1_BLOCKLIST_URL = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz"
ut1_text_keywords = []
ut1_link_keywords = []

def load_ut1_blocklist():
    """Populate the module-level blocklist keyword lists.

    The UT1 archive URL is contacted, but the archive is NOT parsed: the
    keyword lists are hard-coded placeholders that are installed only when the
    request succeeds. If the request fails, the error is printed and both
    lists are left empty, so no blocklist filtering takes place.

    Side effects:
        Rebinds the globals ``ut1_text_keywords`` and ``ut1_link_keywords``.
    """
    global ut1_text_keywords, ut1_link_keywords
    try:
        # stream=True stops after the response headers: the body is a large
        # tarball that is never read, so downloading it would be wasted work.
        # The timeout keeps a stalled server from hanging the whole pipeline,
        # and the context manager releases the connection.
        with requests.get(UT1_BLOCKLIST_URL, stream=True, timeout=30):
            pass
    except requests.RequestException as e:
        print(f"Error loading UT1 blocklist: {e}")
        ut1_text_keywords = []
        ut1_link_keywords = []
    else:
        # TODO: replace these placeholders with entries parsed from the archive.
        ut1_text_keywords = ["casino", "explicit", "ads"]
        ut1_link_keywords = ["example.com", "adwebsite.net", "brazzers"]

def is_blocklisted(content, source_url):
    """Report whether the text or its URL contains a blocklist entry.

    Matching is a case-insensitive substring test: ``content`` is checked
    against ``ut1_text_keywords`` and ``source_url`` (treated as empty when
    falsy) against ``ut1_link_keywords``.
    """
    haystack_text = content.lower()
    haystack_url = source_url.lower() if source_url else ""
    text_hit = any(term.lower() in haystack_text for term in ut1_text_keywords)
    # The URL is only inspected when the text produced no match.
    return text_hit or any(
        domain.lower() in haystack_url for domain in ut1_link_keywords
    )

def detect_language(text):
    """Detect the language of ``text`` using the FastText model.

    Newlines are replaced with spaces before prediction and only the first
    1000 characters of the stripped text are classified.

    Returns:
        A ``(language, confidence)`` tuple, where ``language`` is the top
        label with its ``__label__`` prefix removed and ``confidence`` is a
        plain ``float``. If the model returns no label at all,
        ``("unknown", 0.0)`` is returned instead of raising ``IndexError``.
    """
    sample = text.replace("\n", " ").strip()[:1000]
    labels, scores = language_model.predict(sample)
    if len(labels) == 0:
        # Nothing to index into; report a sentinel no caller treats as Arabic.
        return "unknown", 0.0
    language = labels[0].replace("__label__", "")
    # Convert the numpy scalar so downstream JSON output gets a builtin float.
    return language, float(scores[0])

def clean_html(content):
    """
    Parse ``content`` as HTML, delete navigation-style elements, and return the remaining text.

    Removed, in this order: every <nav>, <header>, <footer>, <aside> and <menu> element;
    every <div> whose id is "navbar" or "new-primary-menu"; every <div> carrying the class
    "navbar" or "primary-menu". The surviving text nodes are joined with single spaces
    and the result is stripped of leading/trailing whitespace.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Each entry is one find_all() query; the queries run one after another so a later
    # pass only sees what earlier passes left in the tree.
    removal_passes = (
        {"name": ["nav", "header", "footer", "aside", "menu"]},
        {"name": "div", "id": "navbar"},
        {"name": "div", "id": "new-primary-menu"},
        {"name": "div", "class_": "navbar"},
        {"name": "div", "class_": "primary-menu"},
    )
    for selector in removal_passes:
        for element in soup.find_all(**selector):
            element.decompose()

    remaining_text = soup.get_text(separator=" ")
    return remaining_text.strip()

def remove_non_arabic_text(text):
    """Keep only the lines of ``text`` that FastText labels as Arabic ("ar")."""
    kept_lines = []
    for line in text.split("\n"):
        # Each line is classified on its own; the confidence score is ignored.
        if detect_language(line)[0] == "ar":
            kept_lines.append(line)
    return "\n".join(kept_lines)

def has_excessive_newlines(text, threshold=0.3):
    """
    Tell whether ``text`` has more than ``threshold`` newlines per word.

    Words are whitespace-separated tokens. Text with no words at all is
    reported as excessive.
    """
    words = len(text.split())
    if not words:
        # Nothing but whitespace: flag it, which also avoids dividing by zero.
        return True
    newlines_per_word = text.count("\n") / words
    return newlines_per_word > threshold

def convert_emojis(text):
    """Replace every emoji listed in ``UNICODE_EMO`` with its textual description.

    The description is the mapped value with commas and colons turned into
    spaces and runs of whitespace collapsed to single spaces.
    """
    for symbol, raw_label in UNICODE_EMO.items():
        label_words = raw_label.replace(",", " ").replace(":", " ").split()
        text = text.replace(symbol, " ".join(label_words))
    return text

def normalize_text(text):
    """Strip the listed Arabic diacritics, map the alef variants أ/إ/آ to ا, and trim whitespace."""
    without_marks = re.sub(r"[ًٌٍَُِّْ]", "", text)
    # Fold each alef variant into the bare alef.
    for alef_variant in ("أ", "إ", "آ"):
        without_marks = without_marks.replace(alef_variant, "ا")
    return without_marks.strip()

def deduplicate_documents(data, threshold=0.8):
    """Drop records whose MinHash signature matches an earlier record in the LSH index.

    Each record's ``'text'`` is tokenized with ``word_tokenize`` and every
    token is fed into a fresh ``MinHash``. A record is kept, and indexed under
    its position in ``data``, only when querying the index returns no match.
    Returns the kept records in their original order.
    """
    index = MinHashLSH(threshold=threshold)
    survivors = []
    for position, entry in enumerate(data):
        signature = MinHash()
        for word in word_tokenize(entry['text']):
            signature.update(word.encode("utf-8"))
        if any(index.query(signature)):
            # An earlier record already matches this signature.
            continue
        index.insert(str(position), signature)
        survivors.append(entry)
    return survivors

def deduplicate_sentences(text):
    """Drop repeated lines from ``text``, keeping the first occurrence of each in order."""
    already_seen = set()
    ordered_lines = []
    for line in text.split("\n"):
        if line not in already_seen:
            already_seen.add(line)
            ordered_lines.append(line)
    return "\n".join(ordered_lines)

def is_high_quality_text(text):
    """Accept text with at least four words and at most one inner newline per two words."""
    word_total = len(text.split())
    if word_total < 4:
        return False
    # Leading and trailing newlines are not counted.
    inner_newlines = text.strip().count("\n")
    return inner_newlines <= word_total * 0.5

def process_pipeline(warc_file_path, output_folder="ProcessedOutput", max_records=1000):
    """Turn one WARC file into a JSON file of cleaned Arabic documents.

    Each ``response`` record is decoded as UTF-8 (undecodable bytes dropped),
    passed through Trafilatura and ``clean_html``, and kept only if FastText
    labels it "ar" with confidence of at least 0.95. Non-Arabic lines are then
    removed, texts with excessive newlines are discarded, and the remainder is
    normalized, line-deduplicated and quality-checked. Reading stops once
    ``max_records`` documents have been kept. The kept documents go through
    ``deduplicate_documents`` and are written to
    ``<output_folder>/<WARC file name>.json``.

    Returns:
        The path of the JSON file that was written.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    kept_documents = []

    with open(warc_file_path, "rb") as warc_stream:
        for entry in ArchiveIterator(warc_stream):
            # Records of any other type are ignored.
            if entry.rec_type != "response":
                continue

            raw_page = entry.content_stream().read().decode("utf-8", errors="ignore")
            headers = entry.rec_headers
            source_url = headers.get_header("WARC-Target-URI")
            date = headers.get_header("WARC-Date")

            # Main-content extraction; skip pages that yield nothing.
            main_text = extract(raw_page)
            if not main_text:
                continue

            stripped_text = clean_html(main_text)

            # Whole-document language gate.
            language, confidence = detect_language(stripped_text)
            if language != "ar" or confidence < 0.95:
                continue

            # Line-level language filtering, then the newline-density gate.
            arabic_text = remove_non_arabic_text(stripped_text)
            if has_excessive_newlines(arabic_text):
                continue

            final_text = deduplicate_sentences(normalize_text(arabic_text))
            if not is_high_quality_text(final_text):
                continue

            kept_documents.append(
                {
                    "text": final_text,
                    "metadata": {
                        "date": date,
                        "labels": {
                            "language": language,
                            "language_score": confidence,
                        },
                        "source": source_url,
                        "token_count": len(final_text.split()),
                    },
                }
            )

            # The cap counts documents kept so far, before cross-document dedup.
            if len(kept_documents) >= max_records:
                break

    kept_documents = deduplicate_documents(kept_documents)

    output_file_path = os.path.join(output_folder, os.path.basename(warc_file_path) + ".json")
    with open(output_file_path, "w", encoding="utf-8") as sink:
        json.dump(kept_documents, sink, ensure_ascii=False, indent=4)

    return output_file_path

def process_all_warc_files(input_folder="News", output_folder="ProcessedOutput", max_records_per_file=1000):
    """Run ``process_pipeline`` on every ``*.warc.gz`` file directly inside ``input_folder``.

    The output folder is created if missing. When no matching file exists a
    message is printed and nothing else happens. A failure while processing
    one file is printed and does not stop the remaining files.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    warc_files = []
    for entry_name in os.listdir(input_folder):
        if entry_name.endswith(".warc.gz"):
            warc_files.append(os.path.join(input_folder, entry_name))

    if not warc_files:
        print("No WARC files found in the specified folder.")
        return

    for warc_file_path in warc_files:
        try:
            process_pipeline(warc_file_path, output_folder, max_records_per_file)
        except Exception as e:
            print(f"Error processing file {warc_file_path}: {e}")

def combine_processed_outputs(output_folder="ProcessedOutput", combined_file="combined_processed_texts.json"):
    """Concatenate the contents of every ``*.json`` file in ``output_folder`` into ``combined_file``.

    Files are read as UTF-8 and their parsed contents are appended in
    ``os.listdir`` order. A file that cannot be read or parsed is reported
    and skipped. When the folder holds no JSON file a message is printed and
    no output is written.
    """
    json_paths = [
        os.path.join(output_folder, name)
        for name in os.listdir(output_folder)
        if name.endswith(".json")
    ]
    if not json_paths:
        print("No processed JSON files found in the output folder.")
        return

    merged = []
    for json_path in json_paths:
        try:
            with open(json_path, "r", encoding="utf-8") as handle:
                merged.extend(json.load(handle))
        except Exception as e:
            print(f"Error reading file {json_path}: {e}")

    with open(combined_file, "w", encoding="utf-8") as sink:
        json.dump(merged, sink, ensure_ascii=False, indent=4)

# Run the full pipeline: install the blocklist keywords, process every *.warc.gz file in
# "News" into "ProcessedOutput", then merge the per-file JSON outputs into a single file.
# These calls execute as soon as the cell runs.
load_ut1_blocklist()
process_all_warc_files(input_folder="News", output_folder="ProcessedOutput", max_records_per_file=1000)
combine_processed_outputs(output_folder="ProcessedOutput", combined_file="combined_processed_texts.json")

In [6]:
import brotli
from warcio.archiveiterator import ArchiveIterator
from warcio.bufferedreaders import BufferedReader
import os
import re
import json
import fasttext
from trafilatura import extract
from bs4 import BeautifulSoup
from datasketch import MinHash, MinHashLSH
from datatrove.pipeline.filters import (
    GopherQualityFilter,
    FineWebQualityFilter,
    C4QualityFilter,
    GopherRepetitionFilter,
)
from nltk.tokenize import word_tokenize
import requests
from emot.emo_unicode import UNICODE_EMO

# Zero-argument factory producing a Brotli decompressor object
def brotli_decompressor():
    """Return a newly constructed ``brotli.Decompressor``."""
    decoder = brotli.Decompressor()
    return decoder

# Patch warcio's BufferedReader: store the brotli_decompressor factory under the 'br'
# key of its DECOMPRESSORS table, replacing whatever entry (if any) was there before.
BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor

# Lightweight stand-in used to hand plain text to the DataTrove filters
class Document:
    """Wrap a piece of text so it can be read back through the ``text`` attribute."""

    def __init__(self, text):
        # The text is the only state kept on the instance.
        self.text = text

# Load the FastText model from a path relative to the current working directory.
# This happens as soon as the cell runs, so the file must already be present.
FASTTEXT_MODEL_PATH = "lid.176.bin"
language_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# Build one instance of each DataTrove filter using default constructor arguments;
# process_pipeline() applies them in the order gopher, fineweb, c4, repetition.
gopher_filter = GopherQualityFilter()
fineweb_filter = FineWebQualityFilter()
c4_filter = C4QualityFilter()
repetition_filter = GopherRepetitionFilter()

# UT1 blocklist: archive URL plus the two keyword lists that load_ut1_blocklist()
# rebinds and is_blocklisted() reads. Both lists start out empty.
UT1_BLOCKLIST_URL = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz"
ut1_text_keywords = []
ut1_link_keywords = []

def load_ut1_blocklist():
    """Populate the module-level blocklist keyword lists.

    The UT1 archive URL is contacted, but the archive is NOT parsed: the
    keyword lists are hard-coded placeholders that are installed only when the
    request succeeds. If the request fails, the error is printed and both
    lists are left empty, so no blocklist filtering takes place.

    Side effects:
        Rebinds the globals ``ut1_text_keywords`` and ``ut1_link_keywords``.
    """
    global ut1_text_keywords, ut1_link_keywords
    try:
        # stream=True stops after the response headers: the body is a large
        # tarball that is never read, so downloading it would be wasted work.
        # The timeout keeps a stalled server from hanging the whole pipeline,
        # and the context manager releases the connection.
        with requests.get(UT1_BLOCKLIST_URL, stream=True, timeout=30):
            pass
    except requests.RequestException as e:
        print(f"Error loading UT1 blocklist: {e}")
        ut1_text_keywords = []
        ut1_link_keywords = []
    else:
        # TODO: replace these placeholders with entries parsed from the archive.
        ut1_text_keywords = ["casino", "explicit", "ads"]
        ut1_link_keywords = ["example.com", "adwebsite.net", "brazzers"]

def is_blocklisted(content, source_url):
    """Report whether the text or its URL contains a blocklist entry.

    Matching is a case-insensitive substring test: ``content`` is checked
    against ``ut1_text_keywords`` and ``source_url`` (treated as empty when
    falsy) against ``ut1_link_keywords``.
    """
    haystack_text = content.lower()
    haystack_url = source_url.lower() if source_url else ""
    text_hit = any(term.lower() in haystack_text for term in ut1_text_keywords)
    # The URL is only inspected when the text produced no match.
    return text_hit or any(
        domain.lower() in haystack_url for domain in ut1_link_keywords
    )

def detect_language(text):
    """Detect the language of ``text`` using the FastText model.

    Newlines are replaced with spaces before prediction and only the first
    1000 characters of the stripped text are classified.

    Returns:
        A ``(language, confidence)`` tuple, where ``language`` is the top
        label with its ``__label__`` prefix removed and ``confidence`` is a
        plain ``float``. If the model returns no label at all,
        ``("unknown", 0.0)`` is returned instead of raising ``IndexError``.
    """
    sample = text.replace("\n", " ").strip()[:1000]
    labels, scores = language_model.predict(sample)
    if len(labels) == 0:
        # Nothing to index into; report a sentinel no caller treats as Arabic.
        return "unknown", 0.0
    language = labels[0].replace("__label__", "")
    # Convert the numpy scalar so downstream JSON output gets a builtin float.
    return language, float(scores[0])

from bs4 import BeautifulSoup

def clean_html(content):
    """
    Parse ``content`` as HTML, delete navigation-style elements, and return the remaining text.

    Removed, in this order: every <nav>, <header>, <footer>, <aside> and <menu> element;
    every <div> whose id is "navbar" or "new-primary-menu"; every <div> carrying the class
    "navbar" or "primary-menu". The surviving text nodes are joined with single spaces
    and the result is stripped of leading/trailing whitespace.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Each entry is one find_all() query; the queries run one after another so a later
    # pass only sees what earlier passes left in the tree.
    removal_passes = (
        {"name": ["nav", "header", "footer", "aside", "menu"]},
        {"name": "div", "id": "navbar"},
        {"name": "div", "id": "new-primary-menu"},
        {"name": "div", "class_": "navbar"},
        {"name": "div", "class_": "primary-menu"},
    )
    for selector in removal_passes:
        for element in soup.find_all(**selector):
            element.decompose()

    remaining_text = soup.get_text(separator=" ")
    return remaining_text.strip()

# Example usage:
# cleaned_text = clean_html(html_content)

def remove_non_arabic_text(text):
    """Keep only the lines of ``text`` that FastText labels as Arabic ("ar")."""
    kept_lines = []
    for line in text.split("\n"):
        # Each line is classified on its own; the confidence score is ignored.
        if detect_language(line)[0] == "ar":
            kept_lines.append(line)
    return "\n".join(kept_lines)

def has_excessive_newlines(text, threshold=0.3):
    """
    Tell whether ``text`` has more than ``threshold`` newlines per word.

    Words are whitespace-separated tokens. Text with no words at all is
    reported as excessive.
    """
    words = len(text.split())
    if not words:
        # Nothing but whitespace: flag it, which also avoids dividing by zero.
        return True
    newlines_per_word = text.count("\n") / words
    return newlines_per_word > threshold

def convert_emojis(text):
    """Replace every emoji listed in ``UNICODE_EMO`` with its textual description.

    The description is the mapped value with commas and colons turned into
    spaces and runs of whitespace collapsed to single spaces.
    """
    for symbol, raw_label in UNICODE_EMO.items():
        label_words = raw_label.replace(",", " ").replace(":", " ").split()
        text = text.replace(symbol, " ".join(label_words))
    return text

def normalize_text(text):
    """Strip the listed Arabic diacritics, map the alef variants أ/إ/آ to ا, and trim whitespace."""
    without_marks = re.sub(r"[ًٌٍَُِّْ]", "", text)
    # Fold each alef variant into the bare alef.
    for alef_variant in ("أ", "إ", "آ"):
        without_marks = without_marks.replace(alef_variant, "ا")
    return without_marks.strip()

def deduplicate_documents(data, threshold=0.8):
    """Drop records whose MinHash signature matches an earlier record in the LSH index.

    Each record's ``'text'`` is tokenized with ``word_tokenize`` and every
    token is fed into a fresh ``MinHash``. A record is kept, and indexed under
    its position in ``data``, only when querying the index returns no match.
    Returns the kept records in their original order.
    """
    index = MinHashLSH(threshold=threshold)
    survivors = []
    for position, entry in enumerate(data):
        signature = MinHash()
        for word in word_tokenize(entry['text']):
            signature.update(word.encode("utf-8"))
        if any(index.query(signature)):
            # An earlier record already matches this signature.
            continue
        index.insert(str(position), signature)
        survivors.append(entry)
    return survivors

def deduplicate_sentences(text):
    """Drop repeated lines and any line containing the boilerplate phrase.

    The first occurrence of each line is kept in its original position;
    lines containing "الرئيسية لحظة بلحظة" are then left out entirely.
    """
    already_seen = set()
    kept_lines = []
    for line in text.split("\n"):
        if line in already_seen:
            continue
        already_seen.add(line)
        if "الرئيسية لحظة بلحظة" not in line:
            kept_lines.append(line)
    return "\n".join(kept_lines)

def is_high_quality_text(text):
    """Accept text with at least four words and at most one inner newline per two words."""
    word_total = len(text.split())
    if word_total < 4:
        return False
    # Leading and trailing newlines are not counted.
    inner_newlines = text.strip().count("\n")
    return inner_newlines <= word_total * 0.5

def _filter_accepts(quality_filter, document):
    """Return True only when a DataTrove filter keeps ``document``.

    ``filter()`` may return a bare bool or a ``(bool, reason)`` tuple. A
    ``(False, reason)`` tuple is truthy, so the verdict has to be unpacked
    before it is tested; otherwise no document is ever rejected.
    """
    verdict = quality_filter.filter(document)
    if isinstance(verdict, tuple):
        verdict = verdict[0]
    return bool(verdict)


def process_pipeline(warc_file_path, output_folder="Output", max_records=1000):
    """Turn one WARC file into a JSON file of cleaned Arabic documents.

    Each ``response`` record is decoded as UTF-8 (undecodable bytes dropped),
    passed through Trafilatura, ``clean_html`` and
    ``clean_redundant_patterns``, and kept only if FastText labels it "ar"
    with confidence of at least 0.95. Non-Arabic lines are removed; texts
    with excessive newlines or a blocklist hit are discarded; the rest is
    emoji-converted, normalized, line-deduplicated, quality-checked and run
    through the four DataTrove filters. Reading stops once ``max_records``
    documents have been kept. The kept documents go through
    ``deduplicate_documents`` and are written to
    ``<output_folder>/processed_texts_news_3.json``.

    Returns:
        The path of the JSON file that was written.
    """
    os.makedirs(output_folder, exist_ok=True)

    processed_data = []
    total_records = 0
    # NOTE(review): these filters used to be no-ops because their tuple
    # verdicts were tested for truthiness. Now that they really reject, check
    # that their default settings (presumably tuned for English) suit Arabic.
    quality_filters = (gopher_filter, fineweb_filter, c4_filter, repetition_filter)

    with open(warc_file_path, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != "response":
                continue

            content = record.content_stream().read().decode("utf-8", errors="ignore")
            source_url = record.rec_headers.get_header("WARC-Target-URI")
            date = record.rec_headers.get_header("WARC-Date")

            # Extract meaningful text with Trafilatura
            extracted_text = extract(content)
            if not extracted_text:
                continue

            # Clean HTML content
            cleaned_text = clean_html(extracted_text)

            # Remove redundant patterns
            # NOTE(review): clean_redundant_patterns is not defined in this
            # cell; it must already exist in the session or this line raises
            # NameError.
            cleaned_text = clean_redundant_patterns(cleaned_text)

            # Detect primary language
            language, confidence = detect_language(cleaned_text)
            if language != "ar" or confidence < 0.95:
                continue

            # Remove non-Arabic text
            arabic_only_text = remove_non_arabic_text(cleaned_text)

            # Check for excessive newlines
            if has_excessive_newlines(arabic_only_text):
                print(f"Skipping source due to excessive newlines: {source_url}")
                continue

            # Blocklist filtering
            if is_blocklisted(arabic_only_text, source_url):
                continue

            # Normalize, remove emojis, and deduplicate sentences
            normalized_text = normalize_text(convert_emojis(arabic_only_text))
            deduplicated_text = deduplicate_sentences(normalized_text)

            # Check text quality
            if not is_high_quality_text(deduplicated_text):
                continue

            # Wrap text in a mock `Document` object
            document = Document(text=deduplicated_text)

            # Apply quality filters in order, stopping at the first rejection.
            # NOTE(review): a filter may rewrite document.text; the saved text
            # is still deduplicated_text. Confirm which one is intended.
            if not all(_filter_accepts(f, document) for f in quality_filters):
                continue

            # Add metadata
            metadata = {
                "date": date,
                "labels": {
                    "language": language,
                    "language_score": confidence,
                },
                "source": source_url,
                "token_count": len(deduplicated_text.split()),
            }

            processed_data.append({"text": deduplicated_text, "metadata": metadata})
            total_records += 1

            if total_records >= max_records:
                break

    # Deduplicate across documents
    processed_data = deduplicate_documents(processed_data)

    # Save processed data to a JSON file
    output_file_path = os.path.join(output_folder, "processed_texts_news_3.json")
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(processed_data, json_file, ensure_ascii=False, indent=4)

    # total_records is the count before cross-document deduplication.
    print(f"Processed {total_records} Arabic texts successfully.")
    return output_file_path

# Example usage: install the blocklist keywords, process a single WARC file with the
# default output folder and record cap, then report where the JSON was written.
# These statements execute as soon as the cell runs.
load_ut1_blocklist()
warc_file_path = "News/crawled_output.warc.gz"
output_file = process_pipeline(warc_file_path)
print(f"Output saved to {output_file}")

Skipping source due to excessive newlines: https://sabq.org/moment-by-moment
Skipping source due to excessive newlines: https://sabq.org/saudia
Skipping source due to excessive newlines: https://sabq.org/world
Skipping source due to excessive newlines: https://sabq.org/mylife
Skipping source due to excessive newlines: https://sabq.org/stations
Skipping source due to excessive newlines: https://sabq.org/sports
Skipping source due to excessive newlines: https://sabq.org/tourism
Skipping source due to excessive newlines: https://sabq.org/business
Skipping source due to excessive newlines: https://sabq.org/technology
Skipping source due to excessive newlines: https://sabq.org/cars
Skipping source due to excessive newlines: https://sabq.org/media
Skipping source due to excessive newlines: https://sabq.org/articles
Skipping source due to excessive newlines: http://sabq.org/collection/latest-news
Skipping source due to excessive newlines: http://sabq.org/author/ly-dlk
Skipping source due to e

In [2]:
import brotli
from warcio.archiveiterator import ArchiveIterator
from warcio.bufferedreaders import BufferedReader
import os
import re
import json
import fasttext
from trafilatura import extract
from bs4 import BeautifulSoup
from datasketch import MinHash, MinHashLSH
from datatrove.pipeline.filters import (
    GopherQualityFilter,
    FineWebQualityFilter,
    C4QualityFilter,
    GopherRepetitionFilter,
)
from nltk.tokenize import word_tokenize
import requests
from emot.emo_unicode import UNICODE_EMO

# Zero-argument factory producing a Brotli decompressor object
def brotli_decompressor():
    """Return a newly constructed ``brotli.Decompressor``."""
    decoder = brotli.Decompressor()
    return decoder

# Patch warcio's BufferedReader: store the brotli_decompressor factory under the 'br'
# key of its DECOMPRESSORS table, replacing whatever entry (if any) was there before.
BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor

# Lightweight stand-in used to hand plain text to the DataTrove filters
class Document:
    """Wrap a piece of text so it can be read back through the ``text`` attribute."""

    def __init__(self, text):
        # The text is the only state kept on the instance.
        self.text = text

# Load the FastText model from a path relative to the current working directory.
# This happens as soon as the cell runs, so the file must already be present.
FASTTEXT_MODEL_PATH = "lid.176.bin"
language_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# Build one instance of each DataTrove filter using default constructor arguments;
# process_pipeline() applies them in the order gopher, fineweb, c4, repetition.
gopher_filter = GopherQualityFilter()
fineweb_filter = FineWebQualityFilter()
c4_filter = C4QualityFilter()
repetition_filter = GopherRepetitionFilter()

# UT1 blocklist: archive URL plus the two keyword lists that load_ut1_blocklist()
# rebinds and is_blocklisted() reads. Both lists start out empty.
UT1_BLOCKLIST_URL = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz"
ut1_text_keywords = []
ut1_link_keywords = []

def load_ut1_blocklist():
    """Populate the module-level blocklist keyword lists.

    The UT1 archive URL is contacted, but the archive is NOT parsed: the
    keyword lists are hard-coded placeholders that are installed only when the
    request succeeds. If the request fails, the error is printed and both
    lists are left empty, so no blocklist filtering takes place.

    Side effects:
        Rebinds the globals ``ut1_text_keywords`` and ``ut1_link_keywords``.
    """
    global ut1_text_keywords, ut1_link_keywords
    try:
        # stream=True stops after the response headers: the body is a large
        # tarball that is never read, so downloading it would be wasted work.
        # The timeout keeps a stalled server from hanging the whole pipeline,
        # and the context manager releases the connection.
        with requests.get(UT1_BLOCKLIST_URL, stream=True, timeout=30):
            pass
    except requests.RequestException as e:
        print(f"Error loading UT1 blocklist: {e}")
        ut1_text_keywords = []
        ut1_link_keywords = []
    else:
        # TODO: replace these placeholders with entries parsed from the archive.
        ut1_text_keywords = ["casino", "explicit", "ads"]
        ut1_link_keywords = ["example.com", "adwebsite.net", "brazzers"]

def is_blocklisted(content, source_url):
    """Report whether the text or its URL contains a blocklist entry.

    Matching is a case-insensitive substring test: ``content`` is checked
    against ``ut1_text_keywords`` and ``source_url`` (treated as empty when
    falsy) against ``ut1_link_keywords``.
    """
    haystack_text = content.lower()
    haystack_url = source_url.lower() if source_url else ""
    text_hit = any(term.lower() in haystack_text for term in ut1_text_keywords)
    # The URL is only inspected when the text produced no match.
    return text_hit or any(
        domain.lower() in haystack_url for domain in ut1_link_keywords
    )

def detect_language(text):
    """Detect the language of ``text`` using the FastText model.

    Newlines are replaced with spaces before prediction and only the first
    1000 characters of the stripped text are classified.

    Returns:
        A ``(language, confidence)`` tuple, where ``language`` is the top
        label with its ``__label__`` prefix removed and ``confidence`` is a
        plain ``float``. If the model returns no label at all,
        ``("unknown", 0.0)`` is returned instead of raising ``IndexError``.
    """
    sample = text.replace("\n", " ").strip()[:1000]
    labels, scores = language_model.predict(sample)
    if len(labels) == 0:
        # Nothing to index into; report a sentinel no caller treats as Arabic.
        return "unknown", 0.0
    language = labels[0].replace("__label__", "")
    # Convert the numpy scalar so downstream JSON output gets a builtin float.
    return language, float(scores[0])

def clean_html(content):
    """Parse ``content`` as HTML, delete <header>/<footer>/<nav>/<aside> elements, and return the text.

    The surviving text nodes are joined with single spaces and the result is
    stripped of leading/trailing whitespace.
    """
    parsed = BeautifulSoup(content, "html.parser")
    boilerplate = parsed.find_all(["header", "footer", "nav", "aside"])
    for element in boilerplate:
        element.decompose()
    text_only = parsed.get_text(separator=" ")
    return text_only.strip()

def remove_non_arabic_text(text):
    """Keep only the lines of ``text`` that FastText labels as Arabic ("ar")."""
    kept_lines = []
    for line in text.split("\n"):
        # Each line is classified on its own; the confidence score is ignored.
        if detect_language(line)[0] == "ar":
            kept_lines.append(line)
    return "\n".join(kept_lines)

def has_excessive_newlines(text, threshold=0.5):
    """Tell whether the newline count of ``text`` exceeds ``threshold`` times its word count.

    Words are whitespace-separated tokens. Completely empty text is not
    considered excessive, because zero newlines do not exceed zero.
    """
    allowed_newlines = len(text.split()) * threshold
    return text.count("\n") > allowed_newlines

def convert_emojis(text):
    """Replace every emoji listed in ``UNICODE_EMO`` with its textual description.

    The description is the mapped value with commas and colons turned into
    spaces and runs of whitespace collapsed to single spaces.
    """
    for symbol, raw_label in UNICODE_EMO.items():
        label_words = raw_label.replace(",", " ").replace(":", " ").split()
        text = text.replace(symbol, " ".join(label_words))
    return text

def normalize_text(text):
    """Strip the listed Arabic diacritics, map the alef variants أ/إ/آ to ا, and trim whitespace."""
    without_marks = re.sub(r"[ًٌٍَُِّْ]", "", text)
    # Fold each alef variant into the bare alef.
    for alef_variant in ("أ", "إ", "آ"):
        without_marks = without_marks.replace(alef_variant, "ا")
    return without_marks.strip()

def deduplicate_documents(data, threshold=0.8):
    """Drop records whose MinHash signature matches an earlier record in the LSH index.

    Each record's ``'text'`` is tokenized with ``word_tokenize`` and every
    token is fed into a fresh ``MinHash``. A record is kept, and indexed under
    its position in ``data``, only when querying the index returns no match.
    Returns the kept records in their original order.
    """
    index = MinHashLSH(threshold=threshold)
    survivors = []
    for position, entry in enumerate(data):
        signature = MinHash()
        for word in word_tokenize(entry['text']):
            signature.update(word.encode("utf-8"))
        if any(index.query(signature)):
            # An earlier record already matches this signature.
            continue
        index.insert(str(position), signature)
        survivors.append(entry)
    return survivors

def deduplicate_sentences(text):
    """Drop repeated lines and any line containing the boilerplate phrase.

    The first occurrence of each line is kept in its original position;
    lines containing "الرئيسية لحظة بلحظة" are then left out entirely.
    """
    already_seen = set()
    kept_lines = []
    for line in text.split("\n"):
        if line in already_seen:
            continue
        already_seen.add(line)
        if "الرئيسية لحظة بلحظة" not in line:
            kept_lines.append(line)
    return "\n".join(kept_lines)

def is_high_quality_text(text):
    """Accept text that passes the length, newline-density and boilerplate checks.

    Rejected: fewer than four words; more inner newlines than half the word
    count; more than three occurrences of the phrase "لحظة بلحظة".
    """
    word_total = len(text.split())
    if word_total < 4:
        return False
    # Leading and trailing newlines are not counted.
    if text.strip().count("\n") > word_total * 0.5:
        return False
    return text.count("لحظة بلحظة") <= 3

def _filter_accepts(quality_filter, document):
    """Return True only when a DataTrove filter keeps ``document``.

    ``filter()`` may return a bare bool or a ``(bool, reason)`` tuple. A
    ``(False, reason)`` tuple is truthy, so the verdict has to be unpacked
    before it is tested; otherwise no document is ever rejected.
    """
    verdict = quality_filter.filter(document)
    if isinstance(verdict, tuple):
        verdict = verdict[0]
    return bool(verdict)


def process_pipeline(warc_file_path, output_folder="Output", max_records=1000):
    """Turn one WARC file into a JSON file of cleaned Arabic documents.

    Each ``response`` record is decoded as UTF-8 (undecodable bytes dropped),
    passed through Trafilatura and ``clean_html``, and kept only if FastText
    labels it "ar" with confidence of at least 0.95. Non-Arabic lines are
    removed; texts with excessive newlines or a blocklist hit are discarded;
    the rest is emoji-converted, normalized, line-deduplicated,
    quality-checked and run through the four DataTrove filters. Reading stops
    once ``max_records`` documents have been kept. The kept documents go
    through ``deduplicate_documents`` and are written to
    ``<output_folder>/processed_texts_news_1.json``.

    Returns:
        The path of the JSON file that was written.
    """
    os.makedirs(output_folder, exist_ok=True)

    processed_data = []
    total_records = 0
    # NOTE(review): these filters used to be no-ops because their tuple
    # verdicts were tested for truthiness. Now that they really reject, check
    # that their default settings (presumably tuned for English) suit Arabic.
    quality_filters = (gopher_filter, fineweb_filter, c4_filter, repetition_filter)

    with open(warc_file_path, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != "response":
                continue

            content = record.content_stream().read().decode("utf-8", errors="ignore")
            source_url = record.rec_headers.get_header("WARC-Target-URI")
            date = record.rec_headers.get_header("WARC-Date")

            # Extract meaningful text with Trafilatura
            extracted_text = extract(content)
            if not extracted_text:
                continue

            # Clean HTML content
            cleaned_text = clean_html(extracted_text)

            # Detect primary language
            language, confidence = detect_language(cleaned_text)
            if language != "ar" or confidence < 0.95:
                continue

            # Remove non-Arabic text
            arabic_only_text = remove_non_arabic_text(cleaned_text)

            # Check for excessive newlines
            if has_excessive_newlines(arabic_only_text):
                print(f"Skipping source due to excessive newlines: {source_url}")
                continue

            # Blocklist filtering
            if is_blocklisted(arabic_only_text, source_url):
                continue

            # Normalize, remove emojis, and deduplicate sentences
            normalized_text = normalize_text(convert_emojis(arabic_only_text))
            deduplicated_text = deduplicate_sentences(normalized_text)

            # Check text quality
            if not is_high_quality_text(deduplicated_text):
                continue

            # Wrap text in a mock `Document` object
            document = Document(text=deduplicated_text)

            # Apply quality filters in order, stopping at the first rejection.
            # NOTE(review): a filter may rewrite document.text; the saved text
            # is still deduplicated_text. Confirm which one is intended.
            if not all(_filter_accepts(f, document) for f in quality_filters):
                continue

            # Add metadata
            metadata = {
                "date": date,
                "labels": {
                    "language": language,
                    "language_score": confidence,
                },
                "source": source_url,
                "token_count": len(deduplicated_text.split()),
            }

            processed_data.append({"text": deduplicated_text, "metadata": metadata})
            total_records += 1

            if total_records >= max_records:
                break

    # Deduplicate across documents
    processed_data = deduplicate_documents(processed_data)

    # Save processed data to a JSON file
    output_file_path = os.path.join(output_folder, "processed_texts_news_1.json")
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(processed_data, json_file, ensure_ascii=False, indent=4)

    # total_records is the count before cross-document deduplication.
    print(f"Processed {total_records} Arabic texts successfully.")
    return output_file_path

# Example usage: install the blocklist keywords, process a single WARC file with the
# default output folder and record cap, then report where the JSON was written.
# These statements execute as soon as the cell runs.
load_ut1_blocklist()
warc_file_path = "News/crawled_output.warc.gz"
output_file = process_pipeline(warc_file_path)
print(f"Output saved to {output_file}")

Skipping source due to excessive newlines: https://sabq.org/moment-by-moment
Skipping source due to excessive newlines: https://sabq.org/saudia
Skipping source due to excessive newlines: https://sabq.org/world
Skipping source due to excessive newlines: https://sabq.org/mylife
Skipping source due to excessive newlines: https://sabq.org/stations
Skipping source due to excessive newlines: https://sabq.org/sports
Skipping source due to excessive newlines: https://sabq.org/tourism
Skipping source due to excessive newlines: https://sabq.org/business
Skipping source due to excessive newlines: https://sabq.org/technology
Skipping source due to excessive newlines: https://sabq.org/cars
Skipping source due to excessive newlines: https://sabq.org/media
Skipping source due to excessive newlines: https://sabq.org/articles
Skipping source due to excessive newlines: http://sabq.org/collection/latest-news
Skipping source due to excessive newlines: http://sabq.org/author/ly-dlk
Skipping source due to e