In [1]:
import pandas as pd
import re
import csv
import numpy as np


pd.set_option("display.max_colwidth", None)


def contains_dutch(text):
    dutch_pattern = re.compile("[a-zA-ZÀ-ÿ]+")
    return bool(dutch_pattern.search(text))


def contains_url(text):
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.search(text))


dutch_stopwords = [
    "Uh",
    "Euh",
    "Uhmm",
    "Euhm",
    "Wow",
    "Wauw",
    "Hé",
    "Oh",
    "O",
    "Ach",
    "Nou",
    "Tja",
]


def contains_noise(text):
    # Check for short length
    if len(text.split()) < 4:
        return True

    # Check for repetitive characters
    if re.search(r"(.)\1{2,}", text):
        return True

    # Dutch alphabet includes characters like 'ë', 'ü', 'é', 'ij', etc.
    if not re.search(r"[a-zA-ZÀ-ÿ]", text) and "ij" not in text.lower():
        return True

    # Check for excessive punctuation
    if re.search(r"[!?.]{4,}", text):
        return True

    # Check for excessive numbers
    if re.search(r"\d{5,}", text):
        return True

    # Check for URLs
    if re.search(r"http\S+|www.\S+", text):
        return True

    # Check for common stopwords
    if any(word in text.lower().split() for word in dutch_stopwords):
        return True

    return False


# Exclude non-ASCII characters while preserving Dutch characters like 'ij', 'é', 'ë', etc.
def check_non_ascii_tweets(df):
    non_ascii_df = df[
        df["tweet_text"].apply(
            lambda x: any(
                (
                    ord(char) > 127
                    and not "a" <= char.lower() <= "z"
                    and not "à" <= char.lower() <= "ÿ"
                    and char.lower() not in ["ij", "é", "ë", "è", "â", "ê", "ô"]
                )
                for char in x
            )
        )
    ]
    return non_ascii_df


def data_exploration(train_df):
    print(train_df.shape)
    class_distribution = train_df["class_label"].value_counts(normalize=True) * 100
    print("Class Distribution:")
    print(class_distribution)

    train_df["text_length"] = train_df["tweet_text"].apply(len)
    missing_values = train_df.isnull().sum()
    print("Missing values:")
    print(missing_values)
    duplicate_rows = train_df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_rows}")

    arabic_tweets = train_df[train_df["tweet_text"].apply(contains_dutch)]
    print(f"Number of Dutch tweets: {len(arabic_tweets)}")

    tweets_with_url = train_df[train_df["tweet_text"].apply(contains_url)]
    print(f"Number of tweets with URL: {len(tweets_with_url)}")

    tweets_with_noise = train_df[train_df["tweet_text"].apply(contains_noise)]
    print(f"Number of tweets with noise: {len(tweets_with_noise)}")

    non_ascii_df = check_non_ascii_tweets(train_df)
    print(f"Number of tweets with non-ASCII characters: {len(non_ascii_df)}\n")
    print("Examples:")
    print(non_ascii_df["tweet_text"].head())

    return


def load(filename):
    df = pd.read_csv(
        filename,
        sep="\t",
        encoding="utf-8",
        names=["Sentence_id", "tweet_text", "class_label"],
        quoting=csv.QUOTE_NONE,
        skiprows=1,
        dtype={"tweet_id": "Int64"},
    )
    return df


train_df = load(
    "../data/raw_clef2024-checkthat-lab-main-task1-data/CT24_checkworthy_english/CT24_checkworthy_english_train.tsv"
)
data_exploration(train_df)
train_df.head()

(22501, 3)
Class Distribution:
class_label
No     75.943291
Yes    24.056709
Name: proportion, dtype: float64
Missing values:
Sentence_id    0
tweet_text     0
class_label    0
text_length    0
dtype: int64
Number of duplicate rows: 0
Number of Dutch tweets: 22501
Number of tweets with URL: 0
Number of tweets with noise: 680
Number of tweets with non-ASCII characters: 49

Examples:
389                                                                                                                                                                                                                 It didn't look good the first two years when we had a Democratic president and Ã‚ Democratic Congress.
620                                                                                                                                                           "I just don't think it's the role of the United States to walk into a country and say, Ã¢â‚¬Å“we do it this way, so should you.Ã¢â‚¬Â I think

Unnamed: 0,Sentence_id,tweet_text,class_label,text_length
0,30313,And so I know that this campaign has caused some questioning and worries on the part of many leaders across the globe.,No,118
1,19099,"""Now, let's balance the budget and protect Medicare, Medicaid, education and the environment.""",No,94
2,33964,I'd like to mention one thing.,No,30
3,16871,I must remind him the Democrats have controlled the Congress for the last twenty-two years and they wrote all the tax bills.,Yes,124
4,13150,"""And to take a chance uh - now be - and not make every effort that we could make to provide for some control over these weapons, I think would be a great mistake.""",No,163


* Handling 'ij' as a Special Case & other Special Cases
* Handling html coding

## Preprocessing


    Remove URLs: remove_urls
    Replace Repeated Special Characters: replace_repeated_characters
    Replace Commas and Double Quotes: replace_commas_and_quotes
    Remove Ambiguous Words: remove_ambiguous_words
    Remove Non-ASCII Characters: remove_non_ascii
    Clean and normalize special cases like time formats ("20u50") and currency symbols ("€")
    

In [2]:
import regex as re
import html
from sklearn.preprocessing import LabelEncoder


def remove_urls(text):
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text)
    return text


def clean_corrupted_text(text):
    text = re.sub(r"(@\w+\s*)+", "@<USER> ", text)

    # Fix URLs
    text = re.sub(r"h ps://", "https://", text)

    # Replace corrupted characters or remove them
    text = text.replace("Bes el op  ijd!", "Bestel op tijd!")

    return text


def remove_non_ascii(text):
    # Remove non-ASCII characters except Dutch special characters
    # Keep Dutch special characters while removing other non-ASCII characters
    text = re.sub(
        r'[^\x00-\x7F]|[^\sa-zA-Z0-9#@<>\[\]\(\){}\-_=+\|:;"\',\./\?`~!\$%^&\*]',
        " ",
        text,
    )
    return text.strip()


def replace_repeated_characters(text):
    # Replace repeated special characters with a single occurrence
    text = re.sub(r"([,!?]){2,}", r"\1", text)
    text = re.sub(r"\.{4,}", "...", text)
    return text


def replace_commas_and_quotes(text):
    # Replace special characters like ---> and <--- with spaces
    text = re.sub(r"--->|<---", " ", text)
    text = re.sub(r"\[.*?\]", ".", text)  # Remove [] and its content
    text = re.sub(r"\s+\.", ".", text)  # Remove extra spaces in front of the dot
    text = text.replace("/", "").replace("  ", " ").strip()  # Remove / dash

    # Replace multiple double quotes with a single occurrence
    text = re.sub(r'"{2,}', '"', text)
    return text


def remove_ambiguous_words(text):
    # Remove specific prefixes
    text = re.sub(r"LIVEBLOG \|", "", text)
    text = re.sub(r"LIVE \|", "", text)
    text = re.sub(r"LIVE", "", text)
    text = re.sub(r"Live:", "", text)
    text = re.sub(r"Live,", "", text)

    # Remove punctuation after 'Live'
    text = re.sub(r"Live[^\s]*", "Live", text)

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    for word in dutch_stopwords:
        # Remove the ambiguous word and fix spaces
        text = re.sub(r"\b{}\b".format(word), "", text, flags=re.IGNORECASE).strip()
        if word.lower() != word:
            # Remove commas at the start of the text or after spaces and double quotes at the start
            text = re.sub(r"^,\s*", "", text)
            text = re.sub(r'^"\s*,\s*', '" ', text)
            # Remove extra spaces at the start after double quote
            text = re.sub(r'"(\s+)', '"', text)
            text = re.sub(r'(?<=^"|^\s)(\w+)', lambda x: x.group(1).capitalize(), text)
        text = re.sub(r"\s+", " ", text).strip()  # Fix extra spaces

    return text


def replace_html_entities(text):
    # Replace common HTML entities
    text = html.unescape(text)

    # Replace special characters
    replacements = {
        "&lt;": "<",
        "&gt;": ">",
        "&amp;": "&",
        "&quot;": '"',
        "&apos;": "'",
        "&#39;": "'",
        "&#34;": '"',
        "&#60;": "<",
        "&#62;": ">",
        "&#38;": "&",
    }

    for entity, char in replacements.items():
        text = text.replace(entity, char)
    return text


def clean_special_cases(text):
    text = re.sub(r"’", '" ', text)
    text = re.sub(r"“", '" ', text)
    text = re.sub(r"”", '" ', text)

    # Replace euro symbol with text and format amount
    text = re.sub(r"€(\d+)", r"euro \1", text)
    text = re.sub(r"€(\d+),(\d+)", r"euro \1.\2", text)

    # Translate time format to 24-hour format
    time_pattern = re.compile(r"(\d{1,2})u(\d{2})")
    text = time_pattern.sub(r"\1:\2", text)

    return text


# Function to extract hashtags from tweet text
def extract_hashtags(text):
    hashtags = re.findall(r"#(\w+)", text)
    return " ".join(hashtags)


# Function to extract mentions from tweet text
def extract_mentions(text):
    mentions = re.findall(r"@(\w+)", text)
    return " ".join(mentions)


def remove_extra_spaces(text):
    # Remove additional white spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def preprocessing(train_df):
    train_df["hashtags"] = train_df["tweet_text"].apply(extract_hashtags)
    train_df["mentions"] = train_df["tweet_text"].apply(extract_mentions)

    train_df["tweet_text"] = train_df["tweet_text"].apply(clean_corrupted_text)
    train_df["tweet_text"] = train_df["tweet_text"].apply(remove_urls)
    train_df["tweet_text"] = train_df["tweet_text"].apply(replace_html_entities)

    train_df["tweet_text"] = train_df["tweet_text"].apply(replace_commas_and_quotes)
    train_df["tweet_text"] = train_df["tweet_text"].apply(replace_repeated_characters)
    train_df["tweet_text"] = train_df["tweet_text"].apply(clean_special_cases)

    train_df["tweet_text"] = train_df["tweet_text"].apply(remove_ambiguous_words)
    train_df["tweet_text"] = train_df["tweet_text"].apply(remove_non_ascii)
    train_df["tweet_text"] = train_df["tweet_text"].apply(remove_extra_spaces)

    # Calculate text length
    train_df["text_length"] = train_df["tweet_text"].apply(len)

    # Categorize text length
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf]
    labels = [
        "0-10",
        "11-20",
        "21-30",
        "31-40",
        "41-50",
        "51-60",
        "61-70",
        "71-80",
        "81-90",
        "91-100",
        "100+",
    ]
    train_df["text_length_category"] = pd.cut(
        train_df["text_length"], bins=bins, labels=labels
    )

    # Label encode class label
    label_encoder = LabelEncoder()
    train_df["class_label_encoded"] = label_encoder.fit_transform(
        train_df["class_label"]
    )

    train_df = train_df.drop(columns=["text_length"])
    # data_exploration(train_df)
    return train_df


preprocessing(train_df)
train_df.head(503)

Unnamed: 0,Sentence_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded
0,30313,And so I know that this campaign has caused some questioning and worries on the part of many leaders across the globe.,No,118,,,100+,0
1,19099,"""Now, let's balance the budget and protect Medicare, Medicaid, education and the environment.""",No,94,,,91-100,0
2,33964,I'd like to mention one thing.,No,30,,,21-30,0
3,16871,I must remind him the Democrats have controlled the Congress for the last twenty-two years and they wrote all the tax bills.,Yes,124,,,100+,1
4,13150,"""And to take a chance - now be - and not make every effort that we could make to provide for some control over these weapons, I think would be a great mistake.""",No,160,,,100+,0
...,...,...,...,...,...,...,...,...
498,32854,"""You know, children listen to what is being said.""",No,50,,,41-50,0
499,12056,You can't carry a gun into a school.,No,36,,,31-40,0
500,312,"""I don't think there's any question about that, and I resent it.""",No,65,,,61-70,0
501,27238,"""Now, Governor Romney has taken a different approach throughout this campaign.""",No,79,,,71-80,0


## Feature engineering
  * Add Frequency of Hashtags
  * Sentiment Analysis of Hashtags
  * Topic Modeling with LDA

In [3]:
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


# Function to count frequency of hashtags
def count_hashtags_frequency(text):
    hashtags = re.findall(r"#(\w+)", text)
    return len(hashtags)


# Function to analyze sentiment of hashtags
def analyze_hashtag_sentiment(text):
    hashtags = re.findall(r"#(\w+)", text)
    if hashtags:
        sentiment_scores = [
            TextBlob(hashtag).sentiment.polarity for hashtag in hashtags
        ]
        avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
        return avg_sentiment
    else:
        return 0


# Function for topic modeling with LDA
def topic_modeling_with_lda(texts):
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
    dtm = vectorizer.fit_transform(texts)

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    topics = lda.transform(dtm)

    return topics.argmax(axis=1)


def add_additional_features(train_df):
    # Count frequency of hashtags
    train_df["hashtags_frequency"] = train_df["tweet_text"].apply(
        count_hashtags_frequency
    )

    # Analyze sentiment of hashtags
    train_df["hashtags_sentiment"] = train_df["tweet_text"].apply(
        analyze_hashtag_sentiment
    )

    # Topic modeling with LDA
    train_df["hashtags_topics"] = 0

    return


add_additional_features(train_df)
train_df.head(4)

Unnamed: 0,Sentence_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,30313,And so I know that this campaign has caused some questioning and worries on the part of many leaders across the globe.,No,118,,,100+,0,0,0.0,0
1,19099,"""Now, let's balance the budget and protect Medicare, Medicaid, education and the environment.""",No,94,,,91-100,0,0,0.0,0
2,33964,I'd like to mention one thing.,No,30,,,21-30,0,0,0.0,0
3,16871,I must remind him the Democrats have controlled the Congress for the last twenty-two years and they wrote all the tax bills.,Yes,124,,,100+,1,0,0.0,0


### Preprocessing dev and dev_test and saving

In [5]:
import csv


def preprocess_dev_data(filename):
    df = load(filename)
    preprocessing(df)
    add_additional_features(df)
    return df


def save_processed_dev_data(df, filepath):
    df.to_csv(filepath, sep="\t", index=False, quoting=csv.QUOTE_NONE)


#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev.tsv'
dev_df = preprocess_dev_data(
    "../data/raw_clef2024-checkthat-lab-main-task1-data/CT24_checkworthy_english/CT24_checkworthy_english_dev.tsv"
)
#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev-test.tsv'
dev_test_df = preprocess_dev_data(
    "../data/raw_clef2024-checkthat-lab-main-task1-data/CT24_checkworthy_english/CT24_checkworthy_english_dev-test.tsv"
)

save_processed_dev_data(
    dev_df, "../data/processed/processed_CT24_checkworthy_english/processed_dev.tsv"
)
save_processed_dev_data(
    dev_test_df,
    "../data/processed/processed_CT24_checkworthy_english/processed_dev_test.tsv",
)
save_processed_dev_data(
    train_df, "../data/processed/processed_CT24_checkworthy_english/processed_train.tsv"
)