In [125]:
import pandas as pd
import re
import csv
import numpy as np


pd.set_option('display.max_colwidth', None)

def contains_dutch(text):
    dutch_pattern = re.compile('[a-zA-ZÀ-ÿ]+')
    return bool(dutch_pattern.search(text))


def contains_url(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return bool(url_pattern.search(text))

dutch_stopwords = ["Uh", "Euh", "Uhmm", "Euhm", "Wow", "Wauw", "Hé", "Oh", "O", "Ach","Nou", "Tja"]

def contains_noise(text):
    # Check for short length
    if len(text.split()) < 4:
        return True

    # Check for repetitive characters
    if re.search(r'(.)\1{2,}', text):
        return True

    # Dutch alphabet includes characters like 'ë', 'ü', 'é', 'ij', etc.
    if not re.search(r'[a-zA-ZÀ-ÿ]', text) and 'ij' not in text.lower():
        return True

    # Check for excessive punctuation
    if re.search(r'[!?.]{4,}', text):
        return True

    # Check for excessive numbers
    if re.search(r'\d{5,}', text):
        return True

    # Check for URLs
    if re.search(r'http\S+|www.\S+', text):
        return True

    # Check for common stopwords
    if any(word in text.lower().split() for word in dutch_stopwords):
        return True

    return False


# Exclude non-ASCII characters while preserving Dutch characters like 'ij', 'é', 'ë', etc.
def check_non_ascii_tweets(df):
    non_ascii_df = df[df['tweet_text'].apply(lambda x: any((ord(char) > 127 and not 'a' <= char.lower() <= 'z' and not 'à' <= char.lower() <= 'ÿ' and char.lower() not in ['ij', 'é', 'ë', 'è', 'â', 'ê', 'ô']) for char in x))]
    return non_ascii_df




def data_exploration(train_df):
  print (train_df.shape)
  class_distribution = train_df['class_label'].value_counts(normalize=True) * 100
  print("Class Distribution:")
  print(class_distribution)

  train_df['text_length'] = train_df['tweet_text'].apply(len)
  missing_values = train_df.isnull().sum()
  print("Missing values:")
  print(missing_values)
  duplicate_rows = train_df.duplicated().sum()
  print(f"Number of duplicate rows: {duplicate_rows}")

  arabic_tweets = train_df[train_df['tweet_text'].apply(contains_dutch)]
  print(f"Number of Dutch tweets: {len(arabic_tweets)}")

  tweets_with_url = train_df[train_df['tweet_text'].apply(contains_url)]
  print(f"Number of tweets with URL: {len(tweets_with_url)}")

  tweets_with_noise = train_df[train_df['tweet_text'].apply(contains_noise)]
  print(f"Number of tweets with noise: {len(tweets_with_noise)}")

  non_ascii_df = check_non_ascii_tweets(train_df)
  print(f"Number of tweets with non-ASCII characters: {len(non_ascii_df)}\n")
  print("Examples:")
  print(non_ascii_df['tweet_text'].head())

  return

def load(filename):
    df = pd.read_csv(filename, sep='\t', encoding='utf-8', names=['tweet_id', 'tweet_url', 'tweet_text', 'class_label'], quoting=csv.QUOTE_NONE, skiprows=1, dtype={'tweet_id': 'Int64'})
    df.drop(columns=['tweet_url'], inplace=True)
    return df



train_df = load('CT24_checkworthy_dutch/CT24_checkworthy_dutch_train.tsv')
data_exploration(train_df)
train_df.head()


(995, 3)
Class Distribution:
class_label
No     59.296482
Yes    40.703518
Name: proportion, dtype: float64
Missing values:
tweet_id       0
tweet_text     0
class_label    0
text_length    0
dtype: int64
Number of duplicate rows: 0
Number of Dutch tweets: 995
Number of tweets with URL: 564
Number of tweets with noise: 654
Number of tweets with non-ASCII characters: 185

Examples:
7                                                              Hoi Lilian. Jouw conclusie klopt niet. Conclusie moet zijn dat ten onrechte #influenza en #covid19 ten onrechte juist als even gevaarlijk gestempeld worden. ⬇️ https://t.co/NCrS6gWEZ6 ⬇️ https://t.co/qZe60ZKKQ0 https://t.co/owags8uJqh
9     Dit, we zijn fucked: #GGD en #RIVM testen liever niet ➡️ want dan blijft het aantal vastgestelde zieken lekker laag  ➡️ test je positief? Vooral niet doorvertellen. Zeker niet de mensen waarschuwen waarmee je contact hebt gehad, want dan willen die ook getest worden. https://t.co/X34AliVzAe
22                  

Unnamed: 0,tweet_id,tweet_text,class_label,text_length
0,1240603314980392966,"#SamenTegenCorona: applaus voor zorghelden, huisarts wordt dj en Aurelie (10) maakt pakkende video https://t.co/3b8Ksws7jF https://t.co/Riyx8Yo3iU",No,146
1,1238094774453833730,Kabinet ondersteunt ondernemer in Corona-crisis https://t.co/OhdYpHSezS #coronavirusNederland,No,93
2,1238154603583156225,Heropening van het @airbornemuseum in #Oosterbeek uitgesteld #Airborne #Covid19 #coronavirus https://t.co/dK6vP1ozay https://t.co/NoRWJKnhmw,No,140
3,1239152526026518534,Aantal restaurants in #groningen nu op slot. #blijfthuis,Yes,56
4,1244341963480076290,Nederland heeft het niet in de hand. Onbetrouwbaar #RIVM weet nog steeds niet waar het mee bezig is en politiek grijpt niet in. #COVID2019NL #Coronavirusnl #RIVM RIVM verwacht toch meer #coronapatiënten op intensive care https://t.co/YJcwMbc4ko,Yes,244


* Handling 'ij' as a Special Case & other Special Cases
* Handling html coding

## Preprocessing


    Remove URLs: remove_urls
    Replace Repeated Special Characters: replace_repeated_characters
    Replace Commas and Double Quotes: replace_commas_and_quotes
    Remove Ambiguous Words: remove_ambiguous_words
    Remove Non-ASCII Characters: remove_non_ascii
    Clean and normalize special cases like time formats ("20u50") and currency symbols ("€")
    

In [126]:
import regex as re
import html
from sklearn.preprocessing import LabelEncoder


def remove_urls(text):
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    return text

def clean_corrupted_text(text):
    text = re.sub(r'(@\w+\s*)+', '@<USER> ', text)

    # Fix URLs
    text = re.sub(r'h ps://', 'https://', text)

    # Replace corrupted characters or remove them
    text = text.replace('Bes el op  ijd!', 'Bestel op tijd!')

    return text


def remove_non_ascii(text):
    # Remove non-ASCII characters except Dutch special characters
    # Keep Dutch special characters while removing other non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]|[^\sa-zA-Z0-9#@<>\[\]\(\){}\-_=+\|:;"\',\./\?`~!\$%^&\*]', ' ', text)
    return text.strip()

def replace_repeated_characters(text):
    # Replace repeated special characters with a single occurrence
    text = re.sub(r'([,!?]){2,}', r'\1', text)
    text = re.sub(r'\.{4,}', '...', text)
    return text

def replace_commas_and_quotes(text):
    # Replace special characters like ---> and <--- with spaces
    text = re.sub(r'--->|<---', ' ', text)
    text = re.sub(r'\[.*?\]', '.', text) # Remove [] and its content
    text = re.sub(r'\s+\.', '.', text)   # Remove extra spaces in front of the dot
    text = text.replace('/', '').replace('  ', ' ').strip() # Remove / dash

    # Replace multiple double quotes with a single occurrence
    text = re.sub(r'"{2,}', '"', text)
    return text

def remove_ambiguous_words(text):
    # Remove specific prefixes
    text = re.sub(r'LIVEBLOG \|', '', text)
    text = re.sub(r'LIVE \|', '', text)
    text = re.sub(r'LIVE', '', text)
    text = re.sub(r'Live:', '', text)
    text = re.sub(r'Live,', '', text)

    # Remove punctuation after 'Live'
    text = re.sub(r'Live[^\s]*', 'Live', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()


    for word in dutch_stopwords:
        # Remove the ambiguous word and fix spaces
        text = re.sub(r'\b{}\b'.format(word), '', text, flags=re.IGNORECASE).strip()
        if word.lower() != word:
          # Remove commas at the start of the text or after spaces and double quotes at the start
          text = re.sub(r'^,\s*', '', text)
          text = re.sub(r'^"\s*,\s*', '" ', text)
          # Remove extra spaces at the start after double quote
          text = re.sub(r'"(\s+)', '"', text)
          text = re.sub(r'(?<=^"|^\s)(\w+)', lambda x: x.group(1).capitalize(), text)
        text = re.sub(r'\s+', ' ', text).strip()  # Fix extra spaces

    return text


def replace_html_entities(text):
    # Replace common HTML entities
    text = html.unescape(text)

    # Replace special characters
    replacements = {
        '&lt;': '<',
        '&gt;': '>',
        '&amp;': '&',
        '&quot;': '"',
        '&apos;': "'",
        '&#39;': "'",
        '&#34;': '"',
        '&#60;': '<',
        '&#62;': '>',
        '&#38;': '&'
    }

    for entity, char in replacements.items():
        text = text.replace(entity, char)
    return text

def clean_special_cases(text):
    text = re.sub(r'’', '" ', text)
    text = re.sub(r'“', '" ', text)
    text = re.sub(r'”', '" ', text)

    # Replace euro symbol with text and format amount
    text = re.sub(r'€(\d+)', r'euro \1', text)
    text = re.sub(r'€(\d+),(\d+)', r'euro \1.\2', text)

    # Translate time format to 24-hour format
    time_pattern = re.compile(r'(\d{1,2})u(\d{2})')
    text = time_pattern.sub(r'\1:\2', text)


    return text


# Function to extract hashtags from tweet text
def extract_hashtags(text):
    hashtags = re.findall(r"#(\w+)", text)
    return " ".join(hashtags)

# Function to extract mentions from tweet text
def extract_mentions(text):
    mentions = re.findall(r"@(\w+)", text)
    return " ".join(mentions)

def remove_extra_spaces(text):
    # Remove additional white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def preprocessing(train_df):
    train_df['hashtags'] = train_df['tweet_text'].apply(extract_hashtags)
    train_df['mentions'] = train_df['tweet_text'].apply(extract_mentions)

    train_df['tweet_text'] = train_df['tweet_text'].apply(clean_corrupted_text)
    train_df['tweet_text'] = train_df['tweet_text'].apply(remove_urls)
    train_df['tweet_text'] = train_df['tweet_text'].apply(replace_html_entities)

    train_df['tweet_text'] = train_df['tweet_text'].apply(replace_commas_and_quotes)
    train_df['tweet_text'] = train_df['tweet_text'].apply(replace_repeated_characters)
    train_df['tweet_text'] = train_df['tweet_text'].apply(clean_special_cases)

    train_df['tweet_text'] = train_df['tweet_text'].apply(remove_ambiguous_words)
    train_df['tweet_text'] = train_df['tweet_text'].apply(remove_non_ascii)
    train_df['tweet_text'] = train_df['tweet_text'].apply(remove_extra_spaces)

    # Calculate text length
    train_df['text_length'] = train_df['tweet_text'].apply(len)

    # Categorize text length
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf]
    labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', '100+']
    train_df['text_length_category'] = pd.cut(train_df['text_length'], bins=bins, labels=labels)

    # Label encode class label
    label_encoder = LabelEncoder()
    train_df['class_label_encoded'] = label_encoder.fit_transform(train_df['class_label'])


    train_df = train_df.drop(columns=['text_length'])
    #data_exploration(train_df)
    return train_df

preprocessing(train_df)
train_df.head(503)


Unnamed: 0,tweet_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded
0,1240603314980392966,"#SamenTegenCorona: applaus voor zorghelden, huisarts wordt dj en Aurelie (10) maakt pakkende video",No,98,SamenTegenCorona,,91-100,0
1,1238094774453833730,Kabinet ondersteunt ondernemer in Corona-crisis #coronavirusNederland,No,69,coronavirusNederland,,61-70,0
2,1238154603583156225,Heropening van het @<USER> in #Oosterbeek uitgesteld #Airborne #Covid19 #coronavirus,No,84,Oosterbeek Airborne Covid19 coronavirus,airbornemuseum,81-90,0
3,1239152526026518534,Aantal restaurants in #groningen nu op slot. #blijfthuis,Yes,56,groningen blijfthuis,,51-60,1
4,1244341963480076290,Nederland heeft het niet in de hand. Onbetrouwbaar #RIVM weet nog steeds niet waar het mee bezig is en politiek grijpt niet in. #COVID2019NL #Coronavirusnl #RIVM RIVM verwacht toch meer #coronapati nten op intensive care,Yes,220,RIVM COVID2019NL Coronavirusnl RIVM coronapatiënten,,100+,1
...,...,...,...,...,...,...,...,...
498,1243109607737896962,Studenten die hun werk verloren zijn tweeten vandaag massaal hun zorgen rondom het #coronadebat: geen werk = geen inkomen. #LenenIsGeenOplossing #NietMijnSchuld. Lees ons persbericht op,No,185,coronadebat LenenIsGeenOplossing NietMijnSchuld,,100+,0
499,1252613330843439111,Morgen om 20:50 op @<USER> #samentegencorona,No,44,samentegencorona,een,41-50,0
500,1248861446253039618,"@<USER> Op Facebook is er een groep CommunicatiePro's tegen #COVID19BE waar er fantastisch werk geleverd wordt. Overheden, ondernemers, zorgsector, minderheidsgroepen,... worden er geholpen.",No,190,COVID19BE,marcvandaele EricGoubin,100+,0
501,1240285630195736576,#coronavirusNederland is samen buiten wachten met 10 mensen omdat er maar een beperkt aantal mensen bij de apotheek naar binnen mogen.,No,134,coronavirusNederland,,100+,0


## Feature engineering
  * Add Frequency of Hashtags
  * Sentiment Analysis of Hashtags
  * Topic Modeling with LDA

In [127]:
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Function to count frequency of hashtags
def count_hashtags_frequency(text):
    hashtags = re.findall(r"#(\w+)", text)
    return len(hashtags)

# Function to analyze sentiment of hashtags
def analyze_hashtag_sentiment(text):
    hashtags = re.findall(r"#(\w+)", text)
    if hashtags:
        sentiment_scores = [TextBlob(hashtag).sentiment.polarity for hashtag in hashtags]
        avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
        return avg_sentiment
    else:
        return 0

# Function for topic modeling with LDA
def topic_modeling_with_lda(texts):
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    dtm = vectorizer.fit_transform(texts)

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    topics = lda.transform(dtm)

    return topics.argmax(axis=1)

def add_additional_features(train_df):
    # Count frequency of hashtags
    train_df['hashtags_frequency'] = train_df['tweet_text'].apply(count_hashtags_frequency)

    # Analyze sentiment of hashtags
    train_df['hashtags_sentiment'] = train_df['tweet_text'].apply(analyze_hashtag_sentiment)

    # Topic modeling with LDA
    train_df['hashtags_topics'] = topic_modeling_with_lda(train_df['hashtags'])

    return

add_additional_features(train_df)
train_df.head(4)

Unnamed: 0,tweet_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,1240603314980392966,"#SamenTegenCorona: applaus voor zorghelden, huisarts wordt dj en Aurelie (10) maakt pakkende video",No,98,SamenTegenCorona,,91-100,0,1,0.0,0
1,1238094774453833730,Kabinet ondersteunt ondernemer in Corona-crisis #coronavirusNederland,No,69,coronavirusNederland,,61-70,0,1,0.0,1
2,1238154603583156225,Heropening van het @<USER> in #Oosterbeek uitgesteld #Airborne #Covid19 #coronavirus,No,84,Oosterbeek Airborne Covid19 coronavirus,airbornemuseum,81-90,0,4,0.0,3
3,1239152526026518534,Aantal restaurants in #groningen nu op slot. #blijfthuis,Yes,56,groningen blijfthuis,,51-60,1,2,0.0,2


### Preprocessing dev and dev_test and saving

In [128]:
import csv

def preprocess_dev_data(filename):
    df = load(filename)
    preprocessing(df)
    add_additional_features(df)
    return df

def save_processed_dev_data(df, filepath):
    df.to_csv(filepath, sep='\t', index=False, quoting=csv.QUOTE_NONE)

#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev.tsv'
dev_df = preprocess_dev_data('CT24_checkworthy_dutch/CT24_checkworthy_dutch_dev.tsv')
#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev-test.tsv'
dev_test_df = preprocess_dev_data('CT24_checkworthy_dutch/CT24_checkworthy_dutch_dev-test.tsv')

save_processed_dev_data(dev_df, 'processed_dutch_dev.tsv')
save_processed_dev_data(dev_test_df, 'processed_dutch_dev_test.tsv')
save_processed_dev_data(train_df, 'processed_dutch_train.tsv')
