In [27]:
!pip install vaderSentiment



In [None]:
import pandas as pd
import re
import csv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


# Remove the column-width limit so long cell values (e.g. tweet texts) are
# displayed in full instead of being truncated.
pd.set_option('display.max_colwidth', None)

def contains_arabic(text):
    """Return True when ``text`` contains at least one Arabic-script character.

    The character class covers the code point ranges U+0600-U+06FF,
    U+0750-U+077F and U+08A0-U+08FF.
    """
    found = re.search('[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', text)
    return found is not None

def contains_url(text):
    """Return True when ``text`` contains an ``http://`` or ``https://`` URL.

    The scheme must be followed by at least one character accepted by the
    pattern's character classes; a bare ``http://`` followed by whitespace
    does not count.
    """
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.search(url_regex, text) is not None

# Shared VADER analyzer, created on first use. Constructing a
# SentimentIntensityAnalyzer re-reads its lexicon files, so building a new one
# for every row of a DataFrame.apply() is needlessly slow.
_VADER_ANALYZER = None


def analyze_sentiment_vader(text):
    """Return the VADER compound polarity score for ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(text)``.

    Notes
    -----
    NOTE(review): VADER is presumably an English-oriented lexicon; confirm the
    scores are meaningful for the Arabic tweets this notebook processes.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        # Lazy initialisation: defining the function stays cheap and the
        # analyzer is only built when sentiment is actually requested.
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(text)['compound']

# Function to categorize sentiment score
def categorize_sentiment(score):
    """Map a compound sentiment score in [-1, 1] to a category label.

    Intervals are closed on the right:

        [-1, -0.25]   -> 'very_negative'
        (-0.25, 0]    -> 'negative'
        (0, 0.25]     -> 'neutral'
        (0.25, 0.5]   -> 'positive'
        (0.5, 0.75]   -> 'very_positive'
        (0.75, 1]     -> 'extremely_positive'

    Parameters
    ----------
    score : float
        Compound sentiment score.

    Returns
    -------
    str or float
        The label, or NaN when ``score`` is NaN or outside the bins.

    Notes
    -----
    NOTE(review): with these edges a score of exactly 0.0 is labelled
    'negative' and small positive scores are 'neutral'; confirm this asymmetric
    scheme is intended before relying on the category names.
    """
    bins = [-1, -0.25, 0, 0.25, 0.5, 0.75, 1]
    labels = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive', 'extremely_positive']
    # include_lowest=True closes the first interval on the left; without it a
    # score of exactly -1.0 falls outside every bin and is returned as NaN.
    return pd.cut([score], bins=bins, labels=labels, include_lowest=True)[0]

def symmentic_text(text):
    """Return ``text`` with every ``@mention`` (``@`` plus word characters) removed.

    Surrounding whitespace is left untouched, so removed mentions can leave
    double spaces behind.
    """
    return re.sub(r'@\w+', '', text)


def contains_noise(text):
    """Heuristically decide whether ``text`` should be considered noisy.

    A text is noisy when any of the following holds:

    * it has fewer than four whitespace-separated tokens;
    * it contains no character in the Arabic block U+0600-U+06FF;
    * some character is repeated three or more times in a row;
    * it contains a run of four or more of ``!``, ``?`` or ``.``;
    * it contains a run of five or more digits;
    * it contains something that looks like a URL (``http...`` / ``www...``).

    Returns True for noisy text, False otherwise.
    """
    # Too short to carry a claim.
    if len(text.split()) < 4:
        return True

    # No Arabic-script character at all.
    if not re.search(r'[\u0600-\u06FF]', text):
        return True

    # Remaining checks are plain "pattern occurs somewhere" tests.
    noise_patterns = (
        r'(.)\1{2,}',          # repeated characters
        r'[!?.]{4,}',          # excessive punctuation
        r'\d{5,}',             # long digit runs
        r'http\S+|www.\S+',    # URLs
    )
    return any(re.search(pattern, text) for pattern in noise_patterns)


# Exclude Arabic characters
def check_non_ascii_tweets(df):
    """Return the rows whose ``tweet_text`` has non-ASCII, non-Arabic characters.

    A character counts as "foreign" when it is neither ASCII (U+0000-U+007F)
    nor in the Arabic ranges used elsewhere in this notebook (U+0600-U+06FF,
    U+0750-U+077F, U+08A0-U+08FF). The previous implementation only looked at
    code points below U+0600, so emoji, typographic quotes and any other
    character above the Arabic block were never reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``tweet_text``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) containing at least
        one foreign character. Missing texts are treated as not matching.
    """
    foreign_char = r'[^\x00-\x7F\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]'
    has_foreign = df['tweet_text'].str.contains(foreign_char, regex=True, na=False)
    return df[has_foreign]


def data_exploration(train_df):
  """Print exploratory statistics for a tweet frame and attach helper columns.

  Reads the ``tweet_text`` and ``class_label`` columns. All findings are
  printed; nothing is returned.

  Side effects: ``train_df`` is modified in place. The columns
  ``text_length``, ``clean_text``, ``sentiment``, ``hashtags`` and
  ``mentions`` are added. Later cells depend on them: ``preprocessing`` reads
  ``sentiment`` and drops ``clean_text``, and ``add_additional_features``
  reads ``hashtags``.
  """
  print (train_df.shape)
  # Share of each class label, in percent.
  class_distribution = train_df['class_label'].value_counts(normalize=True) * 100
  print("Class Distribution:")
  print(class_distribution)

  # Character count per tweet. Added before the null check, so the new column
  # is included in the missing-value report.
  train_df['text_length'] = train_df['tweet_text'].apply(len)
  missing_values = train_df.isnull().sum()
  print("Missing values:")
  print(missing_values)
  # NOTE(review): this runs before the list-valued columns are added further
  # down; duplicated() presumably cannot hash list cells, so confirm before
  # reordering these statements.
  duplicate_rows = train_df.duplicated().sum()
  print(f"Number of duplicate rows: {duplicate_rows}")

  # Tweets containing at least one Arabic-script character.
  arabic_tweets = train_df[train_df['tweet_text'].apply(contains_arabic)]
  print(f"Number of Arabic tweets: {len(arabic_tweets)}")

  # Tweets containing an http(s) URL.
  tweets_with_url = train_df[train_df['tweet_text'].apply(contains_url)]
  print(f"Number of tweets with URL: {len(tweets_with_url)}")

  # Tweets flagged by the contains_noise heuristics.
  tweets_with_noise = train_df[train_df['tweet_text'].apply(contains_noise)]
  print(f"Number of tweets with noise: {len(tweets_with_noise)}")

  # Tweets selected by check_non_ascii_tweets, plus the first few examples.
  non_ascii_df = check_non_ascii_tweets(train_df)
  print(f"Number of tweets with non-ASCII characters: {len(non_ascii_df)}\n")
  print("Examples:")
  print(non_ascii_df['tweet_text'].head())


  # Strip @mentions so they do not take part in sentiment scoring.
  train_df['clean_text'] = train_df['tweet_text'].apply(symmentic_text)

  # Compound VADER score per tweet.
  # NOTE(review): VADER is presumably English-oriented; confirm the scores are
  # meaningful for Arabic text.
  train_df['sentiment'] = train_df['clean_text'].apply(analyze_sentiment_vader)

  # Hashtags: whitespace-separated tokens starting with '#', stored without the
  # leading '#'. The ten most frequent ones are printed.
  train_df['hashtags'] = train_df['tweet_text'].apply(lambda x: [word[1:] for word in x.split() if word.startswith('#')])
  print("Most Common Hashtags:\n", pd.Series([item for sublist in train_df['hashtags'] for item in sublist]).value_counts().head(10))

  # Mentions: same extraction for tokens starting with '@'.
  train_df['mentions'] = train_df['tweet_text'].apply(lambda x: [word[1:] for word in x.split() if word.startswith('@')])
  print("Most Common Mentions:\n", pd.Series([item for sublist in train_df['mentions'] for item in sublist]).value_counts().head(10))

  return


def load(filename):
    """Read a tab-separated tweet file, run the exploration report and return it.

    The first line of the file is skipped and the four columns are named
    ``tweet_id``, ``tweet_url``, ``tweet_text`` and ``class_label``. Quote
    characters are treated as ordinary text. ``data_exploration`` prints its
    report and adds its helper columns; the returned frame has ``tweet_url``
    removed.
    """
    column_names = ['tweet_id', 'tweet_url', 'tweet_text', 'class_label']
    frame = pd.read_csv(
        filename,
        sep='\t',
        encoding='utf-8',
        names=column_names,
        quoting=csv.QUOTE_NONE,
        skiprows=1,
        dtype={'tweet_id': 'Int64'},
    )
    data_exploration(frame)
    return frame.drop(columns=['tweet_url'])



# Load the training split; load() also prints the exploration report shown below.
train_df = load('CT24_checkworthy_arabic_train.tsv')
# Preview the first rows.
train_df.head()


(7333, 4)
Class Distribution:
class_label
No     69.412246
Yes    30.587754
Name: proportion, dtype: float64
Missing values:
tweet_id       0
tweet_url      0
tweet_text     0
class_label    0
text_length    0
dtype: int64
Number of duplicate rows: 0
Number of Arabic tweets: 7333
Number of tweets with URL: 4845
Number of tweets with noise: 5174


## Preprocessing



Ambiguous words (interjections removed by remove_ambiguous_words):
    أها (Aha)
    آه (Ah)
    يع (Yeah)
    همم (Hm)
    ههه (Haha, similar to "lol" in English)
    هوو (Hoo)

    Remove URLs (and collapse @mentions into a single @<USER> token): remove_urls
    Replace Repeated Special Characters: replace_repeated_characters
    Replace Commas and Double Quotes: replace_commas_and_quotes
    Remove Ambiguous Words: remove_ambiguous_words
    Remove Non-ASCII Characters (Arabic script is kept): remove_non_ascii
    

In [None]:
import regex as re
import numpy as np
from sklearn.preprocessing import LabelEncoder


def remove_urls(text):
    """Strip URLs from ``text`` and collapse @mentions into one placeholder.

    * Anything starting with ``http`` or with a literal ``www.`` is removed up
      to the next whitespace.
    * Each run of one or more ``@user`` mentions (with the whitespace that
      follows them) is replaced by the single token ``'@<USER> '``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text without URLs and with normalised mentions.
    """
    # The dot after "www" is escaped: unescaped it matched any character, so
    # ordinary words containing "www" (e.g. "awwward") were deleted as URLs.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Fix user
    text = re.sub(r'(@\w+\s*)+', '@<USER> ', text)

    return text

def remove_non_ascii(text):
    """Replace characters that are neither ASCII nor Arabic script with a space.

    Each maximal run of such characters becomes a single space. ASCII
    (U+0000-U+007F) and the Arabic ranges U+0600-U+06FF, U+0750-U+077F and
    U+08A0-U+08FF are kept unchanged.
    """
    unwanted_run = r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\x00-\x7F]+'
    return re.sub(unwanted_run, ' ', text)


def replace_repeated_characters(text):
    """Collapse runs of two or more ``!`` / ``?`` characters into one character.

    The run may mix both characters; it is replaced by the last character of
    the run (``'?!'`` becomes ``'!'``).
    """
    return re.sub(r'([!?]){2,}', r'\1', text)

def replace_commas_and_quotes(text):
    """Normalise doubled Arabic commas and repeated double quotes.

    Each pair of Arabic commas becomes one ASCII comma, and every run of two
    or more ``"`` characters becomes a single ``"``.
    """
    with_single_commas = text.replace('،،', ',')
    return re.sub(r'"{2,}', '"', with_single_commas)

def remove_ambiguous_words(text):
    """Remove stand-alone interjections from ``text`` and tidy whitespace.

    Only whole words are removed. The previous implementation ran a second,
    boundary-less substitution that deleted the same letter sequences from
    inside ordinary words, corrupting them, and it never removed the comma it
    was meant to remove. A comma (ASCII or Arabic) directly preceding a
    removed word is now removed together with it.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        The text without the ambiguous words, with every whitespace run
        collapsed to a single space and leading/trailing whitespace stripped
        (this normalisation happens even when nothing was removed).
    """
    ambiguous_words = ['أها', 'آه', 'يع', 'ههه', 'همم', 'هوو']

    # One alternation for all words; \b on both sides keeps matches to whole
    # words, and the optional prefix takes a preceding comma with the word.
    alternation = '|'.join(re.escape(word) for word in ambiguous_words)
    pattern = r'(?:[,،]\s*)?\b(?:{})\b'.format(alternation)
    text = re.sub(pattern, '', text)

    # Fix extra spaces left behind by the removal.
    return re.sub(r'\s+', ' ', text).strip()


def preprocessing(train_df):
    """Clean tweet text and derive categorical features.

    Steps:

    1. ``tweet_text`` is passed through ``remove_urls``,
       ``replace_commas_and_quotes``, ``replace_repeated_characters``,
       ``remove_ambiguous_words`` and ``remove_non_ascii``, in that order.
    2. ``sentiment_category`` is derived from the existing ``sentiment`` column.
    3. ``text_length_category`` buckets the cleaned text length in characters.
    4. ``class_label_encoded`` is the label-encoded ``class_label``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with at least ``tweet_text``, ``class_label``, ``sentiment`` and
        ``clean_text`` columns (the last two are added by ``data_exploration``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns and without ``text_length``,
        ``sentiment`` and ``clean_text``. The input frame is left untouched.

    Raises
    ------
    KeyError
        If a required column is missing, e.g. when called a second time on its
        own output, which no longer has ``sentiment``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    processed = train_df.copy()

    text_cleaners = (
        remove_urls,
        replace_commas_and_quotes,
        replace_repeated_characters,
        remove_ambiguous_words,
        remove_non_ascii,
    )
    for cleaner in text_cleaners:
        processed['tweet_text'] = processed['tweet_text'].apply(cleaner)

    # Sentiment polarity category
    processed['sentiment_category'] = processed['sentiment'].apply(categorize_sentiment)

    # Length of the cleaned text; only needed for bucketing, dropped below.
    processed['text_length'] = processed['tweet_text'].apply(len)

    # Categorize text length
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf]
    labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', '100+']
    # include_lowest=True puts a length of 0 (text emptied by the cleaning
    # steps) into '0-10'; without it such rows received NaN.
    processed['text_length_category'] = pd.cut(
        processed['text_length'], bins=bins, labels=labels, include_lowest=True
    )

    # Label encode class label.
    # NOTE(review): the encoder is fitted per call, i.e. separately for every
    # split; the integer codes only agree across splits if each split contains
    # the same set of labels. Verify for the dev / dev-test files.
    label_encoder = LabelEncoder()
    processed['class_label_encoded'] = label_encoder.fit_transform(processed['class_label'])

    return processed.drop(columns=['text_length', 'sentiment', 'clean_text'])

# Clean the training tweets and add the derived columns.
# Note: preprocessing() reads the 'sentiment' column and its result no longer
# contains it, so re-running this cell on the already-processed frame raises a
# KeyError; re-run the load cell first.
train_df = preprocessing(train_df)
# Preview the processed frame.
train_df.head()


## Feature engineering
  * Add the number of hashtags per tweet
  * Sentiment Analysis of Hashtags
  * Topic Modeling with LDA

In [None]:
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus and keep its Arabic word list;
# topic_modeling_with_lda passes this list to CountVectorizer as stop_words.
nltk.download('stopwords')
arabic_stop_words = list(stopwords.words('arabic'))

# Number of hashtags attached to one tweet
def count_hashtags_frequency(hashtags_list):
    """Return how many hashtags ``hashtags_list`` holds (its length)."""
    hashtag_count = len(hashtags_list)
    return hashtag_count

# Mean TextBlob polarity over the hashtags of one tweet
def analyze_hashtag_sentiment(hashtags_list):
    """Return the average TextBlob polarity of the given hashtags.

    An empty (or otherwise falsy) ``hashtags_list`` yields 0.

    NOTE(review): TextBlob's default analyzer is presumably English-oriented;
    confirm it produces meaningful polarity for Arabic hashtags.
    """
    if not hashtags_list:
        return 0

    polarity_total = 0
    for hashtag in hashtags_list:
        polarity_total += TextBlob(hashtag).sentiment.polarity
    return polarity_total / len(hashtags_list)

# Function for topic modeling with LDA
def topic_modeling_with_lda(texts, n_components=5, random_state=42):
    """Assign each text the index of its dominant LDA topic.

    A bag-of-words matrix is built with ``CountVectorizer`` (terms occurring in
    more than 95% of the texts or in fewer than two texts are ignored, Arabic
    stop words removed), an LDA model is fitted on it, and the most probable
    topic per text is returned.

    Parameters
    ----------
    texts : iterable of str
        Documents to model.
    n_components : int, default 5
        Number of LDA topics (previously hard-coded).
    random_state : int, default 42
        Seed passed to ``LatentDirichletAllocation`` (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        One topic index per input text, each in ``[0, n_components)``.

    Notes
    -----
    The vectorizer and the model are fitted from scratch on every call, so
    topic indices from separate calls (e.g. for different data splits) do not
    refer to the same topics.
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=arabic_stop_words)
    dtm = vectorizer.fit_transform(texts)

    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(dtm)

    # Per-document topic distribution; the arg-max is the dominant topic.
    topic_weights = lda.transform(dtm)
    return topic_weights.argmax(axis=1)


def add_additional_features(df):
    """Attach hashtag-count, hashtag-sentiment and topic columns to ``df``.

    The frame is modified in place and also returned. It must already contain
    a list-valued ``hashtags`` column and a ``tweet_text`` column.

    Added columns:

    * ``hashtags_frequency`` - number of hashtags per tweet;
    * ``hashtags_sentiment`` - mean hashtag polarity per tweet;
    * ``hashtags_topics``    - dominant LDA topic, computed from ``tweet_text``.
    """
    hashtag_lists = df['hashtags']

    # Per-tweet hashtag statistics.
    df['hashtags_frequency'] = hashtag_lists.apply(count_hashtags_frequency)
    df['hashtags_sentiment'] = hashtag_lists.apply(analyze_hashtag_sentiment)

    # Dominant topic of the tweet text.
    dominant_topics = topic_modeling_with_lda(df['tweet_text'])
    df['hashtags_topics'] = dominant_topics

    return df



# Add the hashtag-count, hashtag-sentiment and topic columns to the training frame.
train_df = add_additional_features(train_df)
# Preview the first four rows with the new columns.
train_df.head(4)


### Preprocessing the dev and dev-test splits and saving all processed splits

In [None]:
import csv

def preprocess_dev_data(filename):
    """Run the full pipeline on one TSV file and return the resulting frame.

    The file is read with ``load`` (which also prints the exploration report),
    cleaned with ``preprocessing`` and extended by ``add_additional_features``.
    """
    return add_additional_features(preprocessing(load(filename)))

def save_processed_dev_data(df, filepath):
    """Write ``df`` to ``filepath`` as a tab-separated file.

    The index is not written and no quoting is applied to any field.
    """
    writer_options = {
        'sep': '\t',
        'index': False,
        'quoting': csv.QUOTE_NONE,
    }
    df.to_csv(filepath, **writer_options)

# Run the full pipeline (load -> preprocessing -> feature engineering) on the
# dev and dev-test splits. The commented paths are an alternative location of
# the same files inside a 'CT24_checkworthy_arabic/' sub-folder.
#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev.tsv'
dev_df = preprocess_dev_data('CT24_checkworthy_arabic_dev.tsv')
#'CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev-test.tsv'
dev_test_df = preprocess_dev_data('CT24_checkworthy_arabic_dev-test.tsv')

# Persist all three processed splits as TSV files. train_df is the frame
# produced by the earlier preprocessing / feature-engineering cells.
save_processed_dev_data(dev_df, 'processed_arabic_dev.tsv')
save_processed_dev_data(dev_test_df, 'processed_arabic_dev_test.tsv')
save_processed_dev_data(train_df, 'processed_arabic_train.tsv')
