In [1]:
import re
import nltk
import time
import pandas as pd
import pyarabic.normalize as Normalize

from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer
from transformers import MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Compiled once when the cell runs instead of on every call.
# Every range below is an explicit symbol/pictograph block. A single span from
# U+24C2 to U+1F251 must NOT be used here: it also contains CJK, Hangul and the
# Arabic Presentation Forms blocks, so ordinary text would be deleted as "emoji".
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator pairs)
    "\U0001F90C-\U0001F93A"  # supplemental symbols
    "\U0001F93C-\U0001F945"  # and
    "\U0001F947-\U0001F9FF"  # pictographs
    "\U000024C2"             # circled M
    "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # misc symbols and arrows (stars, squares)
    "\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every character matched by ``_EMOJI_PATTERN`` removed.
        Letters of any script (Arabic, including its presentation-form
        ligatures, Latin, CJK, ...) are left in place.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
def removeConsecutiveDuplicates(text):
    """Collapse elongated character runs down to a single character.

    A run is three or more identical non-whitespace characters in a row.
    Doubled characters are deliberately left alone, because a legitimate word
    may contain a doubled letter (for example the two lams in "الله").

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with each run of 3+ repeated characters replaced by one.
    """
    elongated_run = re.compile(r'(\S)(\1{2,})', flags=re.UNICODE)
    return elongated_run.sub(r'\1', text)

In [None]:
def removeEnglish(text):
    """Delete every run of ASCII letters and digits from ``text``.

    Whitespace and all other characters are kept, so removing a Latin word
    leaves the spaces that surrounded it.
    """
    latin_or_digit_run = re.compile(r"[A-Za-z0-9]+")
    return latin_or_digit_run.sub("", text)

In [None]:
def lemmatizeArabic(text, stemmer=None):
    """Stem every whitespace-separated token of ``text`` with NLTK's ISRI stemmer.

    The stemmer works on one token at a time, so the text is split on
    whitespace, each token is stemmed on its own, and the results are joined
    back with single spaces. Passing a whole sentence to ``stem`` in one call
    would treat the sentence as a single word.

    Parameters
    ----------
    text : str
        A single Arabic word or a whole sentence.
    stemmer : object, optional
        Any object exposing ``stem(token) -> str``. Defaults to a fresh
        ``ISRIStemmer``; pass one in to reuse the same instance across calls.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" for empty input).
    """
    if stemmer is None:
        stemmer = ISRIStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Filled on first use so the stop-word corpus is read once, not once per tweet.
_ARABIC_STOPWORDS_CACHE = None


def _arabicStopwords():
    """Return NLTK's Arabic stop words as a frozenset, loading them only once."""
    global _ARABIC_STOPWORDS_CACHE
    if _ARABIC_STOPWORDS_CACHE is None:
        _ARABIC_STOPWORDS_CACHE = frozenset(stopwords.words('arabic'))
    return _ARABIC_STOPWORDS_CACHE


def removeStopwords(text):
    """Drop Arabic stop words from ``text``.

    The text is tokenized with ``nltk.word_tokenize``; tokens whose lowercase
    form appears in NLTK's Arabic stop-word list are discarded, and the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        The string to filter.

    Returns
    -------
    str
        The surviving tokens separated by single spaces.
    """
    stop_words = _arabicStopwords()
    words = nltk.word_tokenize(text)
    words_filtered = [word for word in words if word.lower() not in stop_words]
    return ' '.join(words_filtered)

In [None]:
def removePunctuation(text):
    """Remove a fixed set of Arabic and ASCII punctuation marks from ``text``.

    Removed ASCII marks: ``. , ; ' ` ~ : "``. Other marks (for example ``!``
    and ``?``) are not in either character class and are left in place.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the listed punctuation characters.
    """
    # Arabic-block signs and punctuation (comma, semicolon, date separator, ...)
    arabicPunctPattern = r'[؀-؃؆-؊،؍؛؞]'
    # Triple-quoted so both quote characters can sit inside the class. Writing
    # '' inside a single-quoted literal ends the string and starts a new one,
    # which silently leaves the apostrophe out of the pattern.
    engPunctPattern = r"""[.,;'`~:"]"""
    return re.sub(arabicPunctPattern + '|' + engPunctPattern, '', text)

In [None]:
def _cleanTweet(tweet):
    """Run one raw tweet through every text-cleaning step and return the result."""
    # Standard tweet cleaning: drop URLs, bracketed spans, hashtags and mentions.
    # The bracket match is lazy (.*?) so "(a) text (b)" loses only the two
    # bracketed spans, not everything between the first "(" and the last ")".
    clean = re.sub(r"(http[s]?\://\S+)|([\[\(].*?[\)\]])|([#@]\S+)", "", tweet)
    # A newline becomes a space so words on adjacent lines are not fused.
    clean = clean.replace("\n", " ")

    # Test to see if they're useful or not
    clean = remove_emojis(clean)
    clean = removeConsecutiveDuplicates(clean)

    # Mandatory Arabic preprocessing
    clean = Normalize.normalize_searchtext(clean)
    clean = removeEnglish(clean)
    clean = lemmatizeArabic(clean)
    clean = removeStopwords(clean)
    clean = removePunctuation(clean)
    return clean


def cleanData(dataset):
    """Deduplicate the dataset and clean the text of every tweet.

    Rows with a duplicate ``tweet`` value or with any missing value are
    dropped, the index is reset to 0..n-1, and the ``tweet`` column is
    replaced by its cleaned version. The input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``tweet`` column holding strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and cleaned ``tweet`` values.
    """
    dataset = dataset.drop_duplicates(subset=["tweet"])
    dataset = dataset.dropna()
    dataset = dataset.reset_index(drop=True)

    # One vectorised pass instead of writing cell by cell with .loc.
    return dataset.assign(tweet=dataset["tweet"].map(_cleanTweet))

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix each text with a ``>>language_code<<`` tag.

    Parameters
    ----------
    language_code : str
        The code placed between ``>>`` and ``<<``.
    batch_texts : iterable of str
        The texts to tag.

    Returns
    -------
    list of str
        One tagged string per input text, in the same order.
    """
    tagged = []
    for sentence in batch_texts:
        tagged.append(">>{}<< {}".format(language_code, sentence))
    return tagged

In [None]:
# Checkpoint used to produce the English version of a tweet
# (first leg of the back-translation in dataAugmentation).
englishModelName = "Helsinki-NLP/opus-mt-ar-en"
englishModeltkn = MarianTokenizer.from_pretrained(englishModelName)
englishModel = MarianMTModel.from_pretrained(englishModelName)

# Checkpoint used to turn that English text back into Arabic (second leg).
arabicModelName = "Helsinki-NLP/opus-mt-en-ar"
arabicModeltkn = MarianTokenizer.from_pretrained(arabicModelName)
arabicModel = MarianMTModel.from_pretrained(arabicModelName)

In [None]:
def perform_translation(batch_texts, model, tokenizer, language="en"):
    """Translate a batch of texts with a Marian model.

    Parameters
    ----------
    batch_texts : iterable of str
        The texts to translate.
    model
        Object exposing ``generate(**encoded_inputs)``.
    tokenizer
        Callable that encodes a list of strings and exposes ``decode``.
    language : str, default "en"
        Code inserted as a ``>>language<<`` prefix on every text.

    Returns
    -------
    list of str
        One translated string per input text; ``[]`` for an empty batch.
    """
    batch_texts = list(batch_texts)
    # Nothing to translate: return early rather than handing the tokenizer an
    # empty batch.
    if not batch_texts:
        return []

    # Prepare the text data into the format expected by the model
    formated_batch_texts = format_batch_texts(language, batch_texts)

    # truncation=True caps each input at the model's maximum length so an
    # unusually long text cannot overflow it.
    encoded = tokenizer(
        formated_batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    translated = model.generate(**encoded)

    # Convert the generated token indices back into text
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [None]:
def dataAugmentation(dataset):
    """Augment the sarcastic class by back-translation.

    Every tweet whose ``sarcasm`` value equals 1 is translated with
    ``englishModel`` and the result is translated again with ``arabicModel``.
    Each back-translated tweet is appended as a new row that copies all other
    columns (including ``sarcasm``) from the row it was derived from. The
    input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``tweet`` and ``sarcasm`` columns.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the augmented rows, with a fresh
        0..n-1 index.
    """
    sarcastic = dataset[dataset.sarcasm == 1]

    # NOTE(review): whether these checkpoints expect ">>en<<" / ">>ar<<" as the
    # language prefix is not visible here - confirm against the model cards.
    backTranslated = []
    for tweet in sarcastic["tweet"].tolist():
        englishVersion = perform_translation([tweet], englishModel, englishModeltkn, "en")
        arabicVersion = perform_translation(englishVersion, arabicModel, arabicModeltkn, "ar")

        # perform_translation returns a list; one text in, one text out.
        backTranslated.append(arabicVersion[0])
        print(arabicVersion[0])

    # Build all new rows at once and concatenate a single time instead of
    # growing the frame cell by cell.
    augmented = sarcastic.assign(tweet=backTranslated)
    return pd.concat([dataset, augmented], ignore_index=True)

In [None]:
def preProcessData(dataset):
    """Run the full pipeline: clean, augment, then clean again.

    Each stage receives a deep copy of the previous stage's output. A
    progress banner is printed after the first cleaning pass and after the
    augmentation step.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The raw dataset.

    Returns
    -------
    The result of the second ``cleanData`` pass.
    """
    cleaned = cleanData(dataset.copy(deep=True))
    print("\n-------        cleanData Done!        -------\n")

    augmented = dataAugmentation(cleaned.copy(deep=True))
    print("\n---------- dataAugmentation Done! ----------\n")

    return cleanData(augmented.copy(deep=True))

In [None]:
# Raw CSV of the ArSarcasm-v2 training data, fetched straight from GitHub.
TRAINING_DATA_URL = r"https://raw.githubusercontent.com/iabufarha/ArSarcasm-v2/main/ArSarcasm-v2/training_data.csv"
dataset = pd.read_csv(TRAINING_DATA_URL)

In [None]:
# Time the whole pipeline. perf_counter() is a monotonic clock intended for
# measuring elapsed time; time.time() is wall-clock and can jump if the system
# clock is adjusted while the pipeline runs.
startTime = time.perf_counter()
data = preProcessData(dataset.copy(deep=True))
endTime = time.perf_counter()

# Elapsed seconds for the full preprocessing run.
executionTime = endTime - startTime

In [None]:
# Report how long the preprocessing run took, in seconds.
print(f"execution time: {executionTime}s")