<a href="https://colab.research.google.com/github/aiishaa/Arabic-Dialect-Classification/blob/main/Data_pre_processing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyarabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarabic
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 5.2 MB/s 
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.14


In [2]:
import pandas as pd
import re
import string
import nltk
from snowballstemmer import stemmer
from nltk.corpus import stopwords
import pyarabic.araby as araby
import unicodedata
df = pd.read_csv('/content/drive/MyDrive/texts.csv')
print(df['text'].head())

0    ['@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغ...
1    ['@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .....
2                  ['@KanaanRema مبين من كلامه خليجي']
3       ['@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐']
4               ['@hmo2406 وين هل الغيبه  اخ محمد 🌸🌺']
Name: text, dtype: object


In [3]:
# take only 9000 samples from each class to speed the computations
nrows = len(df)
sample_size = 9000
df = df.groupby('dialect').apply(lambda x: x.sample(sample_size))
print(len(df))

random_df = df.sample(frac=1)
id = random_df["id"].to_numpy()
dialect = random_df["dialect"].to_numpy()
print(dialect)

162000
[ 4 14  4 ... 15 15 15]


In [4]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r' ',text)

In [5]:
def tokenize(strr):
  a_list = nltk.word_tokenize(strr)
  return a_list

In [6]:
#run check that the stemmer works
ar_stemmer = stemmer("arabic")
ar_stemmer.stemWord("فسميتموها") #exmaple to check that it works

'سمي'

In [16]:
def preprocess(sentence):
    result = []
    output = re.sub(r'[0-9A-Za-z\\؟]', ' ' , sentence)
    output = output.rstrip()
    # Normalize unicode encoding
    output = unicodedata.normalize('NFC', output)

    # Remove '@name'
    output = re.sub(r'(@.*?)[\s]', ' ', output)

    # remove emojis
    output =  deEmojify(output)

    # Remove URLs
    output = re.sub(r'http\S+', ' ', output)

    # Remove trailing whitespace and new lines
    output = re.sub(r'\s+', ' ', output).strip()
    output = output.replace('\\n', ' ').replace('\n', ' ')
  
    # Remove special characters from the string
    pattern = r'[' + string.punctuation + ']'
    output = re.sub(pattern, ' ', output)
    output = ''.join(c for c in output if not unicodedata.category(c).startswith('P'))

    # tokenize the sentence
    result.append(output)
    return result


In [17]:
nltk.download("punkt")
nltk.download("stopwords") #arabic stopwords are not biult-in, so we find them by calling a set object
stopwords_list = stopwords.words('arabic')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def cleanArabicText(sentence):
    words = preprocess(sentence)
    stopwords_list = stopwords.words('arabic')
    words = [ar_stemmer.stemWord(araby.strip_diacritics(w)) for w in words if araby.strip_diacritics(w) not in stopwords_list and len(w) > 1]
    return ' '.join(words)
# End of Func
print(len(random_df))
random_df['text_cleaned'] = random_df['text'].apply(cleanArabicText)

162000


In [19]:
# save preprocessed texts in a csv file for training
print(cleanArabicText('مغيب ،، ولا عنده غير عصايص من اللي يدخنه ،،'))
random_df.to_csv('/content/drive/MyDrive/cleaned_texts.csv')

مغيب  ولا عنده غير عصايص من اللي يدخنه 
