In [1]:
import arabic_reshaper                   # pip install arabic-reshaper
from bidi.algorithm import get_display   # pip install python-bidi
from emoji import get_emoji_regexp       # pip install emoji
import pandas as pd
import numpy as np
import nltk
from nltk.stem.isri import ISRIStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import string
from sklearn.model_selection import train_test_split
# Apply seaborn's default plot styling
sns.set()
# Silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [2]:
def load_stopwords(file_path = './data/arabic_stopwords_list_nw.txt'):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stop-word file to read.

    Returns:
        list[str]: Every line of the file, in file order, with leading and
        trailing whitespace stripped (a blank line yields an empty string).
    """
    stopwords = []
    with open(file_path, mode = 'r', encoding = 'utf-8') as stopwords_file:
        for raw_line in stopwords_file:
            stopwords.append(raw_line.strip())
    # End file stream
    return stopwords
# End Func

def format_arabic(text):
    """Run text through arabic_reshaper and then python-bidi's get_display.

    Presumably used so Arabic strings render correctly in plot labels; it is
    not called in the visible cells -- confirm against the callers.

    Args:
        text: The string to convert.

    Returns:
        The result of ``get_display(arabic_reshaper.reshape(text))``.
    """
    reshaped = arabic_reshaper.reshape(text)
    return get_display(reshaped)
# End Func

def preprocess_text(text, remove_stopwords = False, stem = False, return_tokens = False):
    """Clean, normalize and tokenize a piece of Arabic text.

    The steps run in this order: strip @mentions, #hashtags, Latin letters and
    ASCII digits; strip emojis; strip punctuation and decorative symbols; strip
    Arabic-Indic digits; collapse whitespace; strip diacritics and tatweel;
    unify letter variants; tokenize; optionally drop stop words and stem.

    Args:
        text (str): Raw input text.
        remove_stopwords (bool): If True, drop tokens found in the list returned
            by ``load_stopwords()`` and also drop tokens for which
            ``str.isalpha()`` is False.
        stem (bool): If True, replace each token with ``ISRIStemmer().stem(token)``.
        return_tokens (bool): If True, return the token list instead of a string.

    Returns:
        list[str] | str: The tokens when ``return_tokens`` is True, otherwise the
        tokens joined by single spaces.
    """
    # Remove mentions, hashtags or any english words and numbers
    # (\w is Unicode-aware, so Arabic mentions/hashtags are removed whole)
    cleaning_regex_script = re.compile(pattern=r'(\@\w+|\#\w+|[A-Za-z0-9]+)')
    text = cleaning_regex_script.sub('', text)

    # Remove emojis
    # NOTE(review): get_emoji_regexp is believed to have been removed in emoji 2.0;
    # confirm the environment pins emoji<2.0.
    emoji_regex = get_emoji_regexp()
    text = emoji_regex.sub('', text)

    # Remove punctuations and some symbols
    arabic_punctuations = '''`÷×٪؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ.'''
    symbols = '❤♡❀♩﴾﴿↓❁♬'
    puncts = arabic_punctuations + string.punctuation + symbols
    text = text.translate(str.maketrans('', '', puncts))

    # Remove Arabic Digits
    arabic_numbers_digits = r'[٠١٢٣٤٥٦٧٨٩]+'
    text = re.sub(arabic_numbers_digits, '', text)

    # Remove unnecessary spaces
    spaces_regex_script = re.compile(pattern=r'[\s]{2,}')
    text = spaces_regex_script.sub(' ', text).strip()

    # Remove arabic diacritics: U+064B-U+0652 spans fathatan through sukun
    # (sukun, U+0652, included so it is stripped as well); U+0640 is tatweel.
    # Written as escapes because combining marks are hard to read in an editor.
    arabic_diacritics = r'[\u064B-\u0652\u0640]'
    text = re.sub(arabic_diacritics, '', text)

    # Normalize the arabic text alpha
    text = re.sub("[إأآ]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    # Tokenize text
    # NOTE(review): nltk.word_tokenize presumably needs nltk's tokenizer data to be
    # installed; nothing in the visible cells downloads it -- confirm.
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    if remove_stopwords:
        # A set gives constant-time membership tests; the result is the same as with a list
        stopwords = set(load_stopwords())
        tokens = [token for token in tokens if token not in stopwords and token.isalpha()]
    # End if

    # Get words root using stemming
    if stem:
        stemmer = ISRIStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    # End if

    preprocessed_text = tokens if return_tokens else ' '.join(tokens)
    return preprocessed_text
# End Func

In [3]:
# Relative path of the raw dialect dataset (CSV) that the next cell loads
data_path = './data/dialects_data_full.csv'

In [4]:
# Load the raw CSV, discard the 'id' column, treat empty strings as missing,
# drop rows containing any missing value and renumber the index from 0
df = (
    pd.read_csv(data_path)
    .drop(columns = ['id'])
    .replace([''], np.nan)
    .dropna()
    .reset_index(drop = True)
)
# Store the dialect column with a categorical dtype
df['dialect'] = df['dialect'].astype('category')

In [5]:
# Clean every text with the default options (no stop-word removal, no stemming)
df['cleaned_text'] = (
    df['text']
    .apply(preprocess_text)
    .astype('U')
    .copy()
)
# Integer code of each row's dialect category
df['label'] = df['dialect'].cat.codes

In [6]:
# Preview the first rows: raw text, dialect, cleaned text and numeric label
df.head()

Unnamed: 0,text,dialect,cleaned_text,label
0,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ,لكن بالنهايه ينتفض يغير,4
1,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ,يعني هذا محسوب علي البشر حيونه ووحشيه وتطلبون ...,4
2,@KanaanRema مبين من كلامه خليجي,IQ,مبين من كلامه خليجي,4
3,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ,يسلملي مرورك وروحك الحلوه,4
4,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ,وين هل الغيبه اخ محمد,4


In [7]:
# Fixed seed so the train/valid/test partition is identical on every Restart & Run All
SPLIT_SEED = 42

# Hold out 20% of the rows; stratify so the held-out part keeps the label distribution
train_df, valid_test_df = train_test_split(
    df[['cleaned_text', 'label']],
    test_size = 0.2,
    stratify = df['label'],
    random_state = SPLIT_SEED,
)
# Halve the held-out part into validation and test, stratified like the first split
# (this needs at least two held-out rows per label)
valid_df, test_df = train_test_split(
    valid_test_df,
    test_size = 0.5,
    stratify = valid_test_df['label'],
    random_state = SPLIT_SEED,
)

In [8]:
# Save Preprocessed Version
# Full cleaned dataset, keeping the readable dialect name
df[['cleaned_text', 'dialect']].to_csv('./data/dataset_cleaned_version.csv', index = False)

# One file per split, written in train -> valid -> test order
split_outputs = {
    './data/dataset_cleaned_version_train.csv': train_df,
    './data/dataset_cleaned_version_valid.csv': valid_df,
    './data/dataset_cleaned_version_test.csv': test_df,
}
for output_path, split_df in split_outputs.items():
    split_df.to_csv(output_path, index = False)