In [2]:
!pip install simpletransformers # for transformers wrapper for DL
!pip install ktrain # for ktrain wrapper for DL
!pip install nlpaug # for data augmantation
!pip install tqdm

In [3]:
import numpy as np
import pandas as pd

import json, sys, regex, os
import torch
import torchvision
import random
random.seed(2022)

from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import requests
import re 

import os
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from tqdm import tqdm


import nltk
from nltk.corpus import stopwords
from collections import Counter


import ktrain
from ktrain import text

from sklearn.utils import shuffle
from transformers import TFRobertaModel, RobertaConfig, RobertaTokenizerFast

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
from nlpaug.util import Action

# Configure CUDA device enumeration/visibility for this process in one step.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})


# Importing the data (the Google Drive path is kept below for reference; the active cell reads from `../input`)

In [4]:
# data = pd.read_csv('/content/drive/MyDrive/Data/Arabic_dialects/Ara_dialect.csv',
#                    lineterminator='\n')
# data.head()

In [5]:
# Load the dialect CSV; '\n' is forced as the row terminator.
DATA_PATH = '../input/arabic-dialects/Ara_dialect.csv'
data = pd.read_csv(DATA_PATH, lineterminator='\n')
data.head()

In [88]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

## EDA

In [6]:
# Number of tweets per dialect code, most frequent first.
data.dialect.value_counts()

In [7]:
# Number of dialects in the data and the list of all dialect codes.
# The codes are sorted so the printed list is identical on every run; iterating
# a set of strings follows hash order, which changes between interpreter sessions.
list_of_unique_dialect = sorted(data.dialect.dropna().unique())
print(f'There are {len(data.dialect.value_counts())} of dialect kind')
print(f'the dialects are: \n{list_of_unique_dialect}')

**Sample of data for each dialect**
As the samples below show, we need to do some text cleaning so that the model can understand the text, so we will:
* Remove emojis.
* Remove links.
* Remove mentions like @user.
* Remove the hashtag symbol `#`.
* Remove punctuation.
* Remove duplicated characters.
* Remove new lines "\n".
* Keep only Arabic text.



In [8]:
# One hand-picked row label per dialect. Each label must belong to that dialect,
# otherwise the lookup on the filtered Series raises KeyError.
sample_row_by_dialect = {
    'EG': 244697,
    'PL': 83068,
    'KW': 372056,
    'LY': 15498,
    'QA': 52000,
    'JO': 180213,
    'LB': 329947,
    'SA': 218582,
    'AE': 431900,
    'BH': 458192,
    'OM': 391170,
    'SY': 126808,
    'DZ': 244696,
    'IQ': 1,
    'SD': 405606,
    'MA': 191750,
    'YE': 228513,
    'TN': 143049,
}

# Print the raw tweet for every dialect, followed by a separator line.
for dialect_code, row_label in sample_row_by_dialect.items():
    print(data['tweets'][data['dialect'] == dialect_code][row_label])
    print('-'*50)

In [89]:
# Characters stripped by clean_text: Arabic-specific marks plus ASCII punctuation.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Dialect code -> country name shown to the user.
# "PL" previously mapped to itself, so predictions for that class were never
# expanded; all values now use the country name consistently.
label_classes = {"LB": "Lebanon","AE": "United Arab Emirates",
                 "PL": "Palestine","SY": "Syria",
                 "TN": "Tunisia","BH": "Bahrain",
                 "OM": "Oman","SA": "Saudi Arabia",
                 "MA": "Morocco","DZ": "Algeria",
                 "JO": "Jordan","IQ": "Iraq",
                 "QA": "Qatar","LY": "Libya",
                 "KW": "Kuwait","YE": "Yemen",
                 "EG": "Egypt","SD": "Sudan",
                }
    
def remove_emojis(text):
    """Return `text` with every run of characters from the ranges below deleted.

    The ranges are broad: besides emoji they also cover other symbol and
    supplementary-plane blocks, so more than pictographs is removed.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # chinese char
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # dingbats
        "\u3030",
    )
    # Assemble a single character class matching one or more of the above.
    emoji_pattern = re.compile("[" + "".join(stripped_ranges) + "]+", re.UNICODE)
    return emoji_pattern.sub('', text)


def normalize_arabic(text):
    """Collapse common Arabic spelling variants onto one canonical letter each."""
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for variant_pattern, canonical in substitutions:
        text = re.sub(variant_pattern, canonical, text)
    return text

def remove_diacritization(text):
    """Delete Arabic diacritic marks and the tatweel (kashida) from `text`.

    The pattern is compiled in verbose mode, so the whitespace and the
    `# ...` notes inside it are ignored by the regex engine.
    """
    diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return diacritics.sub('', text)



def clean_text(text, punctuations=None):
    '''
        Strip tweet noise from `text` and return the cleaned string.

        Steps, in order: remove URLs, remove <...> tags, replace @mentions with
        a space, drop the '#' of hashtags (the word is kept), remove punctuation,
        collapse all whitespace (including new lines) to single spaces, shorten
        any character repeated 3+ times to 2, remove Latin-letter words, and
        trim surrounding whitespace.

        Parameters
        ----------
        text : str
            Raw tweet text.
        punctuations : str, optional
            Characters to delete. Defaults to the module-level
            `punctuations_list`.

        Returns
        -------
        str
            The cleaned text.
    '''
    if punctuations is None:
        punctuations = punctuations_list

    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    # Raw strings: '\S' and '\.' are invalid escapes in a normal string literal.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)               # remove urls
    text = re.sub(r'<.*?>+', '', text)                              # remove tags
    text = re.sub(r'@[^\s]+', ' ', text)                            # remove @user
    text = re.sub(r'#([^\s]+)', r'\1', text)                        # replace #word with word
    if punctuations:                                                # '[]' would be an invalid pattern
        text = re.sub('[%s]' % re.escape(punctuations), '', text)   # remove punctuation
    # New lines are turned into a space here (not deleted) so that words on
    # adjacent lines are not glued together; other whitespace runs collapse too.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(sequencePattern, seqReplacePattern, text)         # 3+ repeats of any character -> 2
    text = re.sub(r'\s*[A-Za-z]+\b', '' , text).strip()             # remove English words, trim both ends
    return text


def preprocess_data(text):
    """Run the complete cleaning pipeline on one tweet and return the result."""
    # Applied left to right: noise removal (urls, mentions, punctuation, ...),
    # diacritics removal, emoji removal, then letter normalization.
    pipeline = (clean_text, remove_diacritization, remove_emojis, normalize_arabic)
    for step in pipeline:
        text = step(text)
    return text


In [10]:
# Hand-written tweets exercising the cleaning steps: diacritics, mentions,
# emojis, URLs, multi-line text, hashtags, tags and repeated letters.
some_sample_testing = [
             "السلام عليكم جميعاً ... ايه الاخبار؟",
             "@Nadjib__FCB ساهم في الفوز بالليغا والكأس😂😂😂😁😁😁",
             "لاتخليني اعرضك الهبوب https://t.co/p9v497dsLP",
             "@Maimunamamri99 عظم الله أجركم.. وربي يرحم موتاكم ويسكنهم فسيح جناته..",
             '''
             @1397Hma @almayassahamad @jaberalharmi قلتها لك يالسلوقي الف مرّه
            لاكن الانجاس مثلك ماتتوب

            عدتها كره وكره ثم كره
            وابشر بقافٍ يمسّك الدروب

            ياحماده والردى ساقك بخرّه
            وجيتنا يالسلح بالهرج الكذوب

            عرّةٍ من صلب عرّه وابن عرّه
            لاتخليني اعرضك الهبوب https://t.co/p9v497dsLP
             ''',
             '''مااحرم نفسي ميسي حريف ولعاب برضو ..
                مدريدي وافتخر 🇪🇸    
                ''',
                " #arabic فاطمه و فاطمة شو الفرق بينهم؟",
                "<arabic> العربيه",
                "الله أكبر..المصرييين جامدييين!",
             "@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. حيونه ووحشيه .. وتطلبون من الغرب يحترمكم ويؤمن بدينكم ولاينعتكم بالإرهاب .."

             ]

# Print the cleaned version of every sample, separated by a rule.
for tweet in some_sample_testing:
    cleaned_tweet = preprocess_data(tweet)
    print(cleaned_tweet)
    print('-'*100)


**Applying Text Preprocessing to the Original Data**

In [11]:
# First rows of the raw data, before the cleaned column is added.
data.head()

In [12]:
# Add the cleaned text next to the raw tweet and put the columns in a fixed order.
data = (
    data
    .assign(clean_tweets=lambda frame: frame['tweets'].apply(preprocess_data))
    .reindex(columns=['id', 'tweets', 'clean_tweets', 'dialect'])
)

data.head(5)

In [13]:
# Spot-check the cleaning: raw vs. cleaned text of the 'IQ' tweet with row label 1.
iraqi_rows = data['dialect'] == 'IQ'
for column in ('tweets', 'clean_tweets'):
    print(data[column][iraqi_rows][1])

In [14]:
# Preview only the cleaned text and its dialect label.
data.loc[:, ['clean_tweets', 'dialect']]

### Down Sampling

**We will cut every dialect down to the size of the smallest class, which has `9246` tweets**

In [20]:
# Class sizes before down-sampling; the smallest one sets the per-class cap.
data.dialect.value_counts()

In [21]:
# Per-class cap for down-sampling: the size of the smallest dialect class.
# Derived from the data (9246 for this CSV) instead of being hard-coded, so it
# stays correct if the input changes; int() gives a plain Python integer.
mini_rows = int(data.dialect.value_counts().min())

In [37]:
# Dialect codes in sorted order. A plain list(set(...)) follows string hash
# order, which differs between interpreter sessions and makes runs non-repeatable.
dialect_labels = sorted(data.dialect.dropna().unique())

In [38]:
# Collect at most `mini_rows` cleaned tweets per dialect, with matching labels.
down_sample_tweets = []
down_sample_labels = []
for dialect in dialect_labels:
    dialect_tweets = data.loc[data['dialect'] == dialect, 'clean_tweets'].iloc[:mini_rows].tolist()
    down_sample_tweets.extend(dialect_tweets)
    # Repeat the label once per tweet actually taken, so both lists keep the
    # same length even when a dialect has fewer than `mini_rows` rows.
    down_sample_labels.extend([dialect] * len(dialect_tweets))

In [40]:
# Tweets and labels must have the same length before they are put in one DataFrame.
len(down_sample_tweets), len(down_sample_labels)

In [45]:
# Build the balanced frame from the collected tweets and labels.
down_sample_df = pd.DataFrame({'clean_tweets': down_sample_tweets,
                              'dialect': down_sample_labels})

# Shuffle the rows. A fixed random_state keeps the order (and everything
# derived from it) identical on every run; random.seed() does not affect pandas.
down_sample_df = down_sample_df.sample(frac=1, random_state=2022).reset_index(drop=True)

down_sample_df.head()

In [87]:
# (rows, columns) of the balanced frame.
down_sample_df.shape

In [46]:
# After down-sampling every dialect should have the same number of rows.
down_sample_df.dialect.value_counts()

## Modeling

In [57]:
# Training 80%, hold-out 20%, stratified by dialect.
train_df, test_df = train_test_split(
    down_sample_df[['clean_tweets', 'dialect']],
    test_size=0.2,
    stratify=down_sample_df['dialect'],
    random_state=2022,
)
len(train_df), len(test_df)

In [58]:
# Split the 20% hold-out in half: 10% of all data for testing, 10% for validation.
# Stratified and seeded so both halves keep the class balance and the split is
# reproducible.
# NOTE: this rebinds `test_df`; re-running this cell on its own halves the test
# set again, so re-run the previous split cell first.
test_df, eval_df = train_test_split(test_df, test_size=0.5,
                                    stratify=test_df.dialect, random_state=2022)
len(test_df), len(eval_df)

In [59]:
# Final sizes of the training, test and validation splits.
len(train_df), len(test_df), len(eval_df)

### MARBERT

In [60]:
%%time

marabert = 'UBC-NLP/MARBERT'
t = text.Transformer(marabert, maxlen=128, class_names = dialect_labels)
trn = t.preprocess_train(train_df.clean_tweets.values.tolist(), train_df.dialect.values.tolist()) # X_train, y_train
val = t.preprocess_test(eval_df.clean_tweets.values.tolist(), eval_df.dialect.values.tolist()) # here used test_df , X_test, y_test

marabert_model = t.get_classifier()
marabert_learner = ktrain.get_learner(marabert_model, train_data=trn, val_data=val, batch_size=8)

marabert_learner.fit_onecycle(5e-5, 1)

print('Classification Report for MARABERT Model')
print(marabert_learner.validate(class_names=t.get_classes()))


**Make Predictions on New Data in Deployment**

In [64]:
# Bundle the trained model with its preprocessor `t` into a predictor object.
marabert_predictor = ktrain.get_predictor(marabert_learner.model, preproc=t)

In [98]:
# A few hand-written sentences to try the predictor on.
check = [
    'والله ما قصرت',
    'ايه دا فيه ايه احنا مش ناقصين',
    'نحنا عنا لما بدنا نحكي مع مجموعة',
    'شنو هذا يا صلاح',
]

# Predict each sentence on its own and translate the dialect code via label_classes.
[label_classes[marabert_predictor.predict(sentence)] for sentence in check]

**Classification Report Testing data**

In [67]:
# Evaluate on the held-out test split.
X_test, y_test = (test_df[column].values.tolist() for column in ('clean_tweets', 'dialect'))

y_pred = marabert_predictor.predict(X_test)

test_report = classification_report(y_test, y_pred)
print(f'The Results Of Classification Report By MARABERT for Testing data: \n{test_report}')


In [70]:
# Save the predictor under 'marabert_predictor_dialect' (path relative to the working directory).
marabert_predictor.save('marabert_predictor_dialect')

In [71]:
# Load the predictor back from the path it was saved to.
loaded_marabert_predictor = ktrain.load_predictor('marabert_predictor_dialect')

In [99]:
# Sanity-check the reloaded predictor on one sentence and map the predicted code through label_classes.
label_classes[loaded_marabert_predictor.predict("حد شاف الواد احمد ي جدعااان")]

**Summary:**
* The original paper used all `458197` records and reported `60.6%`.
* Here we used only `166428` records (the down-sampled data) and got `56%`.