# NLP Project (Arabic Dialect Classification) Preprocessing

### Importing necessary libraries

In [1]:
import pandas as pd

import emoji
import re
import tashaphyne.normalize as normalize
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading the data and labels files

In [2]:
# Load the raw text file (columns shown in the output below: id, text).
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where the file exists at exactly this location.
# lineterminator='\n' makes the parser end rows on '\n' only; presumably this
# guards against stray '\r' characters inside the text — TODO confirm.
data= pd.read_csv(r"C:\Users\dell\Downloads\text.csv", lineterminator='\n')
data.head()

Unnamed: 0,id,text
0,1009754958479151232,@toha_Altomy @gy_yah ูููููู ุงุฏุจ ูููุงูููู. ูู ุง...
1,1009794751548313600,@AlmFaisal ๐๐ ุงูููุจููู ูุชููุจูู!!!\nุจุณ ุจุงููุณุจุฉ ...
2,1019989115490787200,@smsm071990 @ALMOGRBE ูู 20 ุชุงููู ุดุงุจ ููุจู ุจูุฑ...
3,1035479791758135168,@AboryPro @lyranoo85 ุฑุงููุง ุนูููุชู ูุชุฎููุฉ. ุงููุง...
4,1035481122921164800,@lyranoo85 ุดููู ูุชุนูุฏุฉ ุนูุดุงู ุงูุฑุงุฌู ูู ุชุญุจูู ุง...


In [3]:
# Load the label file (columns shown in the output below: id, dialect).
# NOTE(review): absolute, machine-specific path — see the text file load as well.
labels= pd.read_csv(r"C:\Users\dell\Downloads\dialect.csv")
labels.head()

Unnamed: 0,id,dialect
0,1009754958479151232,LY
1,1009794751548313600,LY
2,1019989115490787200,LY
3,1035479791758135168,LY
4,1035481122921164800,LY


In [4]:
# Attach each text's dialect label by joining on the shared `id` column, then
# shuffle all rows (frac=1 keeps every row). A fixed random_state makes the
# shuffle reproducible, so a fresh run of the notebook yields the same order.
# NOTE: this cell rebinds `data`; re-running it without first re-running the
# load cell would merge the labels a second time.
data = pd.merge(data, labels, on='id').sample(frac=1, random_state=42)

In [5]:
# Preview the first rows of the merged, shuffled frame.
data.head(15)

Unnamed: 0,id,text,dialect
55835,276858249440002048,@7ely ุทุจุนุง ุงูุฌุจูู ุงููุณุชู ูุณูุชู ุฃุตุญุงุจู,EG
109904,1173548858070224896,ุนููุฑุฉ ุฑูุญุฉ ุนุฑููุง ุจุชุณูุงู https://t.co/LMHBjxbXwN,LB
115073,991429222194401152,ุงูุง ุจุณ ุฎูุตุช ุงููุจุงุฑุงุฉ https://t.co/bozdGhWalF,LB
82486,860189352613347328,@amiraahmmed102 @han730oz ูู ุงูุญุจ ููุฌูุฏ ูุชุชูุงุฒ...,EG
15497,780331891878658048,@LyConsigliere ูุธุงุฑุฉ ูุฑุงุกุฉ ููู ุถุนู ูุธุฑ ุจุงูู ุง...,LY
71110,1120016606334259200,- ุฑุงูุญุฉ ุจููุง ุนูู ููู ูุง ูุตุฑ ุ \n- ูุด ุงูุง ุงููู ...,EG
28146,973698843819888640,@Dana90N ูููู ุงูู ุดู ููุชุฑู ุงูุณูุฑููู,LY
69537,950812893322317824,@MariamAlAhmadi ุนูุฑู ููุจููุฉ ุงู ุดุงุก ุงููู ๐๐ผ,EG
28664,701497230625259520,ุตุฑุงุญุฉ ุนูู ูุจุฑ ุงููุฑุญุฉ ุงูู ุบุงูุฑุชูู ูู ุงูุงูุชุตุงุฑุงุช...,LY
84599,1129117968867168384,@MariomALIIIIL ุงูุงู ุงูููุฏ ููู ููุงุนูุฏู ูุด ุดุงููู...,EG


### Our 5 Classes

In [6]:
# List the distinct values of the `dialect` column in the labels frame.
labels.dialect.unique()

array(['LY', 'MA', 'EG', 'LB', 'SD'], dtype=object)

### Preprocessing

##### Replace new lines from the data with a space

In [7]:
def replace_newlines(txt):
    """Return ``txt`` with every newline character swapped for a single space."""
    pieces = txt.split('\n')
    return ' '.join(pieces)

##### Remove user mentions (@user) and any whitespace that follows them

In [8]:
def remove_tag(txt):
    """Strip ``@mention`` tokens from ``txt``.

    Every ``@`` that is followed by one or more word characters is deleted,
    together with any whitespace directly after it.
    """
    mention_pattern = re.compile(r'@\w+\s*')
    return mention_pattern.sub('', txt)

##### Remove links and any whitespace that follows them

In [9]:
def remove_links(txt):
    """Delete link-like tokens from ``txt``.

    A token is removed when it starts with ``http`` or ``https`` and continues
    with at least one more non-whitespace character; whitespace directly after
    the token is removed as well.
    """
    without_links = re.sub(pattern=r'https?\S+\s*', repl='', string=txt)
    return without_links

##### Remove Latin-alphabet (English) words and any whitespace that follows them

In [10]:
def remove_english(txt):
    """Drop every run of ASCII letters (a-z, A-Z) and the whitespace right after it."""
    latin_run = re.compile(r'[a-zA-Z]+\s*')
    cleaned = latin_run.sub('', txt)
    return cleaned

##### Remove all emojis

In [11]:
def remove_emoji(txt):
    """Return ``txt`` with emoji removed.

    Delegates to ``emoji.replace_emoji`` with an empty replacement string.
    This function does no whitespace handling of its own, so any spaces that
    surrounded an emoji remain in the result.
    """
    return emoji.replace_emoji(txt, '')

##### Remove punctuation marks and underscores <br>(these marks are often used to build old-style text emoticons)

In [12]:
def remove_punctuation(txt):
    """Delete every character that is neither a word character nor whitespace.

    Underscores are deleted too, even though the regex class ``\\w`` counts
    them as word characters.
    """
    unwanted_chars = re.compile(r'[^\w\s]|[_]')
    return unwanted_chars.sub('', txt)

##### Normalize laughter sounds ("ููู", "ูููู") to a single instance ("ูู")

In [13]:
def map_laughter(txt):
    """Shorten elongated laughter to its two-letter form.

    The pattern matches the two-letter sequence followed by one or more
    further letters and substitutes the two-letter sequence alone; an
    occurrence that is already only two letters long does not match and is
    left unchanged.
    """
    # NOTE(review): assumes every literal letter in the pattern and in the
    # replacement is the same Arabic laughter letter — confirm in the notebook.
    return re.sub(r'(ูู)ู+', 'ูู', txt)

##### Collapse runs of three or more identical characters into a single character

In [14]:
def remove_repeated_letters(txt):
    """Collapse any character repeated three or more times in a row into one.

    Runs of exactly two identical characters are kept as they are. The ``.``
    in the pattern does not match a newline, so runs of newlines are untouched.
    """
    elongated_run = re.compile(r'(.)\1{2,}')
    return elongated_run.sub(r'\1', txt)

##### Remove numbers

In [23]:
def remove_numbers(txt):
    """Return ``txt`` with every run of digit characters deleted."""
    digit_run = re.compile(r'\d+')
    digits_removed = digit_run.sub('', txt)
    return digits_removed

##### Character normalization

In [15]:
def normalize_arabic(txt):
    """Return ``txt`` normalized by tashaphyne's ``normalize_searchtext``.

    This is a thin wrapper: the exact character mappings are defined by the
    tashaphyne library, not here.
    """
    return normalize.normalize_searchtext(txt)

##### Remove stop words

In [17]:
def remove_stop_words(txt, stop_words):
    """Tokenize ``txt`` and drop every token found in ``stop_words``.

    Parameters
    ----------
    txt : str
        Text to filter; it is split into tokens with NLTK's ``word_tokenize``.
    stop_words : container of str
        Tokens to discard; anything supporting ``in`` works.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(txt):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

##### Remove repeated spaces

In [16]:
def remove_repeated_spaces(txt):
    """Squeeze each run of two or more whitespace characters into one space.

    Leading and trailing whitespace is stripped from the result. A single
    whitespace character in the middle (even a tab or newline) is left as is.
    """
    collapsed = re.sub(r'\s{2,}', ' ', txt)
    return collapsed.strip()

##### Preprocessing function

In [24]:
def preprocessing(data, col_name='text'):
    """Run the full text-cleaning pipeline on one column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text. It is copied first, so the caller's frame
        is not modified.
    col_name : str, default 'text'
        Name of the column to clean; every value is expected to be a string.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``col_name`` column holds the cleaned text.
    """
    copy_data = data.copy()

    # The steps run in this exact order. Mentions and links are stripped before
    # the Latin-letter and punctuation filters, which would otherwise break
    # those tokens into fragments.
    cleaning_steps = (
        replace_newlines,
        remove_tag,
        remove_links,
        remove_english,
        remove_emoji,
        remove_punctuation,
        map_laughter,
        normalize_arabic,
        remove_repeated_letters,
        remove_numbers,
    )
    for step in cleaning_steps:
        copy_data[col_name] = copy_data[col_name].apply(step)

    # Normalize the stop words the same way as the text so they can match, and
    # keep them in a set so each per-token membership test is a constant-time
    # lookup rather than a scan over a list.
    ar_stop_words = {normalize_arabic(word) for word in stopwords.words('arabic')}
    copy_data[col_name] = copy_data[col_name].apply(remove_stop_words, stop_words=ar_stop_words)

    # Stop-word removal re-joins tokens with single spaces; this final pass
    # also trims the ends of each string.
    copy_data[col_name] = copy_data[col_name].apply(remove_repeated_spaces)

    return copy_data

In [25]:
# Clean the default `text` column. `data` itself is left untouched because
# preprocessing works on a copy.
clean_data= preprocessing(data)

In [22]:
# Display the first rows of the original (uncleaned) frame.
data.head(4)

Unnamed: 0,id,text,dialect
55835,276858249440002048,@7ely ุทุจุนุง ุงูุฌุจูู ุงููุณุชู ูุณูุชู ุฃุตุญุงุจู,EG
109904,1173548858070224896,ุนููุฑุฉ ุฑูุญุฉ ุนุฑููุง ุจุชุณูุงู https://t.co/LMHBjxbXwN,LB
115073,991429222194401152,ุงูุง ุจุณ ุฎูุตุช ุงููุจุงุฑุงุฉ https://t.co/bozdGhWalF,LB
82486,860189352613347328,@amiraahmmed102 @han730oz ูู ุงูุญุจ ููุฌูุฏ ูุชุชูุงุฒูู ููููุฑูู ููู ูุง ูุชุชูุงุฒูู ุงูุชุฑ ูููุฑูู ุงูุชุฑ ููู ูุง ูุญุณ ุงูู ุงูุชููู ุงูุชุฑ ููุจุนุฏ ุงูุชุฑ ุญุชู ูู ุจูููุช ูููู ุตุฏูููู,EG


In [20]:
# Show full cell contents and lift the row limit so the first 100 cleaned
# texts can be read in full. The complete option name 'display.max_rows' is
# spelled out: pandas resolves abbreviated option names by pattern matching,
# which can break if a similarly named option is added later.
# NOTE: set_option changes these display settings for the rest of the session.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
clean_data.text[:100]

55835                                                                                                                                                                                                                                  ุทุจุนุง ุงูุฌุจูู ุงููุณุชู ูุณูุชู ุงุตุญุงุจู
109904                                                                                                                                                                                                                                         ุนููุฑู ุฑูุญู ุนุฑููุง ุจุชุณูุงู
115073                                                                                                                                                                                                                                                   ุฎูุตุช ุงููุจุงุฑุงู
82486                                                                                                                                                   

In [None]:
# Write the cleaned frame to the working directory. The index is written as
# well (to_csv default), so the file gets an unnamed leading column holding
# the row labels carried over from the shuffled frame.
clean_data.to_csv("Cleaned.csv")