# 1 - text processing

In [106]:
import re


# Clean/Normalize Arabic Text
def clean_str(text):
    """Normalize Arabic text and split it into a list of word tokens.

    Steps, in order:
      1. strip tashkeel (diacritic marks);
      2. collapse elongation: every run of a repeated character is cut down
         to two, then doubled و / ي / ا are reduced to a single letter;
      3. apply the literal ``search`` -> ``replace`` substitutions below
         (unify alef variants, map ى to ي, drop or blank out punctuation and
         tatweel, pad ? ؟ ! with spaces so they become separate tokens);
      4. split on spaces.

    Args:
        text: raw input string.

    Returns:
        List of non-empty tokens. Empty or blank input yields ``[]``.
    """
    search = ["أ","إ","آ","_","-","/",".","،"," و ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!','  ']
    replace = ["ا","ا","ا"," "," ","","",""," و","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ',' ']

    # remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)

    # remove elongation: keep at most two of any repeated character
    p_elongation = re.compile(r'(.)\1+')
    text = re.sub(p_elongation, r"\1\1", text)

    # the long vowels are the letters that get stretched, so reduce them to one
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    for old, new in zip(search, replace):
        text = text.replace(old, new)

    # The single-pass '  ' -> ' ' replacement above can leave double spaces
    # behind (e.g. "a - b" becomes "a  b"), so drop the empty tokens instead
    # of passing '' on to the lemmatizer.
    stripped_parts = (part.strip() for part in text.strip().split(" "))
    return [part for part in stripped_parts if part]

In [107]:
# Stand-alone copy of the substitution table used for inspection; compared
# with the lists inside clean_str it carries one extra rule, "ة" -> "ه".
search = ["أ","إ","آ","ة","_","-","/",".","،"," و ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!','  ']
replace = ["ا","ا","ا","ه"," "," ","","",""," و","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ',' ']

# Print every (pattern, replacement) pair to check the two lists line up.
for old, new in zip(search, replace):
    print((old, new))
    

('أ', 'ا')
('إ', 'ا')
('آ', 'ا')
('ة', 'ه')
('_', ' ')
('-', ' ')
('/', '')
('.', '')
('،', '')
(' و ', ' و')
('"', '')
('ـ', '')
("'", '')
('ى', 'ي')
('\\', '')
('\n', ' ')
('\t', ' ')
('&quot;', ' ')
('?', ' ? ')
('؟', ' ؟ ')
('!', ' ! ')
('  ', ' ')


In [108]:
# Smoke test: clean a colloquial sentence containing an elongated word
# ("صبااااح") and an alef-with-hamza ("أختبار"), then show the first five tokens.
words = clean_str("صبااااح الخير دة مجرد أختبار عشان بس اشوف شغال ولا لا ")
words[0:5]

['صباح', 'الخير', 'دة', 'مجرد', 'اختبار']

# lemmatization

In [109]:
#https://farasa.qcri.org/lemmatization/

import json
import requests

URL = 'https://farasa.qcri.org/webapi/lemmatization/'
# NOTE(review): this credential is hard-coded in the source; consider loading
# it from an environment variable or an untracked config file instead.
API_KEY = "mgmfGsWLmIFJznbMaC"
LEMMATIZATION_FILE = "lemmatization.json"

# Local cache of word -> lemma pairs, so each word is only sent to the web
# service once. lemmatization() rewrites this file whenever it adds entries,
# so on a first run the file may not exist yet: start with an empty cache
# instead of failing.
try:
    with open(LEMMATIZATION_FILE, 'r') as json_file:
        lemmatization_table = json.load(json_file)
except FileNotFoundError:
    lemmatization_table = {}


def fetch(arr, timeout=30):
    """Send words to the Farasa lemmatization endpoint and return its result.

    Args:
        arr: list of words; they are joined with single spaces into one
            request text.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``'text'`` field of the JSON response. lemmatization() pairs it
        element-wise with the words sent, so it is presumably a list of
        lemmas in input order -- confirm against the Farasa API docs.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no response arrived within ``timeout`` seconds.
    """
    payload = {'text': " ".join(arr), 'api_key': API_KEY}
    response = requests.post(URL, data=payload, timeout=timeout)
    # Fail loudly on an error status rather than trying to parse an error page.
    response.raise_for_status()
    return response.json()['text']

def lemmatization(arr):
    """Return the lemma of every word in ``arr``, using the on-disk cache.

    Words missing from ``lemmatization_table`` are sent to the web service in
    a single ``fetch`` call; the new pairs are added to the table and the
    table is written back to ``LEMMATIZATION_FILE``.

    Args:
        arr: list of (already cleaned) words.

    Returns:
        List of lemmas, one per input word, in input order.

    Raises:
        ValueError: the service returned a different number of items than
            the number of words sent, so the results cannot be paired safely.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so a word
    # repeated in the input is only sent to the service once.
    missing = [word for word in dict.fromkeys(arr)
               if word not in lemmatization_table]

    if missing:
        lemmas = fetch(missing)
        # zip() would silently truncate or misalign on a count mismatch and
        # the wrong pairs would then be persisted to the cache file.
        if len(lemmas) != len(missing):
            raise ValueError(
                "lemmatization service returned %d items for %d words"
                % (len(lemmas), len(missing))
            )
        lemmatization_table.update(zip(missing, lemmas))

        with open(LEMMATIZATION_FILE, 'w') as cache_file:
            json.dump(lemmatization_table, cache_file)

    return [lemmatization_table[word] for word in arr]

In [150]:
# End-to-end check: normalise a sentence into tokens (note that `text` then
# holds a list, not a string), and look up the lemma of each token.
text = clean_str("السيارة تسير على الطريق بسرعة كبيرة")
results = lemmatization(text)

In [151]:
# Show the cleaned tokens produced by clean_str.
text

['السيارة', 'تسير', 'علي', 'الطريق', 'بسرعة', 'كبيرة']

In [152]:
# Show the lemma returned for each token.
results

['سيارة', 'سار', 'علي', 'طريق', 'سرعة', 'كبير']

# other cleaning

In [169]:
# Prepositions that are dropped from the token stream. Each one is listed in
# both spellings, with ى and with ي: clean_str rewrites ى to ي, so tokens
# coming out of the pipeline only ever match the ي forms.
_STOP_WORDS = frozenset([
    'فى', 'في',
    "الى", "الي",
    "على", "علي",
    "عن",
])


def valid_word(word):
    """Return True when ``word`` should be kept, False when it is a stop word."""
    return word not in _STOP_WORDS


def clean_2(arr):
    """Return the words of ``arr`` accepted by ``valid_word``, in input order."""
    return [candidate for candidate in arr if valid_word(candidate)]

In [170]:
# Drop the stop words from the lemmatized tokens and show what is left.
results = clean_2(results)
results

['سيارة', 'سار', 'طريق', 'سرعة', 'كبير']

# synonyms

In [176]:
SYNONYMS_FILE = "synonyms.json"
WORD_FILE = "word.txt"

# synonyms_table maps a word to the word that should be emitted for it.
# save_synonyms() creates this file, so it may be absent on a first run:
# start with an empty table instead of failing.
try:
    with open(SYNONYMS_FILE, 'r') as json_file:
        synonyms_table = json.load(json_file)
except FileNotFoundError:
    synonyms_table = {}

# word_list holds one word per line of WORD_FILE.
# NOTE(review): assumes the word list is stored as UTF-8 -- confirm. An
# explicit encoding keeps the Arabic text from depending on the platform
# default.
try:
    with open(WORD_FILE, 'r', encoding='utf-8') as words_file:
        # Skip blank lines (typically the one after a trailing newline) so
        # get_words() never registers '' as a word.
        word_list = [line for line in words_file.read().split("\n")
                     if line.strip()]
except FileNotFoundError:
    word_list = []
    

def save_synonyms():
    """Overwrite SYNONYMS_FILE with the JSON form of synonyms_table."""
    with open(SYNONYMS_FILE, 'w') as out:
        out.write(json.dumps(synonyms_table))
    
def add_new(keys=(), values=(), key=None, value=None):
    """Add one or many entries to synonyms_table and save it to disk.

    Either pass a single pair through ``key``/``value``, or several pairs
    through the parallel sequences ``keys``/``values``. When ``key`` is
    given, ``keys``/``values`` are ignored.

    Args:
        keys: words to register (batch form).
        values: replacement word for each entry of ``keys``, same length.
        key: single word to register.
        value: replacement word for ``key``.

    Raises:
        ValueError: ``keys`` and ``values`` differ in length, so the pairs
            cannot be matched up (zip would silently drop the surplus).
    """
    if key is None:
        keys, values = list(keys), list(values)
        if len(keys) != len(values):
            raise ValueError(
                "keys and values must have the same length (%d != %d)"
                % (len(keys), len(values))
            )
        synonyms_table.update(zip(keys, values))
    else:
        synonyms_table[key] = value

    save_synonyms()
    
def get_words():
    """Map every entry of word_list to itself in synonyms_table, then save.

    An existing entry for the same word is overwritten.
    """
    synonyms_table.update((entry, entry) for entry in word_list)
    save_synonyms()
    
    
# Seed the table with the word list, then dump every (word, replacement) pair.
get_words()
for source_word, target_word in synonyms_table.items():
    print((source_word, target_word))

('اخت', 'اخت')
('اخ', 'اخ')
('يذهب', 'يذهب')
('يشجع', 'يشجع')
('سيارة', 'سيارة')
('سرعة', 'سرعة')
('كبير', 'كبير')


In [177]:
# Map three movement verbs onto "ذهب". A single batched call registers them
# in the same order as before, drops the duplicated "تحرك" call, and rewrites
# the synonyms file once instead of four times.
# NOTE(review): "مشى" is spelled with ى, which clean_str rewrites to ي --
# confirm this key can actually match tokens coming out of the pipeline.
add_new(
    keys=["تحرك", "مشى", "سار"],
    values=["ذهب", "ذهب", "ذهب"],
)


In [178]:
# Dump the table again to confirm the new entries were added.
for source_word, target_word in synonyms_table.items():
    print((source_word, target_word))

('اخت', 'اخت')
('اخ', 'اخ')
('يذهب', 'يذهب')
('يشجع', 'يشجع')
('سيارة', 'سيارة')
('سرعة', 'سرعة')
('كبير', 'كبير')
('تحرك', 'ذهب')
('مشى', 'ذهب')
('سار', 'ذهب')


In [179]:
def translate(arr):
    """Replace each word by its synonyms_table entry, spelling out unknown words.

    A word found in ``synonyms_table`` contributes its mapped value; any
    other word contributes its individual letters, one list item per letter.

    Args:
        arr: list of words.

    Returns:
        Flat list of mapped words and single letters, in input order.
    """
    output = []
    for token in arr:
        if token not in synonyms_table:
            # Unknown word: fall back to spelling it letter by letter.
            output.extend(token)
            continue
        output.append(synonyms_table[token])
    return output

In [180]:
# Example: "سار" is not in the word list, but synonyms_table maps it to "ذهب",
# so "ذهب" is emitted in its place.
# "طريق" has no entry in the table at all, so it is spelled out letter by letter.
translate(results)

['سيارة', 'ذهب', 'ط', 'ر', 'ي', 'ق', 'سرعة', 'كبير']

# stemming (might be used in the future)

In [1]:
import nltk

In [2]:
# NLTK's ISRI stemmer; API reference:
# https://www.nltk.org/api/nltk.stem.isri.html
obj = nltk.stem.isri.ISRIStemmer()


In [15]:
# Print the ISRI stem of several related word forms to compare the results.
for sample in ["المتحرك","الحركة","حركة","يتحرك","متحرك"]:
    print(obj.stem(sample))

تحر
حرك
حرك
تحر
تحر


In [17]:
# Print the ISRI stem of a second, unrelated set of words.
for sample in ["الاستنشاق","المحبة","الرحمة","يذهب","السما"]:
    print(obj.stem(sample))

نشق
حبة
رحم
ذهب
سما
