### Add all imports


In [1]:
from utils import *
import re
from pyarabic.araby import strip_diacritics
import numpy as np


### Read Data

In [2]:
# Load the training and dev (validation) splits. The loader functions are not defined
# in this notebook; presumably they come from the `utils` star import — confirm there.
# `training_dataset` is later iterated as a sequence of text lines by the cleaning cell.
training_dataset = read_training_dataset()
dev_dataset = read_dev_dataset()
# Loading of the test split is currently disabled.
# test_dataset = read_test_dataset()


Read training set successfully
Read validation set successfully


### Pre-process and clean data

In [3]:
# 1- Clean the data

# Characters to DROP: everything that is not a code point in one of the three
# Arabic Unicode ranges below (letters and diacritics live there), whitespace
# (including newlines), or one of the listed punctuation marks.
pattern = re.compile(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s,.؟،;:!?\-\'"]')

# Remove the unwanted characters, then collapse every run of two or more
# whitespace characters into a single space.
# The whitespace pattern is a raw string: "\s" inside a normal string literal is
# an invalid escape sequence and triggers a warning on recent Python versions.
cleaned_corpus = [pattern.sub("", t) for t in training_dataset]
cleaned_corpus = [re.sub(r"\s\s+", " ", c) for c in cleaned_corpus]

# data[i]   -> fragment with diacritics removed (model input)
# labels[i] -> the same fragment with its diacritics (model target)
data, labels = [], []

for c in cleaned_corpus:
    # Split on punctuation so that each fragment becomes one example.
    for sentence in re.split(r"[,.؟،;:!?']+", c):
        label = sentence.strip()
        plain = strip_diacritics(sentence).strip()
        # Keep or drop the pair TOGETHER. Filtering the two lists independently
        # would shift every following pair whenever a fragment consists only of
        # diacritics (empty after stripping, but non-empty as a label).
        if plain and label:
            data.append(plain)
            labels.append(label)

# Cheap invariant check: every input must have exactly one target.
assert len(data) == len(labels), "data and labels must stay aligned"

# write the clean corpora to file (one entry per line)
with open("./output_data/cleaned_corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in cleaned_corpus)

with open("./output_data/training_data.txt", "w", encoding="utf-8") as f:
    f.writelines(str(plain_line) + "\n" for plain_line in data)

with open("./output_data/labeled/training_labels.txt", "w", encoding="utf-8") as f:
    f.writelines(str(label_line) + "\n" for label_line in labels)

In [4]:
# 2- Tokenize to vocab and words
vocab = set()
tokenized_data = []
(
    vocab,
    tokenized_word_sentence,
    tokenized_letter_sentence,
    tokenized_diacritics_sentence,
) = tokenize_to_vocab(labels, vocab)
stemmedVocab = []

# Every tokenizer output is dumped to its own text file, one str()-formatted item per line.
# Per the original notes, each tokenized sentence is a list that starts with <s> and ends
# with </s>; in between are words, lists of letters, or lists of diacritics respectively.
tokenizer_outputs = [
    ("./output_data/vocab.txt", vocab),  # one vocab word per line
    ("./output_data/tokenized_data_sentences.txt", tokenized_word_sentence),
    ("./output_data/tokenized_data_letters.txt", tokenized_letter_sentence),
    ("./output_data/tokenized_data_diacritics.txt", tokenized_diacritics_sentence),
]

for out_path, items in tokenizer_outputs:
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(item) + "\n" for item in items)


### Features & embeddings

In [5]:
# 1- Fetch the letter inventory, the diacritic inventory and the diacritic -> id lookup
letters = get_letters()
diacritics = get_diacritics()
diacritics2id = get_diacritics2id()

# 2- Build the lookup tables; an item's id is its position in the inventory
#### Letters <-> IDs
letters2id = {letter: idx for idx, letter in enumerate(letters)}
id2letters = dict(enumerate(letters))
# quick manual checks: letters2id['م'], id2letters[34]

#### IDs -> Diacritics (the reverse direction, diacritics2id, is fetched above)
id2diacritics = dict(enumerate(diacritics))
# quick manual checks: diacritics2id['ُ'], id2diacritics[2]


In [6]:

# We map the letters and diacritics to an embedding of size 36 with the letters
# NOTE(review): the recorded output of this cell is
#   TypeError: map_data() missing 1 required positional argument: 'data_raw'
# so the call below needs a `data_raw` argument. Which variable to pass cannot be
# determined from this notebook — check map_data's signature in `utils` before fixing.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the
# session; consider renaming both results once the call is fixed.
input, output = map_data()


TypeError: map_data() missing 1 required positional argument: 'data_raw'

### Model

In [None]:
#1- We then pass the embeddings to an RNN Model
# This model consists of an embedding layer, an RNN layer, and a fully connected layer. 
# The embedding layer transforms the input words (represented as integers) into dense vectors of fixed size. 
# The RNN layer processes these word embeddings sequentially, 
# maintaining an internal state that encodes information about the sequence so far.
# The fully connected layer transforms the output of the RNN layer to the desired output size.

In [None]:
#2- Get Accuracy


### Prediction

In [None]:
# Run `predict` on a sample Arabic sentence and print whatever it returns.
# NOTE(review): `model` is not assigned in any visible cell (the Model section above
# contains only comments), so unless the `utils` star import provides it this cell fails
# with NameError on a fresh kernel — build/train the model first.
# `predict` is presumably provided by the `utils` star import as well; confirm.
print(predict("العصفور فوق الشجرة", model))