### Imports


In [None]:
%pip install tensorflow
%pip install numpy
%pip install farasapy
%pip install pyarabic


nltk.download('punkt')

In [1]:
from Preprocessing import *
import nltk
from farasa.segmenter import FarasaSegmenter



### 1- Read Data


In [2]:
# 1-Read Datasets
# Load the training and dev corpora. The two reader helpers are not defined in
# this notebook; presumably they come from `from Preprocessing import *`.
# The cleaning step below iterates `training_dataset` and applies re.sub to each
# item, so it is expected to be an iterable of strings.
training_dataset = read_training_dataset()
dev_dataset = read_dev_dataset()

Read training set successfully
Read validation set successfully


### 2- Clean Data


In [3]:
# 2-Data cleaning
# Characters to KEEP: the Arabic Unicode ranges U+0600-U+06FF, U+0750-U+077F and
# U+08A0-U+08FF (letters and diacritics), whitespace, and the punctuation marks
# , . ؟ ، ; : ! ? - ' "
# The character class is negated, so everything else is matched and removed.
# NOTE(review): `re` is not imported above this cell; presumably it is
# re-exported by `from Preprocessing import *` -- confirm.
pattern = re.compile(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s,.؟،;:!?\-\'"]')

# Raw string on purpose: "\s" inside a normal string literal is an invalid
# escape sequence (a warning on current Python versions).
whitespace_run = re.compile(r"\s\s+")

# Strip the disallowed characters, then collapse every run of two or more
# whitespace characters (newlines included) into a single space.
cleaned_corpus = [whitespace_run.sub(" ", pattern.sub("", t)) for t in training_dataset]

print(len(cleaned_corpus))
# write the clean corpus to file
with open("cleaned_corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in cleaned_corpus)

50000


In [4]:
# Cut every cleaned line into segments at runs of punctuation. Each segment is
# stored twice, at the same index:
#   labels -> the segment as it appears in the cleaned corpus
#   data   -> the segment after extract_arabic_letters (letters without diacritics)
# Both are stripped of surrounding whitespace. Empty segments are NOT filtered
# out, so the two lists always have the same length.
segment_splitter = re.compile(r"[,.؟،;:!?']+")

data = []
labels = []
for line in cleaned_corpus:
    segments = segment_splitter.split(line)
    labels.extend(segment.strip() for segment in segments)
    data.extend(extract_arabic_letters(segment).strip() for segment in segments)

# Persist both lists, one segment per line
with open("training_data.txt", "w", encoding="utf-8") as f:
    f.writelines(str(d) + "\n" for d in data)

with open("training_labels.txt", "w", encoding="utf-8") as f:
    f.writelines(str(l) + "\n" for l in labels)

In [5]:
# Sanity check: print the number of entries in data and in labels.
# Both lists are built segment-by-segment from the same split, so the two
# numbers are expected to match.
print(len(data))
print(len(labels))
# Disabled scratch lines:
# # data.remove("")
# print(data[9])

212012
212012


### 3- Tokenize Data


In [6]:
# 3.1 Tokenize `data` into a vocabulary and stem every vocabulary entry
vocab = set()
vocab, tokenized_data = tokenize_to_vocab(data, vocab)

# One stem per vocabulary entry, kept as a list (duplicates are not removed).
# `vocab` is not modified between here and the write below, so both files list
# their entries in the same iteration order.
stemmedVocab = [stem(entry) for entry in vocab]

with open("vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(str(entry) + "\n" for entry in vocab)

with open("stemmedvocab.txt", "w", encoding="utf-8") as f:
    f.writelines(str(stemmed) + "\n" for stemmed in stemmedVocab)

with open("tokenized_data_sentences.txt", "w", encoding="utf-8") as f:
    f.writelines(str(sentence) + "\n" for sentence in tokenized_data)

In [7]:
# 3.2 Tokenize the label segments
# Pass the fresh `diacritics` set, mirroring how the `data` cell passes an
# empty `vocab`. This cell previously created `diacritics` but passed `vocab`
# instead, which made the result depend on whatever `vocab` currently holds
# (the BPE cell later rebinds it to a dict).
# NOTE(review): presumably the second argument is the set tokens are collected
# into -- confirm against tokenize_to_vocab in Preprocessing.
diacritics = set()
diacritics, tokenized_labels = tokenize_to_vocab(labels, diacritics)

# Tokenized label sentences, one per line
with open("tokenized_label_sentences.txt", "w", encoding="utf-8") as f:
    for tl in tokenized_labels:
        f.write(str(tl) + "\n")

# Distinct label tokens, one per line
with open("diacritics.txt", "w", encoding="utf-8") as f:
    for d in diacritics:
        f.write(str(d) + "\n")

# Same tokens in the same order, each passed through extract_diacritics
with open("diacritics_encoded.txt", "w", encoding="utf-8") as f:
    for d in diacritics:
        f.write(str(extract_diacritics(str(d))) + "\n")

In [8]:
# Scratch example (disabled): join letters with decimal-encoded diacritics.
# letters = u"هاربا"
# encoded_marks = 40610
# x= join_word_diacritics(letters, encoded_marks, "decimal")

# Diacritic codes as noted by the author (transliterated names):
# fatha = 4, damma = 5, kasra = 6, sukun = 7
# tanween fatha = 1, tanween damma = 2, tanween kasra = 3
# shadda = 70
# print(x)

# print(extract_diacritics(x))
print(word_to_embedding("ببب"))
print(ord("ـ"))  # author's note: "1568 - 1610" -- presumably the code-point range of the Arabic letters; TODO confirm

4728
1600


In [9]:
from collections import defaultdict, Counter
import re

In [17]:
# Byte pair encoding (BPE)
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs across the vocabulary.

    Args:
        vocab: mapping from a word written as whitespace-separated symbols
            to that word's frequency.

    Returns:
        dict mapping (left_symbol, right_symbol) tuples to the summed
        frequency of every word occurrence containing that adjacent pair.
        Keys appear in first-seen order.
    """
    pair_counts = {}
    for word, freq in vocab.items():
        symbols = word.split()
        # zip against the shifted list yields every adjacent pair, in order
        for pair in zip(symbols, symbols[1:]):
            if pair in pair_counts:
                pair_counts[pair] += freq
            else:
                pair_counts[pair] = 0 + freq
    return pair_counts


def merge_vocab(pair, vocab_in):
    """Merge every occurrence of an adjacent symbol pair into a single symbol.

    The merge works on whole symbols. A plain substring replace of
    "left right" would also fire when `left` is only the tail of a longer
    symbol (e.g. pair ("a", "b") inside "xa b"), gluing unrelated symbols
    together.

    Args:
        pair: (left_symbol, right_symbol) tuple to merge.
        vocab_in: mapping from a word written as whitespace-separated
            symbols to its frequency.

    Returns:
        A new dict with the same frequencies, keyed by the words after
        merging. Words without the pair are carried over unchanged. If two
        input words end up with the same key, their frequencies are summed.
    """
    left, right = pair
    merged_symbol = left + right
    vocab_out = {}
    for word, freq in vocab_in.items():
        symbols = word.split()
        merged = []
        idx = 0
        while idx < len(symbols):
            is_pair = (
                idx + 1 < len(symbols)
                and symbols[idx] == left
                and symbols[idx + 1] == right
            )
            if is_pair:
                merged.append(merged_symbol)
                idx += 2  # consume both symbols; matches are non-overlapping, left to right
            else:
                merged.append(symbols[idx])
                idx += 1
        w_out = " ".join(merged)
        vocab_out[w_out] = vocab_out.get(w_out, 0) + freq
    return vocab_out

# Flatten every label segment into one list of whitespace-separated words
words = [token for segment in labels for token in segment.split()]

# Word -> frequency, then re-key each word as its space-separated characters
# followed by the end-of-word marker.
# NOTE: this rebinds `vocab`, which earlier cells use for the token set.
word_counts = Counter(words)
vocab = {}
for word, freq in word_counts.items():
    vocab[" ".join(word) + " </w>"] = freq

num_merges = 10000

# Greedy BPE: each round merges the currently most frequent adjacent pair and
# stops early once no pairs are left.
for i in range(num_merges):
    pairs = get_stats(vocab)
    if pairs:
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    else:
        break

In [16]:
# Dump the keys of the BPE vocabulary, one entry per line
with open("vocab_bpe.txt", "w", encoding="utf-8") as f:
    f.writelines(str(entry) + "\n" for entry in vocab)

In [12]:
# Import necessary libraries

In [13]:
## NOTE
# Farasa segments the word fairly well (approximately).
# # Download NLTK data (if not already downloaded)

# # Initialize Farasa Segmenter
# segmenter = FarasaSegmenter(interactive=True)

# # Sample Dialectal Arabic text (replace with actual DA text)
# # text_da = "هذا نص باللهجة العربية الدارجة"
# # text_da = labels[0]
# text_da = labels[1]


# # Preprocessing steps (Normalization, Cleaning, etc.)
# # This is a placeholder - implement specific preprocessing needed for your text
# def preprocess_text(text):
#     # Implement preprocessing steps here
#     return text


# # Preprocess the text
# preprocessed_text = preprocess_text(text_da)

# # Perform segmentation
# segmented_text = segmenter.segment(preprocessed_text)

# # Output the segmented text
# print(segmented_text)