In [11]:
# Import necessary libraries
import pymongo

In [12]:
# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["arabic_poems"]
collection = db["poems"]

In [13]:
# Print some sample data from MongoDB collection
for doc in collection.find().limit(5):  # Limiting to 5 documents for demonstration
    print(doc)

{'_id': ObjectId('6612d11df84fdf7786c45827'), 'date': 'الجمعة ٥ نيسان (أبريل) ٢٠٢٤', 'title': 'مبللة بالاشتياق', 'author': None, 'poem': 'كَانَ الفَصلُ شَغَفًا\nوَكَانتِ السَّاعةُ\nمُتجمِّدةَ عقاربِ العِناقِ\nنوافذُ القلبِ خلَّعَتْها\nأعَاصِيرُ اللوعَةِ\nسَتائرُ الرُّوحِ\nمِن دانتيلَ مُنقَّطٍ بِالخَيبةِ\nوَسَماءُ اللهفَةِ\nتُمطِرُ القُبُلاتِ بِحَرارةٍ\nلمْ أكُنْ أحمِلُ مِظَلَّةً تَقِيْني\nوابِلَ سِحرِكَ..\nلمْ أكُنْ ألبَسُ مِعطفًا\nيرُدُّ عنِّي زَمهرِيرَ الدَّهشةِ\nلمْ أكُنْ أنتعِلُ سِوى\nحِذاءِ الوقتِ المَثقوبِ!\nوَهُناكَ...\nعَلى قَارعةِ الانتِظارِ\nكُنتُ أرتجِفُ عِشقًا\nكُنتُ أقفُ مُبلَّلةً بِالاشتياقِ\nلا شَيءَ..\nلا صَوتَ..\nسِوَى زخَّاتِ نَبضٍ يَروي\nظَمَأَ الرُّوحِ القَاحلةِ\nمُنذُ عَهدِ المَجَاعةِ إلى الحُبِّ..\nقَوسُ قُزحِ بَهائِكَ\nهزَمَ عَتمةَ وحدَتِي\nجسَدَانِ..\nجسَدَانِ يُشعِلانِ حَطَبَ الوِصَالِ\nحتَّى انطِفاءِ لَهبِ الرَّغبةِ..\nفنجَانانِ مِن قَهوةِ الحُبِّ\nوبعضٌ مِن كَستَناءِ الفرَحِ\nزَادُ عاشِقَينِ\nحتَّى إشعارٍ آخرَ مِنَ العِنَاقِ!\nوَأمضِي..\nأتوكَّأُ حُزنِي\nرعدٌ

In [14]:
# Text Cleaning
import re

def clean_text(text):
    # Remove punctuation and special characters from the original text
    cleaned_text = re.sub(r'[^\w\s\u0600-\u06FF]', '', text)  # Include Arabic Unicode range \u0600-\u06FF
    # Remove digits from the cleaned text
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    # Remove line breaks from the cleaned text
    cleaned_text = cleaned_text.replace('\n', '. ')
    return cleaned_text

# Fetch data from MongoDB and perform text cleaning
cleaned_data = []
for doc in collection.find({}, {"_id": 0, "poem": 1}):  # Iterate over documents and retrieve "poem" field
    text = doc.get("poem", "")  # Extract text (poem) from the document
    cleaned_text = clean_text(text)  # Clean the text
    cleaned_data.append(cleaned_text)  # Append cleaned text to the list

# Print cleaned data (you may want to explore the data first to decide if cleaning is needed)
print(cleaned_data[:5])  # Print first 5 cleaned poems

['كَانَ الفَصلُ شَغَفًا. وَكَانتِ السَّاعةُ. مُتجمِّدةَ عقاربِ العِناقِ. نوافذُ القلبِ خلَّعَتْها. أعَاصِيرُ اللوعَةِ. سَتائرُ الرُّوحِ. مِن دانتيلَ مُنقَّطٍ بِالخَيبةِ. وَسَماءُ اللهفَةِ. تُمطِرُ القُبُلاتِ بِحَرارةٍ. لمْ أكُنْ أحمِلُ مِظَلَّةً تَقِيْني. وابِلَ سِحرِكَ. لمْ أكُنْ ألبَسُ مِعطفًا. يرُدُّ عنِّي زَمهرِيرَ الدَّهشةِ. لمْ أكُنْ أنتعِلُ سِوى. حِذاءِ الوقتِ المَثقوبِ. وَهُناكَ. عَلى قَارعةِ الانتِظارِ. كُنتُ أرتجِفُ عِشقًا. كُنتُ أقفُ مُبلَّلةً بِالاشتياقِ. لا شَيءَ. لا صَوتَ. سِوَى زخَّاتِ نَبضٍ يَروي. ظَمَأَ الرُّوحِ القَاحلةِ. مُنذُ عَهدِ المَجَاعةِ إلى الحُبِّ. قَوسُ قُزحِ بَهائِكَ. هزَمَ عَتمةَ وحدَتِي. جسَدَانِ. جسَدَانِ يُشعِلانِ حَطَبَ الوِصَالِ. حتَّى انطِفاءِ لَهبِ الرَّغبةِ. فنجَانانِ مِن قَهوةِ الحُبِّ. وبعضٌ مِن كَستَناءِ الفرَحِ. زَادُ عاشِقَينِ. حتَّى إشعارٍ آخرَ مِنَ العِنَاقِ. وَأمضِي. أتوكَّأُ حُزنِي. رعدٌ وَبرقٌ وَليلٌ. يتكسَّرُ حِذائيَ البَلورِيَّ. وأتعثَّرُ في خُطواتِ ظِلِّكَ المُسافرِ. إلى محطَّاتِ الرَّجاءِ. مطرٌ مطرٌ. تتبلَّلُ أرصِفةَ الوَجدِ. تنهمِرُ 

In [15]:
# Tokenization
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize phrases into words
tokenized_words = []
for poem in cleaned_data:
    phrases = sent_tokenize(poem)
    for phrase in phrases:
        words = word_tokenize(phrase.rstrip('.'))
        tokenized_words.append(words)

# Print the tokenized words for the first few phrases
for i, words in enumerate(tokenized_words[:100]):
    print(f"Phrase {i+1} words:", words)

Phrase 1 words: ['كَانَ', 'الفَصلُ', 'شَغَفًا']
Phrase 2 words: ['وَكَانتِ', 'السَّاعةُ']
Phrase 3 words: ['مُتجمِّدةَ', 'عقاربِ', 'العِناقِ']
Phrase 4 words: ['نوافذُ', 'القلبِ', 'خلَّعَتْها']
Phrase 5 words: ['أعَاصِيرُ', 'اللوعَةِ']
Phrase 6 words: ['سَتائرُ', 'الرُّوحِ']
Phrase 7 words: ['مِن', 'دانتيلَ', 'مُنقَّطٍ', 'بِالخَيبةِ']
Phrase 8 words: ['وَسَماءُ', 'اللهفَةِ']
Phrase 9 words: ['تُمطِرُ', 'القُبُلاتِ', 'بِحَرارةٍ']
Phrase 10 words: ['لمْ', 'أكُنْ', 'أحمِلُ', 'مِظَلَّةً', 'تَقِيْني']
Phrase 11 words: ['وابِلَ', 'سِحرِكَ']
Phrase 12 words: ['لمْ', 'أكُنْ', 'ألبَسُ', 'مِعطفًا']
Phrase 13 words: ['يرُدُّ', 'عنِّي', 'زَمهرِيرَ', 'الدَّهشةِ']
Phrase 14 words: ['لمْ', 'أكُنْ', 'أنتعِلُ', 'سِوى']
Phrase 15 words: ['حِذاءِ', 'الوقتِ', 'المَثقوبِ']
Phrase 16 words: ['وَهُناكَ']
Phrase 17 words: ['عَلى', 'قَارعةِ', 'الانتِظارِ']
Phrase 18 words: ['كُنتُ', 'أرتجِفُ', 'عِشقًا']
Phrase 19 words: ['كُنتُ', 'أقفُ', 'مُبلَّلةً', 'بِالاشتياقِ']
Phrase 20 words: ['لا', 'شَيءَ']
Phrase 21 wo

In [16]:
# Stop words
from nltk.corpus import stopwords

# Function to remove stop words from a list of words
def remove_stopwords(words):
    return [word for word in words if word.lower() not in stopwords.words('arabic')]

# Remove stop words from tokenized words
filtered_words = [remove_stopwords(words) for words in tokenized_words]

# Print the filtered words
for i, words in enumerate(filtered_words[:100]):  # Print the first few phrases
    print(f"Phrase {i+1} words (without stop words):", words)

Phrase 1 words (without stop words): ['كَانَ', 'الفَصلُ', 'شَغَفًا']
Phrase 2 words (without stop words): ['وَكَانتِ', 'السَّاعةُ']
Phrase 3 words (without stop words): ['مُتجمِّدةَ', 'عقاربِ', 'العِناقِ']
Phrase 4 words (without stop words): ['نوافذُ', 'القلبِ', 'خلَّعَتْها']
Phrase 5 words (without stop words): ['أعَاصِيرُ', 'اللوعَةِ']
Phrase 6 words (without stop words): ['سَتائرُ', 'الرُّوحِ']
Phrase 7 words (without stop words): ['مِن', 'دانتيلَ', 'مُنقَّطٍ', 'بِالخَيبةِ']
Phrase 8 words (without stop words): ['وَسَماءُ', 'اللهفَةِ']
Phrase 9 words (without stop words): ['تُمطِرُ', 'القُبُلاتِ', 'بِحَرارةٍ']
Phrase 10 words (without stop words): ['لمْ', 'أكُنْ', 'أحمِلُ', 'مِظَلَّةً', 'تَقِيْني']
Phrase 11 words (without stop words): ['وابِلَ', 'سِحرِكَ']
Phrase 12 words (without stop words): ['لمْ', 'أكُنْ', 'ألبَسُ', 'مِعطفًا']
Phrase 13 words (without stop words): ['يرُدُّ', 'عنِّي', 'زَمهرِيرَ', 'الدَّهشةِ']
Phrase 14 words (without stop words): ['لمْ', 'أكُنْ', 'أنتعِلُ', 'س

In [17]:
# Removing diacritics
def remove_diacritics(word_or_list):
    # Define a regular expression pattern to match Arabic diacritics
    pattern = r'[\u064B-\u065F\u0670]'  # Range of Arabic diacritical marks
    
    # If input is a list, apply remove_diacritics recursively to each element
    if isinstance(word_or_list, list):
        return [remove_diacritics(word) for word in word_or_list]
    
    # If input is a string, remove diacritics using the pattern
    elif isinstance(word_or_list, str):
        return re.sub(pattern, '', word_or_list)
    
    # Otherwise, return the input unchanged
    else:
        return word_or_list

# Apply remove_diacritics function to filtered_words
cleaned_words = remove_diacritics(filtered_words)

# Print words without diacritics
for i, words in enumerate(cleaned_words[:100]):  # Print the first few phrases
    print(f"Phrase {i+1} words (without diacritics):", words)

Phrase 1 words (without diacritics): ['كان', 'الفصل', 'شغفا']
Phrase 2 words (without diacritics): ['وكانت', 'الساعة']
Phrase 3 words (without diacritics): ['متجمدة', 'عقارب', 'العناق']
Phrase 4 words (without diacritics): ['نوافذ', 'القلب', 'خلعتها']
Phrase 5 words (without diacritics): ['أعاصير', 'اللوعة']
Phrase 6 words (without diacritics): ['ستائر', 'الروح']
Phrase 7 words (without diacritics): ['من', 'دانتيل', 'منقط', 'بالخيبة']
Phrase 8 words (without diacritics): ['وسماء', 'اللهفة']
Phrase 9 words (without diacritics): ['تمطر', 'القبلات', 'بحرارة']
Phrase 10 words (without diacritics): ['لم', 'أكن', 'أحمل', 'مظلة', 'تقيني']
Phrase 11 words (without diacritics): ['وابل', 'سحرك']
Phrase 12 words (without diacritics): ['لم', 'أكن', 'ألبس', 'معطفا']
Phrase 13 words (without diacritics): ['يرد', 'عني', 'زمهرير', 'الدهشة']
Phrase 14 words (without diacritics): ['لم', 'أكن', 'أنتعل', 'سوى']
Phrase 15 words (without diacritics): ['حذاء', 'الوقت', 'المثقوب']
Phrase 16 words (without dia

In [18]:
# Normalization
import unicodedata

def normalize_words(words):
    normalized_words = []
    for word in words:
        # Check if the word is a list (nested list)
        if isinstance(word, list):
            # If it's a nested list, recursively call normalize_words
            normalized_sublist = normalize_words(word)
            normalized_words.append(normalized_sublist)
        else:
            # Unicode normalization (NFKD form)
            normalized_word = unicodedata.normalize('NFKD', word)
        
            # Case normalization (convert to lowercase)
            normalized_word = normalized_word.lower()
        
            # Whitespace normalization (replace multiple spaces with a single space)
            normalized_word = ' '.join(normalized_word.split())
        
            normalized_words.append(normalized_word)
    
    return normalized_words

# Normalize cleaned words
normalized_words = normalize_words(cleaned_words)

# Print normalized words
for i, word in enumerate(normalized_words[:100]):  # Print the first few normalized words
    print(f"Normalized word {i+1}:", word)

Normalized word 1: ['كان', 'الفصل', 'شغفا']
Normalized word 2: ['وكانت', 'الساعة']
Normalized word 3: ['متجمدة', 'عقارب', 'العناق']
Normalized word 4: ['نوافذ', 'القلب', 'خلعتها']
Normalized word 5: ['أعاصير', 'اللوعة']
Normalized word 6: ['ستائر', 'الروح']
Normalized word 7: ['من', 'دانتيل', 'منقط', 'بالخيبة']
Normalized word 8: ['وسماء', 'اللهفة']
Normalized word 9: ['تمطر', 'القبلات', 'بحرارة']
Normalized word 10: ['لم', 'أكن', 'أحمل', 'مظلة', 'تقيني']
Normalized word 11: ['وابل', 'سحرك']
Normalized word 12: ['لم', 'أكن', 'ألبس', 'معطفا']
Normalized word 13: ['يرد', 'عني', 'زمهرير', 'الدهشة']
Normalized word 14: ['لم', 'أكن', 'أنتعل', 'سوى']
Normalized word 15: ['حذاء', 'الوقت', 'المثقوب']
Normalized word 16: ['وهناك']
Normalized word 17: ['على', 'قارعة', 'الانتظار']
Normalized word 18: ['كنت', 'أرتجف', 'عشقا']
Normalized word 19: ['كنت', 'أقف', 'مبللة', 'بالاشتياق']
Normalized word 20: ['شيء']
Normalized word 21: ['صوت']
Normalized word 22: ['سوى', 'زخات', 'نبض', 'يروي']


In [21]:
# Stemming
from nltk.stem import SnowballStemmer

# Initialize SnowballStemmer for Arabic
stemmer = SnowballStemmer("arabic")

# Flatten nested lists
def flatten_list(lst):
    flat_list = []
    for item in lst:
        if isinstance(item, list):
            flat_list.extend(flatten_list(item))
        else:
            flat_list.append(item)
    return flat_list

# Flatten the normalized words
flat_normalized_words = flatten_list(normalized_words)

# Stem the normalized words
stemmed_words = [stemmer.stem(word) for word in flat_normalized_words]

# Print the stemmed words
for i, word in enumerate(stemmed_words[:100]):  # Print the first few stemmed words
    print(f"Stemmed word {i+1}:", word)

Stemmed word 1: كان
Stemmed word 2: فصل
Stemmed word 3: شغف
Stemmed word 4: كان
Stemmed word 5: ساع
Stemmed word 6: متجمد
Stemmed word 7: عقارب
Stemmed word 8: عناق
Stemmed word 9: نوافذ
Stemmed word 10: قلب
Stemmed word 11: خلع
Stemmed word 12: أعاصير
Stemmed word 13: لوع
Stemmed word 14: ستائر
Stemmed word 15: روح
Stemmed word 16: من
Stemmed word 17: دانتيل
Stemmed word 18: منقط
Stemmed word 19: خيب
Stemmed word 20: سماء
Stemmed word 21: لهف
Stemmed word 22: تمطر
Stemmed word 23: قبل
Stemmed word 24: حرار
Stemmed word 25: لم
Stemmed word 26: أك
Stemmed word 27: أحمل
Stemmed word 28: مظل
Stemmed word 29: تق
Stemmed word 30: وابل
Stemmed word 31: سحر
Stemmed word 32: لم
Stemmed word 33: أك
Stemmed word 34: ألبس
Stemmed word 35: معطف
Stemmed word 36: يرد
Stemmed word 37: عن
Stemmed word 38: زمهرير
Stemmed word 39: دهش
Stemmed word 40: لم
Stemmed word 41: أك
Stemmed word 42: أنتعل
Stemmed word 43: سوى
Stemmed word 44: حذاء
Stemmed word 45: الو
Stemmed word 46: مثقوب
Stemmed word 

In [23]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Flatten nested lists
def flatten_list(lst):
    flat_list = []
    for item in lst:
        if isinstance(item, list):
            flat_list.extend(flatten_list(item))
        else:
            flat_list.append(item)
    return flat_list

# Flatten the normalized words
flat_normalized_words = flatten_list(normalized_words)

# Lemmatize the flattened words
lemmatized_words = [lemmatizer.lemmatize(word) for word in flat_normalized_words]

# Print the lemmatized words
for i, word in enumerate(lemmatized_words[:100]):  # Print the first few lemmatized words
    print(f"Lemmatized word {i+1}:", word)

Lemmatized word 1: كان
Lemmatized word 2: الفصل
Lemmatized word 3: شغفا
Lemmatized word 4: وكانت
Lemmatized word 5: الساعة
Lemmatized word 6: متجمدة
Lemmatized word 7: عقارب
Lemmatized word 8: العناق
Lemmatized word 9: نوافذ
Lemmatized word 10: القلب
Lemmatized word 11: خلعتها
Lemmatized word 12: أعاصير
Lemmatized word 13: اللوعة
Lemmatized word 14: ستائر
Lemmatized word 15: الروح
Lemmatized word 16: من
Lemmatized word 17: دانتيل
Lemmatized word 18: منقط
Lemmatized word 19: بالخيبة
Lemmatized word 20: وسماء
Lemmatized word 21: اللهفة
Lemmatized word 22: تمطر
Lemmatized word 23: القبلات
Lemmatized word 24: بحرارة
Lemmatized word 25: لم
Lemmatized word 26: أكن
Lemmatized word 27: أحمل
Lemmatized word 28: مظلة
Lemmatized word 29: تقيني
Lemmatized word 30: وابل
Lemmatized word 31: سحرك
Lemmatized word 32: لم
Lemmatized word 33: أكن
Lemmatized word 34: ألبس
Lemmatized word 35: معطفا
Lemmatized word 36: يرد
Lemmatized word 37: عني
Lemmatized word 38: زمهرير
Lemmatized word 39: الدهشة
L

In [27]:
# POS tagging
import stanza

# Initialize Stanza pipeline for Arabic
nlp = stanza.Pipeline(lang='ar', processors='tokenize,pos')

# Convert the first 100 lemmatized words to a single string
text = ' '.join(lemmatized_words[:100])

# Perform POS tagging using Stanza
doc = nlp(text)

# Extract POS tags
pos_tags = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]

# Print the POS tags
for i, (word, pos_tag) in enumerate(pos_tags):
    print(f"Word {i+1}: {word} - POS tag: {pos_tag}")

2024-04-08 04:53:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-08 04:53:49 INFO: Downloaded file to C:\Users\Yasmine\stanza_resources\resources.json
2024-04-08 04:53:50 INFO: Loading these models for language: ar (Arabic):
| Processor | Package     |
---------------------------
| tokenize  | padt        |
| mwt       | padt        |
| pos       | padt_charlm |

2024-04-08 04:53:50 INFO: Using device: cpu
2024-04-08 04:53:50 INFO: Loading: tokenize
2024-04-08 04:53:50 INFO: Loading: mwt
2024-04-08 04:53:50 INFO: Loading: pos
2024-04-08 04:53:50 INFO: Done loading processors!


Word 1: كان - POS tag: VERB
Word 2: الفصل - POS tag: NOUN
Word 3: شغفا - POS tag: NOUN
Word 4: وكانت - POS tag: VERB
Word 5: الساعة - POS tag: NOUN
Word 6: متجمدة - POS tag: ADJ
Word 7: عقارب - POS tag: NOUN
Word 8: العناق - POS tag: NOUN
Word 9: نوافذ - POS tag: X
Word 10: القلب - POS tag: NOUN
Word 11: خلعت - POS tag: VERB
Word 12: ها - POS tag: PRON
Word 13: أعاصير - POS tag: X
Word 14: اللوعة - POS tag: NOUN
Word 15: س - POS tag: AUX
Word 16: تاي<UNK>ر - POS tag: X
Word 17: الروح - POS tag: NOUN
Word 18: من - POS tag: ADP
Word 19: دانتيل - POS tag: X
Word 20: منقط - POS tag: X
Word 21: ب - POS tag: ADP
Word 22: الخيبة - POS tag: NOUN
Word 23: و - POS tag: CCONJ
Word 24: سماء - POS tag: NOUN
Word 25: اللهفة - POS tag: NOUN
Word 26: تمطر - POS tag: VERB
Word 27: القبلات - POS tag: NOUN
Word 28: ب - POS tag: ADP
Word 29: حرارة - POS tag: NOUN
Word 30: لم - POS tag: PART
Word 31: أكن - POS tag: VERB
Word 32: أحمل - POS tag: ADJ
Word 33: مظلة - POS tag: NOUN
Word 34: تقيني - POS tag:

In [32]:
# Import stanza
import stanza

# Initialize Stanza pipeline for Arabic with NER enabled
nlp = stanza.Pipeline(lang='ar', processors='tokenize,ner')

# Convert the first 100 lemmatized words to a single string
text = ' '.join(lemmatized_words[:1000])

# Perform NER using Stanza
doc = nlp(text)

# Extract NER tags
ner_tags = [(ent.text, ent.type) for sent in doc.sentences for ent in sent.ents]

# Print the NER tags
for i, (entity, entity_type) in enumerate(ner_tags):
    print(f"Entity {i+1}: {entity} - Type: {entity_type}")

2024-04-08 05:07:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-08 05:07:29 INFO: Downloaded file to C:\Users\Yasmine\stanza_resources\resources.json
2024-04-08 05:07:30 INFO: Loading these models for language: ar (Arabic):
| Processor | Package      |
----------------------------
| tokenize  | padt         |
| mwt       | padt         |
| ner       | aqmar_charlm |

2024-04-08 05:07:30 INFO: Using device: cpu
2024-04-08 05:07:30 INFO: Loading: tokenize
2024-04-08 05:07:30 INFO: Loading: mwt
2024-04-08 05:07:30 INFO: Loading: ner
2024-04-08 05:07:32 INFO: Done loading processors!


Entity 1: العرب - Type: PER
Entity 2: تونس - Type: LOC
Entity 3: الشام - Type: LOC
