                        inverted index For Arabic and English Documents 

Arabic Steps PreProcessing:

1.word tokenize

2.remove stopwords

3.stemming

    3.1 ISRIStemmer
    3.2 SnowballStemmer

4.WordNetLemmatizer

English Steps Preprocessing:

1.word tokenize

2.remove stopwords

3.stemming

    3.1 PorterStemmer
    3.2 SnowballStemmer
4.WordNetLemmatizer


End Applay Inverted Index For Arabic Or English Documents


In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,ISRIStemmer,PorterStemmer,SnowballStemmer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Ahmed
[nltk_data]     Ashraf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ahmed
[nltk_data]     Ashraf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

                    Arabic and English Steps PreProcessing 

word tokenize

In [2]:
def tokenize(document,flage=1):
    '''
    flage==1 mean tokenize using word_tokenize
    flage==0 mean tokenize using split
    document input doctument text
    '''
    if flage:
        return word_tokenize(document)
    #mean else    
    return document.split()    

In [3]:
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "احمد اشرف احمد السيد على ليدر مشروع التخرج"
print('Using Word Tokenize')
print(tokenize(document1))
print(tokenize(document2))
print('*'*10)
print('Using Split')
print(tokenize(document1,flage=0))
print(tokenize(document2,flage=0))

Using Word Tokenize
['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']
['احمد', 'اشرف', 'احمد', 'السيد', 'على', 'ليدر', 'مشروع', 'التخرج']
**********
Using Split
['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']
['احمد', 'اشرف', 'احمد', 'السيد', 'على', 'ليدر', 'مشروع', 'التخرج']


remove stopwords

In [4]:
def StopWords_Remove(document_tokenize,flage=1):
    '''
    flage==1 english document
    flage==0 arabic document
    document_tokenize mean document performed tokenize
    '''
    if flage:
        english_stop=set(stopwords.words('english'))
        return [i for i in document_tokenize if i not in english_stop]
    # mean else    
    arabic_stop=set(stopwords.words('arabic'))
    return [i for i in document_tokenize if i not in arabic_stop]    

In [5]:
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "احمد اشرف احمد السيد على ليدر مشروع التخرج"
print('Using Word Tokenize & Remove StopWord')
print(StopWords_Remove(tokenize(document1)))
print(StopWords_Remove(tokenize(document2),flage=0))
print('*'*10)
print('Using Split & Remove StopWord')
print(StopWords_Remove(tokenize(document1,flage=0)))
print(StopWords_Remove(tokenize(document2,flage=0),flage=0))

Using Word Tokenize & Remove StopWord
['The', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', '.']
['احمد', 'اشرف', 'احمد', 'السيد', 'ليدر', 'مشروع', 'التخرج']
**********
Using Split & Remove StopWord
['The', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog.']
['احمد', 'اشرف', 'احمد', 'السيد', 'ليدر', 'مشروع', 'التخرج']


stemming & Lemmatizer

In [6]:
def stemming_arabic(document_tokenize,flage=1):
    '''
    flage==1 mean perform ISRIStemmer
    flage==2 mean perform SnowballStemmer
    flage==3 mean perform WordNetLemmatizer
    document_tokenize mean document performed tokenize and may be performed stopword remove
    '''
    if flage==1:
        stem=ISRIStemmer()
        return [stem.stem(i) for i in document_tokenize]
    elif flage==2:
        stem=SnowballStemmer('arabic')
        return [stem.stem(i) for i in document_tokenize]
    else:
        #Lemmatizer of each word
        lemmatizer=WordNetLemmatizer()
        return [lemmatizer.lemmatize(i) for i in document_tokenize]

In [7]:
def stemming_english(document_tokenize,flage=1):
    '''
    flage==1 mean perform PorterStemmer
    flage==2 mean perform SnowballStemmer
    flage==3 mean perform WordNetLemmatizer
    document_tokenize mean document performed tokenize and may be performed stopword remove
    '''
    if flage==1:
       stem=PorterStemmer()
       return [stem.stem(i) for i in document_tokenize]
    elif flage==2:
        stem=SnowballStemmer('english')
        return [stem.stem(i) for i in document_tokenize]
    else:
        #Lemmatizer of each word
        lemmatizer=WordNetLemmatizer()
        return [lemmatizer.lemmatize(i) for i in document_tokenize]

In [8]:
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "احمد اشرف احمد السيد على ليدر مشروع التخرج"
print('Using Word Tokenize & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1))))
print(stemming_arabic(StopWords_Remove(tokenize(document2),flage=0)))
print('*'*10)
print('Using Split & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1,flage=0))))
print(stemming_arabic(StopWords_Remove(tokenize(document2,flage=0),flage=0)))

Using Word Tokenize & Remove StopWord & Stemming
['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.']
['حمد', 'شرف', 'حمد', 'سيد', 'يدر', 'شرع', 'خرج']
**********
Using Split & Remove StopWord & Stemming
['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog.']
['حمد', 'شرف', 'حمد', 'سيد', 'يدر', 'شرع', 'خرج']


In [9]:
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "احمد اشرف احمد السيد على ليدر مشروع التخرج"
print('Using Word Tokenize & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1)),flage=2))
print(stemming_arabic(StopWords_Remove(tokenize(document2),flage=0),flage=2))
print('*'*10)
print('Using Split & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1,flage=0)),flage=2))
print(stemming_arabic(StopWords_Remove(tokenize(document2,flage=0),flage=0),flage=2))

Using Word Tokenize & Remove StopWord & Stemming
['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.']
['احمد', 'اشرف', 'احمد', 'سيد', 'ليدر', 'مشروع', 'تخرج']
**********
Using Split & Remove StopWord & Stemming
['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog.']
['احمد', 'اشرف', 'احمد', 'سيد', 'ليدر', 'مشروع', 'تخرج']


In [10]:
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "احمد اشرف احمد السيد على ليدر مشروع التخرج"
print('Using Word Tokenize & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1)),flage=3))
print(stemming_arabic(StopWords_Remove(tokenize(document2),flage=0),flage=3))
print('*'*10)
print('Using Split & Remove StopWord & Stemming')
print(stemming_english(StopWords_Remove(tokenize(document1,flage=0)),flage=3))
print(stemming_arabic(StopWords_Remove(tokenize(document2,flage=0),flage=0),flage=3))

Using Word Tokenize & Remove StopWord & Stemming
['The', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', '.']
['احمد', 'اشرف', 'احمد', 'السيد', 'ليدر', 'مشروع', 'التخرج']
**********
Using Split & Remove StopWord & Stemming
['The', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog.']
['احمد', 'اشرف', 'احمد', 'السيد', 'ليدر', 'مشروع', 'التخرج']


                End Applay Inverted Index For Arabic Or English Documents

In [11]:
def InvertedIndex(*document, text_flage=1, tokenize_flage=1, bool_apply_stopword=1, stopword_flage=1, bool_apply_stemming=1, stemming_flage=1):
    '''
    *document: must be dict that name is key and doc text is value
    tokenize_flage: must be 1 word_tokenize or 0 split 
    text_flage: must be 1 english or 0 arabic
    bool_apply_stopword: must be 1 apply or 0 don't apply
    stopword_flage: must be 1 english or 0 arabic
    bool_apply_stemming: must be 1 apply or 0 don't apply
    stemming_flage: must be 1 or 2 or 3 and depends on text_flag arabic or english
    '''
    inverted_index = {}
    inverted_index2 = {}
    terms = set([])
    names = []
    document_t = []
    for i in document:
        for i2 in i.items():
            names.append(i2[0])
            text = tokenize(i2[1], tokenize_flage)
            if bool_apply_stopword:
                text = StopWords_Remove(text, text_flage)
            if bool_apply_stemming:
                if text_flage:
                    stemming_english(text, stemming_flage)
                else:
                    stemming_arabic(text, stemming_flage)
        terms.update(text)
        document_t.append(text)

    for term in terms:
        document1 = []
        document2 = []
        for document_m in range(len(names)):
            if term in document_t[document_m]:
                document1.append(names[document_m])
                document2.append(document_m + 1)
        inverted_index[term] = document1
        inverted_index2[term] = [len(set(document2)), ' - '.join([str(num) for num in document2])]
    df = pd.DataFrame(inverted_index2)
    df = df.transpose()
    df.columns = ['Document Frequency', 'Postings Lists']
    df.index.name = 'Term'
    return dict(sorted(inverted_index.items())), df.sort_index()

In [12]:
document1 = "The sky is blue."
document2 = "Blueberries are delicious."
document3 = "The blue car is parked outside."
document4 = "She wore a beautiful blue dress."
document5 = "The ocean sparkled with a brilliant blue hue."
result,df=InvertedIndex({'document1': document1}, {'document2': document2},  
              {'document3': document3}, {'document4': document4}, 
              {'document5': document5}, text_flage=1, tokenize_flage=1, bool_apply_stopword=1, bool_apply_stemming=0)

In [13]:
for term, documents in result.items():
    print(term, "->", ", ".join(documents))

. -> document1, document2, document3, document4, document5
Blueberries -> document2
She -> document4
The -> document1, document3, document5
beautiful -> document4
blue -> document1, document3, document4, document5
brilliant -> document5
car -> document3
delicious -> document2
dress -> document4
hue -> document5
ocean -> document5
outside -> document3
parked -> document3
sky -> document1
sparkled -> document5
wore -> document4


In [14]:
#DataFrame
df

Unnamed: 0_level_0,Document Frequency,Postings Lists
Term,Unnamed: 1_level_1,Unnamed: 2_level_1
.,5,1 - 2 - 3 - 4 - 5
Blueberries,1,2
She,1,4
The,3,1 - 3 - 5
beautiful,1,4
blue,4,1 - 3 - 4 - 5
brilliant,1,5
car,1,3
delicious,1,2
dress,1,4


In [15]:
document1 = "الطالب يذاكر في المكتبة."
document2 = "الطالبة تكتب في الصفحة."
document3 = "السماء زرقاء اليوم."
document4 = "الأطفال يلعبون في الحديقة."
document5 = "الشمس تشرق في الصباح الباكر."
result,df=InvertedIndex({'document1': document1}, {'document2': document2},  
              {'document3': document3}, {'document4': document4}, 
              {'document5': document5}, text_flage=0, tokenize_flage=1, bool_apply_stopword=1, bool_apply_stemming=0)

In [16]:
for term, documents in result.items():
    print(term, "->", ", ".join(documents))

. -> document1, document2, document3, document4, document5
الأطفال -> document4
الباكر -> document5
الحديقة -> document4
السماء -> document3
الشمس -> document5
الصباح -> document5
الصفحة -> document2
الطالب -> document1
الطالبة -> document2
المكتبة -> document1
اليوم -> document3
تشرق -> document5
تكتب -> document2
زرقاء -> document3
يذاكر -> document1
يلعبون -> document4


In [17]:
#DataFrame
df

Unnamed: 0_level_0,Document Frequency,Postings Lists
Term,Unnamed: 1_level_1,Unnamed: 2_level_1
.,5,1 - 2 - 3 - 4 - 5
الأطفال,1,4
الباكر,1,5
الحديقة,1,4
السماء,1,3
الشمس,1,5
الصباح,1,5
الصفحة,1,2
الطالب,1,1
الطالبة,1,2


##### Leader Best Work