In [42]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,ISRIStemmer,PorterStemmer,SnowballStemmer
from contextlib import redirect_stdout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import re
from collections import Counter
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Ahmed
[nltk_data]     Ashraf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ahmed
[nltk_data]     Ashraf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
def tokenize(document):
    '''
    flage==1 mean tokenize using word_tokenize
    flage==0 mean tokenize using split
    document input doctument text
    '''
    return word_tokenize(document)

In [44]:
def StopWords_Remove(document_tokenize):
    '''
    flage==1 english document
    flage==0 arabic document
    document_tokenize mean document performed tokenize
    '''
    arabic_stop=set(stopwords.words('arabic'))
    return [i for i in document_tokenize if i not in arabic_stop]   

In [45]:
def Arabic_text_preprocessing(text,flage=3):
    '''
    Flage Number
    1: ISRIStemmer
    2: SnowballStemmer
    3: WordNetLemmatizer
    4: POS
    '''
    text1=text
    #The regular expression [^\w\s] is used to match any character that is not a word character (\w) or a whitespace character (\s).
    text = re.sub(r'[^\w\s]', '',text)
    #splitting sentence into tokens
    text=word_tokenize(text.lower())
    #remove stopwords
    text=StopWords_Remove(text)
    #stemming of each word
    if flage<=3:
        if flage==1:
            stem=ISRIStemmer()
            text=[stem.stem(i) for i in text]
        elif flage==2:
            stem=SnowballStemmer('arabic')
            text=[stem.stem(i) for i in text]
        else:
            #Lemmatizer of each word
            lemmatizer=WordNetLemmatizer()
            text=[lemmatizer.lemmatize(i) for i in text]
    else:
        #part of speech of each word
        text=nltk.pos_tag(text)
    return text

In [46]:
def InvertedIndex(*document):
    '''
    *document: must be dict that name is key and doc text is value
    tokenize_flage: must be 1 word_tokenize or 0 split 
    text_flage: must be 1 english or 0 arabic
    bool_apply_stopword: must be 1 apply or 0 don't apply
    stopword_flage: must be 1 english or 0 arabic
    bool_apply_stemming: must be 1 apply or 0 don't apply
    stemming_flage: must be 1 or 2 or 3 and depends on text_flag arabic or english
    '''
    inverted_index = {}
    inverted_index2 = {}
    terms = set([])
    names = []
    document_t = []
    for doc in document:
        for name, text in doc.items():
            names.append(name)
            processed_text = Arabic_text_preprocessing(text)
            terms.update(processed_text)
            document_t.append(processed_text)
    for term in terms:
        document1 = []
        document2 = []
        for document_m in range(len(names)):
            if term in document_t[document_m]:
                document1.append(names[document_m])
                document2.append(document_m + 1)
        inverted_index[term] = document1
        inverted_index2[term] = [len(set(document2)), ' - '.join([str(num) for num in document2])]
    df = pd.DataFrame(inverted_index2)
    df = df.transpose()
    df.columns = ['Document Frequency', 'Postings Lists']
    df.index.name = 'Term'
    return dict(sorted(inverted_index.items())), df.sort_index()

In [47]:
data = pd.read_csv('documents.csv',usecols=[1,2])
data.head()

Unnamed: 0,Documents,doc id
0,في عصرنا هذا، تحيط تطبيقا ُت الذكاء االصطناعي ...,document1
1,تُع ر المؤلفات الذكاء االصطناعي، على أنه» :درا...,document2
2,الذكاء االصطناعي يمكن تقسيمه إلى:\nالذكاء االص...,document3
3,أدوات بحث الذكاء االصطناعي \nخالل خمسين سنة من...,document4
4,لقد أصبح الذكاء االصطناعي مصطل ًحا شامًًل للتط...,document5


In [48]:
documents={}
for i, x in zip(data.Documents.values,data['doc id'].values):
    documents[x]=i

In [49]:
len(documents)

21

In [41]:
result, df = InvertedIndex(documents)


dict_items([('استقرار', ['document9', 'document16']), ('ان', ['document19', 'document20']), ('البارزة', ['document14']), ('لأنظمة', ['document13', 'document19']), ('يبرز', ['document14']), ('بمتطلبات', ['document13']), ('بدأت', ['document3', 'document4', 'document21']), ('لمستوى', ['document19']), ('المؤسسات', ['document3', 'document4', 'document5', 'document6', 'document19', 'document20']), ('أمن', ['document20']), ('ابتداء', ['document4', 'document15']), ('هاينلين', ['document2']), ('بشن', ['document21']), ('اللعب', ['document18']), ('والمستمر', ['document13']), ('تكبد', ['document19']), ('يحدد', ['document4']), ('وتكنولوجيا', ['document7']), ('والتضامن', ['document19', 'document20']), ('دورا', ['document17', 'document19']), ('وسيمور', ['document3']), ('وبالتالي', ['document1', 'document10', 'document19', 'document20', 'document21']), ('أجرى', ['document3', 'document7']), ('باآللةكان', ['document2']), ('دارتموث', ['document2', 'document8', 'document18']), ('الحقوق', ['document13', 'd

In [11]:
result

{'0': ['document4'],
 '1': ['document4', 'document18'],
 '10': ['document14', 'document20'],
 '100': ['document21'],
 '102': ['document4'],
 '104136': ['document4'],
 '105': ['document4'],
 '112': ['document4'],
 '113': ['document4'],
 '114': ['document4'],
 '115': ['document4'],
 '1160000': ['document17'],
 '117': ['document4'],
 '118': ['document4'],
 '119': ['document4'],
 '12': ['document15'],
 '120': ['document4'],
 '121': ['document4'],
 '122': ['document4'],
 '123': ['document4'],
 '124': ['document4'],
 '126': ['document4'],
 '127': ['document4'],
 '128': ['document4'],
 '129': ['document4'],
 '130': ['document4'],
 '132': ['document4'],
 '133': ['document4'],
 '135828': ['document13'],
 '137': ['document4'],
 '139140': ['document4'],
 '1400': ['document1'],
 '141': ['document4'],
 '144145': ['document4'],
 '146': ['document4'],
 '147': ['document4'],
 '148': ['document4'],
 '1492': ['document12'],
 '15': ['document14'],
 '150151152': ['document4'],
 '153': ['document4'],
 '154

In [12]:
df

Unnamed: 0_level_0,Document Frequency,Postings Lists
Term,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,4
1,2,4 - 18
10,2,14 - 20
100,1,21
102,1,4
...,...,...
يوقد,1,18
يوقع,1,20
يولد,1,16
يوم,2,1 - 17


In [13]:
df.to_csv('inverted index.csv')

In [14]:
def retrieval(query):
    data = pd.read_csv('inverted index.csv')
    query=Arabic_text_preprocessing(query)
    print(query)
    first = None
    for i in range(len(query)):
        try:
            output = data[data['Term']==query[i]]['Postings Lists'].values
        except:
            output=[]
        if len(output) > 0:
            output = [int(i) for i in output[0].split('-')]
            if first is None:
                first = set(output)
            else:
                first = first.intersection(output)
            print(output, query[i])
    print(list(first))
retrieval(' الذكاء الاصطناعي')

['الذكاء', 'الاصطناعي']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] الذكاء
[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] الاصطناعي
[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [None]:
مصطلح الذكاء الاصطناعي

In [13]:
def Ranking(documents,query):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    query_vector = tfidf_vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    similarities_flat = similarities.flatten()
    doc_similarity_list = [(i, similarities_flat[i]) for i in range(len(similarities_flat))]
    doc_similarity_list.sort(key=lambda x: x[1], reverse=True)
    top_k = 5
    for i in range(top_k):
        doc_index = doc_similarity_list[i][0]
        similarity_score = doc_similarity_list[i][1]
        print(f"Document {doc_index + 1}: {documents[doc_index]} - Similarity Score: {similarity_score}")

In [14]:
Ranking([document1,document2,document3,document4,document5],' الطالبة و الطالب')

NameError: name 'document1' is not defined

In [51]:
def text_to_vector(text):
    WORD = re.compile(r"\w+")
    words = WORD.findall(text)
    return Counter(words)
def get_cosine(document,query):
    vec1 = text_to_vector(document)
    vec2 = text_to_vector(query)
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator
cosine = get_cosine('الطالب يذاكر','الطالبة و الطالب')
print("Cosine:", cosine)

Cosine: 0.40824829046386296


In [54]:
text_to_vector('الطالب؟  يذاكر و الطالب')

Counter({'الطالب': 2, 'يذاكر': 1, 'و': 1})

In [None]:
cosine = get_cosine(document2,'الطالبة و الطالب')
print("Cosine:", cosine)

In [None]:
cosine = get_cosine(document3,'الطالبة و الطالب')
print("Cosine:", cosine)

In [None]:
cosine = get_cosine(document4,'الطالبة و الطالب')
print("Cosine:", cosine)

In [None]:
cosine = get_cosine(document5,'الطالبة و الطالب')
print("Cosine:", cosine)

In [None]:
def evaluate(relevant_docs, retrieved_docs):
    relevant = set(relevant_docs)
    retrieved = set(retrieved_docs)
    true_positives = len(relevant.intersection(retrieved))
    precision = true_positives / len(retrieved) if len(retrieved) > 0 else 0
    recall = true_positives / len(relevant) if len(relevant) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision,recall,f1
precision, recall, f1 = evaluate([2],[2,1])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

In [None]:
precision, recall, f1 = evaluate([2,1],[2,1])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

In [12]:
dict = {'Document9': 0.5199403065766336, 'Document10': 0.4576620428378843, 'Document11': 0.4559767433103392, 
        'Document12': 0.36066785386697287, 'Document13': 0.3412306495090726, 'Document14': 0.14872934763578954, 'Document15': 0.4320408105743342}
dict

{'Document9': 0.5199403065766336,
 'Document10': 0.4576620428378843,
 'Document11': 0.4559767433103392,
 'Document12': 0.36066785386697287,
 'Document13': 0.3412306495090726,
 'Document14': 0.14872934763578954,
 'Document15': 0.4320408105743342}

In [13]:
sorted(dict.items(), key=lambda x: x[1], reverse=True)

[('Document9', 0.5199403065766336),
 ('Document10', 0.4576620428378843),
 ('Document11', 0.4559767433103392),
 ('Document15', 0.4320408105743342),
 ('Document12', 0.36066785386697287),
 ('Document13', 0.3412306495090726),
 ('Document14', 0.14872934763578954)]