# Notebook: Search Engine with Python, NLTK, and Scikit-learn
# Author: [Yasser Barghouth]

# Description: This notebook demonstrates the creation of a simple search engine using a service-oriented approach.
#              It utilizes NLTK for text preprocessing and Scikit-learn for vectorization and search functionality.


# Import necessary libraries

In [16]:
import ir_datasets
import nltk
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import numpy as np
import string
from typing import List
import pandas as pd
import pickle
from itertools import chain
from tabulate import tabulate
import mwparserfromhell
import re
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from wordcloud import WordCloud
import joblib
import json
from flask import Flask, request, jsonify
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float, Text

# Preprocessor Service

In [17]:
class Preprocessor:
    def __init__(self):
        # Initialize stopwords and stemmer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(self, tag):
        """
        Map POS tag to first character lemmatize() accepts.
        
        Args:
            tag (str): The POS tag.
        
        Returns:
            str: The corresponding wordnet POS tag.
        """
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }
        return tag_dict.get(tag[0].upper(), wordnet.NOUN)

    def lemmatization(self, tagged_doc_text):
        """
        Lemmatize the document text based on POS tags.
        
        Args:
            tagged_doc_text (list): A list of tuples with word and POS tag.
        
        Returns:
            list: A list of lemmatized words.
        """
        return [self.lemmatizer.lemmatize(word, pos=self.get_wordnet_pos(tag)) for word, tag in tagged_doc_text]

    def preprocess(self, text):
        """
        Preprocesses the input text by tokenizing, removing stopwords, and lemmatizing.
        
        Args:
            text (str): The text to preprocess.
        
        Returns:
            list: A list of preprocessed tokens.
        """
        # print("start processe ==================")
        
        # Tokenize text
        tokens = word_tokenize(text.lower())
        # print("Tokenize & lower text")
        # Remove punctuation tokens
        tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens if token.translate(str.maketrans('', '', string.punctuation))]
        # print("Remove punctuation tokens")
        # Remove stopwords
        tokens = [word for word in tokens if word not in self.stop_words]
        # print("Remove stopwords")
        # Apply Part-of-speech Tagging
        tagged_tokens = pos_tag(tokens)
        # print("Apply Part-of-speech Tagging")
        # print("Part-of-speech Tagging tokens : " + str(tagged_tokens[:5]))
        # Apply lemmatization
        tokens = self.lemmatization(tagged_tokens)
        # print("Apply lemmatization")
        # print("lemmatized tokens : " + str(tokens[:5]))

        # print("end processe ==================")
        
        return tokens

## Spell Corrector Service

In [18]:
class SpellCorrector:
    def __init__(self):
        self.spell_checker = SpellChecker()

    def correct_sentence_spelling(self, query):
        """
        Corrects the spelling of query.
        
        Args:
            query (str): A query text.
        
        Returns:
            list: A list of corrected tokens.
        """
        query_tokens = word_tokenize(query.lower())
        misspelled = self.spell_checker.unknown(query_tokens)
        corrected_tokens = [self.spell_checker.correction(token) if token in misspelled else token for token in query_tokens]
        return ' '.join(corrected_tokens)


# Vectorizer Service

In [19]:
class Vectorizer:
    def __init__(self):
        # Initialize TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer()

    def fit_transform(self, documents):
        """
        Fits the vectorizer to the documents and transforms them into TF-IDF vectors.
        
        Args:
            documents (list): A list of documents (strings).
        
        Returns:
            sparse matrix: The document-term matrix.
        """
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)
        return self.tfidf_matrix

    def transform(self, document):
        """
        Transforms a single document into a TF-IDF vector.
        
        Args:
            document (str): The document to transform.
        
        Returns:
            sparse matrix: The TF-IDF vector of the document.
        """
        return self.vectorizer.transform([document])

    def get_feature_names(self):
        """
        Retrieves the feature names (terms) from the vectorizer.
        
        Returns:
            array: An array of feature names.
        """
        return self.vectorizer.get_feature_names_out()

# SearchEngine Service

In [46]:
class SearchEngine:
    def __init__(self, preprocessor, spell_corrector, document_model, app):
        # Initialize with preprocessor, spell corrector and vectorizer and matrice services
        self.preprocessor = preprocessor
        self.spell_corrector = spell_corrector
        self.vectorizers = {}
        self.tfidf_matrices = {}
        
        with app.app_context():
            self.documents = [doc.to_dict() for doc in document_model.query.all()]
            self.elements = document_model.get_columns(True)

    def index_documents(self):
        """
        Indexes the documents by preprocessing and vectorizing them.
        
        Args:
            documents (list): A list of documents to index.
        """
        for element in self.elements:
            try:
                # print("processed docs elment " + element + " :")
                # print("before processe *******************************************")
                # Preprocess documents
                processed_docs = [' '.join(self.preprocessor.preprocess(doc[element])) for doc in self.documents]
                # print("after processe *******************************************")
                
                vectorizer = Vectorizer()
                self.vectorizers[element] = vectorizer
                # Vectorize documents
                self.tfidf_matrices[element] = vectorizer.fit_transform(processed_docs)
                # print(f"Vectorization successful for {element}. Matrix shape:", self.tfidf_matrices[element].shape)
                # print("******************************************************************")
            except Exception as e:
                print(f"An error occurred during vectorization of {element}:", e)
    
    def save_model(self, name):
        joblib.dump(self.vectorizers, f'{name}_vectorizers.pkl')
        joblib.dump(self.tfidf_matrices, f'{name}tfidf_matrices.pkl')

    def load_model(self,name):
        self.vectorizers = joblib.load(f'{name}_vectorizers.pkl')
        self.tfidf_matrices = joblib.load(f'{name}tfidf_matrices.pkl')

    def search(self, query, weights):
        """
        Searches the indexed documents for the given query and ranks them by relevance.
        
        Args:
            query (str): The search query.
            elements (list): List of elements to search within.
            weights (list): List of weights for each element.
        
        Returns:
            tuple: Ranked indices of documents, their similarity scores and corrected query.
        """
        
        # print("query : " + query)

        # Correct spelling of the query
        corrected_query = self.spell_corrector.correct_sentence_spelling(query)
        # print("corrected_query : " + str(corrected_query))
        
        # Preprocess the query
        query_processed = self.preprocessor.preprocess(corrected_query)
        # print("query_processed : " + str(query_processed))
        
        if not query_processed:
            print("Query processed to an empty list. Cannot search with an empty query.")
            return [], [], corrected_query
        
        query = ' '.join(query_processed)
        # print("query joined : " + query)
        
        scores = np.zeros(len(self.documents))

        for element, weight in zip(self.elements, weights):
            try:
                query_vector = self.vectorizers[element].transform(query)
                
                if query_vector.shape[1] == 0:
                    print(f"No valid terms in query for element {element}. Skipping.")
                    continue
                
                # Compute cosine similarities between the query and the documents
                cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrices[element]).flatten()
                scores += weight * cosine_similarities
            except Exception as e:
                print(f"An error occurred during searching in {element}:", e)

        # Rank documents by similarity
        ranked_indices = np.argsort(scores)[::-1]
        return ranked_indices, scores, corrected_query


In [36]:
def clean_value(value):
    # Use mwparserfromhell to parse the value
    wikicode = mwparserfromhell.parse(value)
    text = wikicode.strip_code()  # Convert to plain text

    # Remove Markdown-like syntax for bold/italic
    text = re.sub(r"'''''(.*?)'''''", r"\1", text)  # Bold and italic
    text = re.sub(r"'''(.*?)'''", r"\1", text)      # Bold
    text = re.sub(r"''(.*?)''", r"\1", text)        # Italic

    # Normalize whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [45]:

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql://root:@localhost/ir_search_engine'
db = SQLAlchemy(app)

class Document(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    page_title = db.Column(db.String(255))
    wikidata_classes = db.Column(db.String(255))
    text = db.Column(db.Text)
    sections = db.Column(db.Text)
    infoboxes = db.Column(db.Text)

    def to_dict(self):
        return {
            'id': self.id,
            'page_title': self.page_title,
            'wikidata_classes': self.wikidata_classes,
            'text': self.text,
            'sections': self.sections,
            'infoboxes': self.infoboxes,
        }
        
    def get_columns(self, exclude_id = False):
        columns = self.__table__.columns
        if exclude_id:
            return [column.name for column in columns if column.name != 'id']
        else:
            return [column.name for column in columns]

with app.app_context():
    db.create_all()

In [38]:
def save_document(doc_data):
    doc = Document(
        page_title=doc_data['page_title'],
        wikidata_classes=doc_data['wikidata_classes'],
        text=doc_data['text'],
        sections=doc_data['sections'],
        infoboxes=doc_data['infoboxes']
    )
    db.session.add(doc)
    db.session.commit()


# Main Application

In [33]:
# Main Application
if __name__ == "__main__":
    # Load dataset
    dataset = ir_datasets.load("trec-tot/2023/train") # number of docs in this dataset is 231852 doc
    
    corpus = []

    for i, doc in enumerate(dataset.docs_iter()):
        if i == 1000:
            break
        
        doc_data = {
            "page_title": doc.page_title,
            "wikidata_classes": doc.wikidata_classes[0][1],
            "text": doc.text,
            "sections": "",
            "infoboxes": "",
        }
    
        for section_name, section_text in doc.sections.items():
            doc_data["sections"] += f"\n {section_text}"
    
        for infobox in doc.infoboxes:
            for key, value in infobox['params'].items():
                cleaned_value = clean_value(value)
                doc_data["infoboxes"] += f"\n {cleaned_value}"
        
        corpus.append(doc_data)
        with app.app_context():
            save_document(doc_data)

In [37]:
# # Save processed data
# with open('cleaned_corpus.json', 'w') as f:
#     json.dump(corpus, f)

# # Load processed data
# with open('cleaned_corpus.json', 'r') as f:
#     corpus = json.load(f)

In [47]:
    # Initialize services
    document_model = Document()
    preprocessor = Preprocessor()
    spell_corrector = SpellCorrector()
    search_engine = SearchEngine(preprocessor, spell_corrector, document_model, app)

In [48]:
    # elements = ["page_title", "wikidata_classes", "text", "sections", "infoboxes"]
    weights = [0.5, 1.5, 0.3, 0.1, 0.1]  # Assign weights based on importance

In [36]:
    # Index documents
    search_engine.index_documents()
   
    # Save the model
    search_engine.save_model("trec-tot")

## Diagram Data

In [None]:
    # page_title ======================================
    tfidf_matrix = search_engine.tfidf_matrices['page_title']
    feature_names = search_engine.vectorizers['page_title'].get_feature_names()
    word_scores = dict(zip(feature_names, np.array(tfidf_matrix.sum(axis=0)).flatten()))    
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_scores)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of TF-IDF Scores page title')
    plt.show()

    # تحويل مصفوفة TF-IDF إلى DataFrame لاستخدامها مع seaborn
    tfidf_df = pd.DataFrame(search_engine.tfidf_matrices['page_title'].toarray(), columns=search_engine.vectorizers['page_title'].get_feature_names())
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(tfidf_df.T, cmap='viridis', cbar=True)
    plt.title('Heatmap of TF-IDF Scores page title')
    plt.xlabel('Document')
    plt.ylabel('Term')
    plt.show()

    # Vectorize the corpus using one element, e.g., 'page_title'
    tfidf_matrix = search_engine.tfidf_matrices['page_title'].toarray()

    # Apply PCA to reduce to 3D
    pca = PCA(n_components=3)
    reduced_matrix = pca.fit_transform(tfidf_matrix)

    # Plotting the results in 3D
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], reduced_matrix[:, 2], c='blue', marker='o')

    ax.set_title('3D Visualization of TF-IDF Matrix page title')
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    ax.set_zlabel('PCA Component 3')

    doc_lengths = [len(doc["page_title"].split()) for doc in corpus]
    plt.figure(figsize=(10, 6))
    sns.histplot(doc_lengths, bins=30, kde=True)
    plt.title('Distribution of Document\'s page title Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

    # text ======================================
    tfidf_matrix = search_engine.tfidf_matrices['text']
    feature_names = search_engine.vectorizers['text'].get_feature_names()
    word_scores = dict(zip(feature_names, np.array(tfidf_matrix.sum(axis=0)).flatten()))    
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_scores)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of TF-IDF Scores text')
    plt.show()

    # # تحويل مصفوفة TF-IDF إلى DataFrame لاستخدامها مع seaborn
    # tfidf_df = pd.DataFrame(search_engine.tfidf_matrices['text'].toarray(), columns=search_engine.vectorizers['text'].get_feature_names())
    
    # plt.figure(figsize=(12, 100))
    # sns.heatmap(tfidf_df.T, cmap='viridis', cbar=True)
    # plt.title('Heatmap of TF-IDF Scores text')
    # plt.xlabel('Document')
    # plt.ylabel('Term')
    # plt.show()
    
    # # Vectorize the corpus using one element, e.g., 'text'
    # tfidf_matrix = search_engine.tfidf_matrices['text'].toarray()

    # # Apply PCA to reduce to 3D
    # pca = PCA(n_components=3)
    # reduced_matrix = pca.fit_transform(tfidf_matrix)

    # # Plotting the results in 3D
    # fig = plt.figure(figsize=(10, 8))
    # ax = fig.add_subplot(111, projection='3d')

    # ax.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], reduced_matrix[:, 2], c='blue', marker='o')

    # ax.set_title('3D Visualization of TF-IDF Matrix text')
    # ax.set_xlabel('PCA Component 1')
    # ax.set_ylabel('PCA Component 2')
    # ax.set_zlabel('PCA Component 3')

    doc_lengths = [len(doc["text"].split()) for doc in corpus]
    plt.figure(figsize=(10, 6))
    sns.histplot(doc_lengths, bins=30, kde=True)
    plt.title('Distribution of Document\'s text Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

    # sections ======================================
    tfidf_matrix = search_engine.tfidf_matrices['sections']
    feature_names = search_engine.vectorizers['sections'].get_feature_names()
    word_scores = dict(zip(feature_names, np.array(tfidf_matrix.sum(axis=0)).flatten()))    
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_scores)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of TF-IDF sections')
    plt.show()

    # # تحويل مصفوفة TF-IDF إلى DataFrame لاستخدامها مع seaborn
    # tfidf_df = pd.DataFrame(search_engine.tfidf_matrices['sections'].toarray(), columns=search_engine.vectorizers['sections'].get_feature_names())
    
    # plt.figure(figsize=(12, 100))
    # sns.heatmap(tfidf_df.T, cmap='viridis', cbar=True)
    # plt.title('Heatmap of TF-IDF Scores sections')
    # plt.xlabel('Document')
    # plt.ylabel('Term')
    # plt.show()
    
    # # Vectorize the corpus using one element, e.g., 'sections'
    # tfidf_matrix = search_engine.tfidf_matrices['sections'].toarray()

    # # Apply PCA to reduce to 3D
    # pca = PCA(n_components=3)
    # reduced_matrix = pca.fit_transform(tfidf_matrix)

    # # Plotting the results in 3D
    # fig = plt.figure(figsize=(10, 8))
    # ax = fig.add_subplot(111, projection='3d')

    # ax.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], reduced_matrix[:, 2], c='blue', marker='o')

    # ax.set_title('3D Visualization of TF-IDF Matrix sections')
    # ax.set_xlabel('PCA Component 1')
    # ax.set_ylabel('PCA Component 2')
    # ax.set_zlabel('PCA Component 3')

    doc_lengths = [len(doc["sections"].split()) for doc in corpus]
    plt.figure(figsize=(10, 6))
    sns.histplot(doc_lengths, bins=30, kde=True)
    plt.title('Distribution of Document\'s sections Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

    # infoboxes ======================================
    tfidf_matrix = search_engine.tfidf_matrices['infoboxes']
    feature_names = search_engine.vectorizers['infoboxes'].get_feature_names()
    word_scores = dict(zip(feature_names, np.array(tfidf_matrix.sum(axis=0)).flatten()))    
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_scores)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of TF-IDF Scores infoboxes')
    plt.show()

    # # تحويل مصفوفة TF-IDF إلى DataFrame لاستخدامها مع seaborn
    # tfidf_df = pd.DataFrame(search_engine.tfidf_matrices['infoboxes'].toarray(), columns=search_engine.vectorizers['infoboxes'].get_feature_names())
    
    # plt.figure(figsize=(12, 100))
    # sns.heatmap(tfidf_df.T, cmap='viridis', cbar=True)
    # plt.title('Heatmap of TF-IDF Scores infoboxes')
    # plt.xlabel('Document')
    # plt.ylabel('Term')
    # plt.show()

    # # Vectorize the corpus using one element, e.g., 'infoboxes'
    # tfidf_matrix = search_engine.tfidf_matrices['infoboxes'].toarray()

    # # Apply PCA to reduce to 3D
    # pca = PCA(n_components=3)
    # reduced_matrix = pca.fit_transform(tfidf_matrix)

    # # Plotting the results in 3D
    # fig = plt.figure(figsize=(10, 8))
    # ax = fig.add_subplot(111, projection='3d')

    # ax.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], reduced_matrix[:, 2], c='blue', marker='o')

    # ax.set_title('3D Visualization of TF-IDF Matrix infoboxes')
    # ax.set_xlabel('PCA Component 1')
    # ax.set_ylabel('PCA Component 2')
    # ax.set_zlabel('PCA Component 3')

    doc_lengths = [len(doc["infoboxes"].split()) for doc in corpus]
    plt.figure(figsize=(10, 6))
    sns.histplot(doc_lengths, bins=30, kde=True)
    plt.title('Distribution of Document\'s infoboxes Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

## Query Section

In [55]:
    # Example search query
    query = "film television Chldren"
    
    search_engine.load_model("trec-tot")
    
    # print(search_engine.)
    
    ranked_indices, scores, corrected_query = search_engine.search(query, weights)

    # عرض التصحيحات للمستخدم
    if corrected_query != query.lower() :
        print("Did you mean: " + str(corrected_query))
        
    # Display results
    print("\n\nRanked Documents:\n")
    for index in ranked_indices[:10]:  # Limit to top 10 results
        print(f"Document {index} - Score: {scores[index]:.4f}")
        print(f"Title: {corpus[index]['page_title']}")
        print(f"Classes: {corpus[index]['wikidata_classes']}")
        # Uncomment to display more details
        # print(f"Text: {corpus[index]['text']}")
        # print(f"Sections: {corpus[index]['sections']}")
        # print(f"Infoboxes: {corpus[index]['infoboxes']}")
        print("-" * 50)

Did you mean: film television children


Ranked Documents:

Document 332 - Score: 1.3650
Title: All My Children
Classes: television program
--------------------------------------------------
Document 81 - Score: 1.2435
Title: Married... with Children
Classes: television program
--------------------------------------------------
Document 303 - Score: 1.1785
Title: You Can't Do That on Television
Classes: television program
--------------------------------------------------
Document 220 - Score: 1.1496
Title: Marty (The Philco Television Playhouse)
Classes: television program
--------------------------------------------------
Document 702 - Score: 1.1164
Title: Where Are My Children?
Classes: film
--------------------------------------------------
Document 263 - Score: 1.0153
Title: Sesame Street
Classes: television program
--------------------------------------------------
Document 106 - Score: 0.9937
Title: Press Gang
Classes: television program
----------------------------------------

In [56]:
@app.route('/search', methods=['GET'])
def search():
    query = request.args.get('query', '')
    type_dataset = request.args.get('type_dataset', '')

    if not query:
        return jsonify({
            'message': 'Query parameter is required'
        }), 400
        
    if not type_dataset:
        return jsonify({
            'message': 'Type dataset parameter is required'
        }), 400

    if type_dataset == "1":
        search_engine.load_model("trec-tot")
    elif type_dataset == "2":
        search_engine.load_model("webis-touche2020")
    else:
        return jsonify({
            'message': "Type dataset not valid"
        }), 400
        
    ranked_indices, scores, corrected_query = search_engine.search(query, weights)
    
    results = []
    for index in ranked_indices[:10]:
        doc = search_engine.documents[index]
        results.append({
            'title': doc['page_title'],
            'wikidata_classes': doc['wikidata_classes'],
            'text_snippet': ' '.join(doc['text'].split()[:30]),
            'similarity_score': f'{scores[index]:.4f}'
        })

    response = {
        'corrected_query': corrected_query if corrected_query.lower() != query.lower() else None,
        'results': results
    }
    return jsonify(response)

In [58]:
if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1