<a href="https://colab.research.google.com/github/ayomibamm/Search-Engine-using-NLP-Techniques./blob/main/Search_Engine_using_NLP_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Robot.txt check**

https://pureportal.coventry.ac.uk/robots.txt:

- User-Agent: *

- Crawl-Delay: 5

- Disallow: /*?*format=rss

- Disallow: /*?*export=xls

- Sitemap: https://pureportal.coventry.ac.uk/sitemap.xml

## **Libraries importation**

In [1]:
import json
import string
import pandas as pd
import numpy as np
from time import sleep
from bs4 import BeautifulSoup as bs
import requests
from collections import Counter
import urllib.robotparser
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# RobotFileParser object to help read the url robots.txt file
rule_preservation = urllib.robotparser.RobotFileParser()
rule_preservation.set_url("https://pureportal.coventry.ac.uk/en/organisations/research-centre-for-computational-science-and-mathematical-modell/robots.txt")
rule_preservation.read()

# Ensuring that required url can be crawled
url = "https://pureportal.coventry.ac.uk/en/organisations/research-centre-for-computational-science-and-mathematical-modell/page.html"
if rule_preservation.can_fetch("*", url):
    print("Website crawling allowed")
else:
    print("Website crawling disallowed")

Website crawling allowed


## **Web crawler processing**

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

data =[]

def getInfo(page):
    url = f'https://pureportal.coventry.ac.uk/en/organisations/research-centre-for-computational-science-and-mathematical-modell/publications/?page={page}&pagesize=50'
    response = requests.get(url, headers=headers)
    html = response.text
    soup = bs(html, "html.parser")
    box = soup.find_all('h3', class_='title')

    for dom in box:
        pub = {}
        title =dom.string.lower()   # extraction of publication title
        link = dom.find('a', class_ = 'link')
        links = link.get('href') # extraction of publication links

        html2 = requests.get(links)
        soup2 = bs(html2.text, 'html.parser') # application of beautiful soup module to each link to extract further information

        pub_abstract = soup2.find('div', class_ = 'textblock') # retrieving publication abstract

        if pub_abstract:
            abstracts = pub_abstract.text.strip()
            abstract = abstracts.partition('.')[0] + '...' # ensuring that only the first line of abstract is extracted
        else:
            abstract= "not available" # some of the publications do not have this information

        aut_details = soup2.find_all('a', class_ = 'link person')
        aut_data = []
        for author in aut_details:
            aut_name = author.get_text().strip() # authors name and link
            if aut_name:
                aut_names = aut_name
                aut_links = author.get('href')
                aut_data.append(f'{aut_names}, {aut_links}')

        pub_date = soup2.find('span', class_ = 'date').string.strip()

        if aut_data: # all information appended only if at least 1 author has both information present
            pub['Publication title'] = title
            pub['Link'] = links
            pub['Abstract'] = abstract
            pub['Date'] = pub_date
            pub['Author information'] = aut_data
            data.append(pub)

    return

# crawl delay of 5 seconds per page
for x in range(0,5):
    getInfo(x)
    sleep(5)


In [None]:
data

In [5]:
# total number publications extracted
len(data)

228

## **Inverted Index implementation**

In [6]:
# data saved as a json data file
with open('publication.json', 'w') as f:
    json.dump(data, f)

publications = "/content/publication.json"

#function for inverted_index for the unique terms in the documents
ps = PorterStemmer()
def data_processing(publications):
    inverted_index={}
    cleaned_titles = []
    with open('publication.json', 'r') as f:
        publications = json.load(f)

    data_titles = [publication['Publication title'] for publication in publications]

    doc_id = 0

    # data pre-processing techniques application
    for titles in data_titles:
        tokenised_titles = word_tokenize(titles)
        tokenised_lower = [w.lower() for w in tokenised_titles if w.isalnum()]
        stopped_titles = [w for w in tokenised_lower if w not in stopwords.words('english')]
        stemmed_titles = [ps.stem(w) for w in stopped_titles]
        cleaned_titles.append(stemmed_titles)

        # initialisation of the indexer for each unique terms.
        for stemmed in stemmed_titles:
            value = inverted_index.get(stemmed)
            if value == None:
                count = [1, [doc_id]]
                inverted_index[stemmed] = count
            else:
                count = inverted_index[stemmed]
                if doc_id not in count[1]:
                    count[1].append(doc_id)
                    count[0] += 1

        doc_id += 1

    return inverted_index, cleaned_titles, data_titles

inverted_index, cleaned_titles, data_titles = data_processing(publications)

In [None]:
inverted_index

In [None]:
data_titles

### **Example of implementation- Limited to just 1 to 2 words**

In [9]:

def get_publication_info(query, cleaned_titles, inverted_index, data):
    # Flatten the list of titles
    flatten_list = [item for items in cleaned_titles for item in items]

    # Tokenize, lowercase, remove stopwords, and stem the query
    token_query = word_tokenize(query)
    token_lower = [w.lower() for w in token_query if w.isalnum()]
    stopped_query = [w for w in token_lower if w not in stopwords.words('english')]
    ps = PorterStemmer()
    stemmed = [ps.stem(w) for w in stopped_query]

    if any(word not in flatten_list for word in stemmed):
        print('please try searching another term or phrase')
        return None

    # Check the length of the stemmed query
    if len(stemmed) == 1 and (stemmed[-1] in flatten_list):
        infor = inverted_index[stemmed[-1]]
        post_list = infor[1]

    elif len(stemmed) == 2:
        result = []
        for word in stemmed:
            if word in flatten_list:
                infor = inverted_index[word]
                lists = infor[1]
                result.append(lists)
            else:
                print(f'{word} is not present in any publication')
                post_list = lists
                print(post_list)

        if len(result) > 0:
            post_list = list(set.intersection(*map(set,result)))

    else:
        print('please try searching another term or phrase')
        return None

    Result_list = []
    associated_titles = [data[i] for i in post_list]
    for information in associated_titles:
        date, link = information['Date'], information['Link']
        info = f"'{date}': {link}"
        Result_list.append(info)

    return Result_list

# Example usage:
# Replace the arguments with your actual data
user_input = input('Query: ')
get_publication_info(user_input, cleaned_titles, inverted_index, data)


Query: machine learning


["'25 Sept 2021': https://pureportal.coventry.ac.uk/en/publications/detection-of-sleep-apnea-using-machine-learning-algorithms-based-",
 "'14 Nov 2022': https://pureportal.coventry.ac.uk/en/publications/sc-square-future-progress-with-machine-learning",
 "'20 Apr 2022': https://pureportal.coventry.ac.uk/en/publications/stable-likelihood-computation-for-machine-learning-of-linear-diff",
 "'5 Jan 2022': https://pureportal.coventry.ac.uk/en/publications/using-machine-learning-for-anomaly-detection-on-a-system-on-chip-",
 "'30 Jun 2022': https://pureportal.coventry.ac.uk/en/publications/using-machine-learning-for-anomaly-detection-on-a-system-on-chip--2",
 "'2022': https://pureportal.coventry.ac.uk/en/publications/machine-learning-for-computer-algebra",
 "'23 Aug 2022': https://pureportal.coventry.ac.uk/en/publications/using-machine-learning-in-sc2",
 "'9 Jun 2021': https://pureportal.coventry.ac.uk/en/publications/using-machine-learning-algorithms-to-develop-a-clinical-decision-",
 "'8 Jul

## **Implementation of Query Processor**

In [11]:
docid = []
for i in range(1, (len(data)+1)):
    docid.append(i)

# dictionary of data with docid as key and main data as nested dictionary
#- this will be the processor result
data_dict = dict(zip(docid, data))

# convert data to a data frame with docid as index to help visualise each data better
df = pd.DataFrame(data)
df['doc_id'] = docid
df = df.set_index('doc_id')

In [15]:
# combining the tokenised words in cleaned titles to form a cleaned sentence for vectorization
filtered_docs = []
for title in cleaned_titles:
    title = ' '.join(title)
    filtered_docs.append(title)
filtered_docs

def queryprocessor(query):
    tokens = word_tokenize(query)
    tmp = ""
    for w in tokens:
        if w not in stopwords.words('english'):
            tmp += ps.stem(w) + " "

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(filtered_docs) # vectorization of the document
    query_vec = vectorizer.transform([tmp]) # vectorization of query
    results = cosine_similarity(X,query_vec).reshape((-1,)) # to measure document similary
    res = results.tolist()
    similarity_data = {'doc_id': docid, 'similarity_score': res, 'title': data_titles}
    df_similarity = pd.DataFrame(similarity_data) # dataframe to visualise the similarity of the query to each doc
    df_similarity.sort_values(by=['similarity_score'], inplace=True, ascending=False) # values arranged in ascending order
    query_relevance = df_similarity[df_similarity['similarity_score'] > 0] #only similarity values > 0 printed
    post_lists = (query_relevance['doc_id']).tolist()

    # retrieves publication details based on doc ID from post list
    if post_lists:
        for i in post_lists:
            idd = i
            for id, info in data_dict.items():
                if idd == id:
                    print()
                    for key in info:
                        result = print(key + ':', info[key])
    else:
        print('please try searching another term or phrase')

    return post_lists, results, query_relevance

user_input = input('Query: ')
post_lists, results, query_relevance = queryprocessor(user_input)

Query: machin learnnig

Publication title: machine learning for computer algebra
Link: https://pureportal.coventry.ac.uk/en/publications/machine-learning-for-computer-algebra
Abstract: not available
Date: 2022
Author information: ['Rashid Barket, https://pureportal.coventry.ac.uk/en/persons/rashid-barket', 'Tereso del Río, https://pureportal.coventry.ac.uk/en/persons/tereso-del-r%C3%ADo-almajano', 'Matthew England, https://pureportal.coventry.ac.uk/en/persons/matthew-england']

Publication title: using machine learning in sc2
Link: https://pureportal.coventry.ac.uk/en/publications/using-machine-learning-in-sc2
Abstract: This talk exposes many possible uses of Machine Learning (ML) in the context of SC2, and how this approach differs from human-made heuristics...
Date: 23 Aug 2022
Author information: ['Tereso del Río, https://pureportal.coventry.ac.uk/en/persons/tereso-del-r%C3%ADo-almajano']

Publication title: sc-square: future progress with machine learning
Link: https://pureportal.c

In [None]:
query_relevance

Unnamed: 0,doc_id,similarity_score,title
117,118,0.496218,machine learning for computer algebra
149,150,0.476305,using machine learning in sc2
137,138,0.423423,sc-square: future progress with machine learning
64,65,0.343662,opt-rnn-dbsvm: optimal recurrent neural networ...
147,148,0.333344,using machine learning for anomaly detection o...
148,149,0.333344,using machine learning for anomaly detection o...
211,212,0.308186,using machine learning algorithms to develop a...
139,140,0.257561,stable likelihood computation for machine lear...
215,216,0.254528,a leap from randomized to quantum clustering w...
129,130,0.242446,predicting primary sequence-based protein-prot...


In [16]:
import pandas as pd

# Load JSON data
with open('publication.json', 'r') as f:
    json_data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(json_data)

# Save as CSV
df.to_csv('publications.csv', index=False)