In this notebook, the goal is to create an indexer for the crawled data stored in a database.<br>
The database is <i>database.csv</i> which is in the same working directory.<br><br>
seed_url = https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning

At the end, query search is implemented and the relevant results are displayed.

Before running this notebook, please check the <i>scheduled_crawler.ipynb</i> notebook.

# Import Required Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# import warnings
# warnings.filterwarnings("ignore")

# Create some helper functions.

### Create list of CGL profiles

In [2]:
def CGLprofiles():
    """
    Return the names of the people listed on the CGL "persons" page.

    The page is fetched from ``seed_url + "/persons"``; ``seed_url`` is a
    module-level name and must be defined before this function is called.
    The text of every ``<a class="link person">`` element is collected.

    Returns:
        list of str: the profile names, in the order they appear on the page.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    profiles_link = seed_url + "/persons"

    # a timeout keeps the notebook from hanging forever on a stalled connection
    profile_page = requests.get(profiles_link, timeout=30)
    # fail loudly instead of silently returning an empty profile list
    profile_page.raise_for_status()

    soup3 = BeautifulSoup(profile_page.text, "html.parser")
    return [person.get_text() for person in soup3.find_all("a", class_="link person")]

###### Demonstration

In [3]:
# the given seed url; CGLprofiles() reads this module-level name
seed_url = "https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning"
# NOTE(review): `page` and `soup` are not referenced by any other cell visible in
# this notebook - confirm before removing this extra request
page = requests.get(seed_url)
soup = BeautifulSoup(page.text, "html.parser")
# `profiles` is the module-level list that displayResult() checks authors against
profiles = CGLprofiles()
profiles

['Sian Alsop',
 'Dimitar Angelov',
 'Rami Ayoubi',
 'Ema Baukaite',
 'Julia Carroll',
 'Jacqueline Cawston',
 'Megan Crawford',
 'QueAnh Dang',
 'Alun DeWinter',
 'Ken Fero',
 'Mark Hodds',
 'Sylwia Holmes',
 'Elizabeth Horton',
 'Jaya Jacobo',
 'Emmanuel Johnson',
 'Mehmet Karakus',
 'Luca Morini',
 'Marina Orsini-Jones',
 'Charlotte Price',
 'Steve Raven',
 'Carlo Tramontano',
 'Katherine Wimpenny']

### Display only CGL authors

In [4]:
def displayResult(link):
    """
    Print the title, link, authors and date of one publication page.

    Authors whose name appears in the module-level ``profiles`` list (the CGL
    profiles) are printed together with a link to their profile page; all
    other authors are printed by name only.

    Args:
        link (str): URL of the publication page.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    paper = requests.get(link, timeout=30)
    soup2 = BeautifulSoup(paper.text, "html.parser")

    # select_one() takes a CSS selector, so the class has to be part of the
    # selector string. The previous call passed it as a ``class_`` keyword,
    # which is not a class filter there, so it effectively matched the first
    # <p> on the page. Fall back to that old match if the class is absent.
    authors_tag = soup2.select_one("p.relations.persons")
    if authors_tag is None:
        authors_tag = soup2.select_one("p")

    if authors_tag is None:
        authors = []
    else:
        # drop parenthesised annotations after a name, then split the list
        authors_text = re.sub(r'\s*\([^)]*\)', '', authors_tag.get_text())
        authors = authors_text.split(", ")  # list of authors

    title_tag = soup2.select_one('h1')
    title = title_tag.get_text() if title_tag is not None else ""

    date_tag = soup2.select_one('.date')
    date = date_tag.get_text() if date_tag is not None else ""

    print(title)
    print(link, end='\n\n')

    # to create links to CGL authors
    print("Authors:")
    for author in authors:
        if author in profiles:
            print(author)
            # NOTE(review): the profile URL is derived from the author's name;
            # this assumes the portal's URL slug is the lower-cased,
            # hyphenated name - verify for names with special characters
            print("https://pureportal.coventry.ac.uk/en/persons/" + author.lower().replace(" ", "-"))
        else:
            print(author, end='\n\n')
    print("\nDate: ", date)
    

###### Demonstration

In [5]:
displayResult("https://pureportal.coventry.ac.uk/en/publications/carnival-of-invention")

Carnival of invention
https://pureportal.coventry.ac.uk/en/publications/carnival-of-invention

Authors:
Jenny Fennessy

Sandie Woods

Helen Johnson

Carol Rivas

David Norbury

Isilda Almeida-Harvey

Jessica Moriarty

Katherine Wimpenny
https://pureportal.coventry.ac.uk/en/persons/katherine-wimpenny
Kerensa Bushell

Polly Blake


Date:  1 May 2019


# Preprocessing

This is a helper function which preprocesses the text. <br>
It does the following: <br> 
<br>
• convert text into lower case<br>
• remove special characters, punctuations and numbers<br>
• tokenize into words<br>
• remove stop words<br>
• stemming<br>
<br>
After the above preprocessing task, it again joins all the words back into a single string.

In [6]:
def preprocessText(text):
    """
    Normalise a piece of text so that documents and queries can be compared.

    Steps: lower-case, replace everything that is not an ASCII letter or
    whitespace with a space, split into words, drop English stop words,
    stem each word with the Porter stemmer, and join the result with spaces.

    Args:
        text (str): raw text.

    Returns:
        str: the processed words joined by single spaces.
    """
    # lowercase
    text = text.lower()

    # Replace special characters, punctuation and numbers with a space.
    # Deleting them outright would glue neighbouring words together
    # ("knowledge-exchange" -> "knowledgeexchange"), making the individual
    # words unsearchable.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # tokenize into words
    words = text.split()

    # remove stop words (a set gives constant-time membership tests)
    sw = set(stopwords.words('english'))
    words = [word for word in words if word not in sw]

    # stemming
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words]

    # join the processed words back into a single string
    return " ".join(words)


###### Demonstration

In [7]:
preprocessText("""This is a random text. 123 *&^%]} THE preprocessText 
    FUNCTION CREATES A CLEAN & "COnsistent" text string. wink wink ;) """)

'random text preprocesstext function creat clean consist text string wink wink'

# Function to create Inverted Index

In [8]:
def createInvertedIndex(document, inverted_index):
    """
    Fill ``inverted_index`` with term -> set-of-document-positions entries.

    ``document`` is an iterable of whitespace-separated strings; the position
    of each string in the iteration order is used as its document id.
    ``inverted_index`` is a dict-like object that is updated in place and
    also returned.
    """
    for position, entry in enumerate(document):
        for token in entry.split():
            # reuse the existing postings set, or start a new one for this token
            inverted_index.setdefault(token, set()).add(position)
    return inverted_index

###### Demonstration

In [9]:
# create test doc
test_doc = pd.Series(['random person music','avishek','music random random cat',
                      'person cat text'])

# create an empty dictionary for the inverted index
# NOTE(review): the `list` factory is never triggered here - createInvertedIndex
# checks membership first and stores sets as values
test_index = defaultdict(list)

# create inverted index (being the last expression, the returned index is displayed)
createInvertedIndex(test_doc, test_index)

# print(test_doc)

defaultdict(list,
            {'random': {0, 2},
             'person': {0, 3},
             'music': {0, 2},
             'avishek': {1},
             'cat': {2, 3},
             'text': {3}})

### Search Function

In [10]:
def search(query):
    """
    Look up ``query`` in the index and print the matching publications.

    The query is preprocessed like the indexed documents, candidate documents
    are collected from ``inverted_index_db``, ranked by the cosine similarity
    of their tf-idf vectors to the query, and printed through
    ``displayResult``. The number of results is printed at the end.
    """
    cleaned_query = preprocessText(query)

    # gather candidate document ids; a set avoids duplicates across terms
    candidates = set()
    for token in cleaned_query.split():
        if token in inverted_index_db:
            candidates.update(inverted_index_db[token])
    relevant_docs = list(candidates)

    if relevant_docs:
        # score only the candidate rows of the tf-idf matrix against the query
        query_vector = vectorizer.transform([cleaned_query])
        scores = cosine_similarity(query_vector, tfidf_matrix[relevant_docs])[0]

        # highest similarity first
        ranked = sorted(zip(relevant_docs, scores), key=lambda pair: pair[1], reverse=True)

        for rank, (doc_id, _score) in enumerate(ranked, start=1):
            print(rank, '. ', end='')
            displayResult(doc_to_index.Publication_Link[doc_id])
            print("----------------------------------------------------------------------\n")
    else:
        print('No matching result for "', query, '". Please try another.')

    print("Total results displayed: ", len(relevant_docs))

# Read the database and Create Inverted Index

In [11]:
# read the database
doc_to_index = pd.read_csv("database.csv")

# combine Title and Authors for indexing
# Empty CSV cells are read as NaN; concatenating with NaN yields NaN, which
# preprocessText cannot handle (it calls .lower()), so blank them out first.
doc_to_index['Text'] = (
    doc_to_index['Title'].fillna('') + ' ' + doc_to_index['Authors'].fillna('')
)

# preprocessing
text_doc = doc_to_index.Text
processed_text_doc = text_doc.apply(preprocessText)

# create an empty dictionary for the inverted index
# (the postings stored by createInvertedIndex are sets, so use a set factory)
inverted_index_db = defaultdict(set)

# create inverted index
createInvertedIndex(processed_text_doc, inverted_index_db)

# tfidf vectorizer, fitted on the same processed text that was indexed
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_text_doc)

# the given seed url
seed_url = "https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning"

# CGL profiles list
profiles = CGLprofiles()

### Demonstration of the Search

Take query from the user

In [12]:
# # uncomment and run this if you want to take input from input box
# query = input("enter: ")
# print('')
# print("----------------------------------------------------------------------\n")
# search(query)

In [13]:
# 1. when query is empty
search("")

No matching result for "  ". Please try another.
Total results displayed:  0


In [14]:
# 2. when query is irrelevant
# like my name, avishek
search("avishek kc")

No matching result for " avishek kc ". Please try another.
Total results displayed:  0


In [15]:
# 3. when publication exists
search("engine")

1 . Summary matters: How lecturers review and preview information in engineering lectures
https://pureportal.coventry.ac.uk/en/publications/summary-matters-how-lecturers-review-and-preview-information-in-e

Authors:
Sian Alsop
https://pureportal.coventry.ac.uk/en/persons/sian-alsop

Date:  5 Jul 2018
----------------------------------------------------------------------

2 . Exploring differential engagement with mathematics support from an engineering student focus
https://pureportal.coventry.ac.uk/en/publications/exploring-differential-engagement-with-mathematics-support-from-a

Authors:
Farhana Gokhool

Duncan Lawson

Mark Hodds
https://pureportal.coventry.ac.uk/en/persons/mark-hodds
Farzana Aslam


Date:  23 Dec 2021
----------------------------------------------------------------------

Total results displayed:  2


In [16]:
# 4. with punctuations
search("_{;engine!!!@#$%")

1 . Summary matters: How lecturers review and preview information in engineering lectures
https://pureportal.coventry.ac.uk/en/publications/summary-matters-how-lecturers-review-and-preview-information-in-e

Authors:
Sian Alsop
https://pureportal.coventry.ac.uk/en/persons/sian-alsop

Date:  5 Jul 2018
----------------------------------------------------------------------

2 . Exploring differential engagement with mathematics support from an engineering student focus
https://pureportal.coventry.ac.uk/en/publications/exploring-differential-engagement-with-mathematics-support-from-a

Authors:
Farhana Gokhool

Duncan Lawson

Mark Hodds
https://pureportal.coventry.ac.uk/en/persons/mark-hodds
Farzana Aslam


Date:  23 Dec 2021
----------------------------------------------------------------------

Total results displayed:  2


In [17]:
# 5. with stop words 
search("how to be a nigerian")

1 . What does it Mean to be Educated in Nigerian Student Experience?
https://pureportal.coventry.ac.uk/en/publications/what-does-it-mean-to-be-educated-in-nigerian-student-experience

Authors:
Emmanuel Johnson
https://pureportal.coventry.ac.uk/en/persons/emmanuel-johnson

Date:  1 Jun 2023
----------------------------------------------------------------------

2 . Becoming Nigerian
https://pureportal.coventry.ac.uk/en/publications/becoming-nigerian

Authors:
Emmanuel Johnson
https://pureportal.coventry.ac.uk/en/persons/emmanuel-johnson
Godswill Ezeonyeka

Aisha Houmadi

Inemesit Inyang

Lizzy Johnson

Misal Eskender


Date:  25 Mar 2023
----------------------------------------------------------------------

Total results displayed:  2


In [18]:
# 6. ranked retrieval
search("global")

1 . Global Learning and Assessment; How can global citizenship-oriented curricula enhance teaching training?
https://pureportal.coventry.ac.uk/en/publications/global-learning-and-assessment-how-can-global-citizenship-oriente

Authors:
Arinola Adefila

QueAnh Dang
https://pureportal.coventry.ac.uk/en/persons/queanh-dang
Alun DeWinter
https://pureportal.coventry.ac.uk/en/persons/alun-dewinter
Luca Morini
https://pureportal.coventry.ac.uk/en/persons/luca-morini
Katherine Wimpenny
https://pureportal.coventry.ac.uk/en/persons/katherine-wimpenny

Date:  2022
----------------------------------------------------------------------

2 . Global knowledge-exchange for widening participation in learning through Open Education Practices
https://pureportal.coventry.ac.uk/en/publications/global-knowledge-exchange-for-widening-participation-in-learning-

Authors:
Katherine Wimpenny
https://pureportal.coventry.ac.uk/en/persons/katherine-wimpenny

Date:  13 May 2019
--------------------------------------

In [19]:
# 7. search by authors name
search("mark")

1 . The impact of Covid-19 on mathematical entry competencies: 1 year on
https://pureportal.coventry.ac.uk/en/publications/the-impact-of-covid-19-on-mathematical-entry-competencies-1-year-

Authors:
Mark Hodds
https://pureportal.coventry.ac.uk/en/persons/mark-hodds

Date:  18 Jan 2023
----------------------------------------------------------------------

2 . Changes in student entry competencies 2001 - 2017
https://pureportal.coventry.ac.uk/en/publications/changes-in-student-entry-competencies-2001-2017

Authors:
Mark Hodds
https://pureportal.coventry.ac.uk/en/persons/mark-hodds
Jia Shao

Duncan Lawson


Date:  10 Dec 2020
----------------------------------------------------------------------

3 . The early impact of the Covid-19 pandemic on mathematical competencies on entry into a UK university
https://pureportal.coventry.ac.uk/en/publications/the-early-impact-of-the-covid-19-pandemic-on-mathematical-compete

Authors:
Mark Hodds
https://pureportal.coventry.ac.uk/en/persons/mark-hodd

### Remarks

Notice:<br>
1. Only publications from Centre for Global Learning are displayed.<br>
2. At least one of the co-authors is from CGL. Link is displayed for CGL authors.<br>
3. Publications can also be searched by author's name.<br>
4. Ranked retrieval using cosine similarity.<br>
5. The results are numbered. <br>
6. Total number of results displayed each time is shown at the end.

# User Interface

I have implemented the user interface using tkinter. The code is easy to understand and write.

When you run the code below, a separate window opens.

In [20]:
import tkinter as tk
from tkinter import *
from tkinter import ttk
import sys
import webbrowser
import re
import io

def clickHyperlink(url):
    """Open ``url`` with the default web browser, asking for a new window."""
    webbrowser.open(url, new=1)
    
def createHyperlinks(text_widget, text):
    """
    Append ``text`` to ``text_widget`` and make every URL in it clickable.

    The text is inserted piece by piece: plain segments are inserted as they
    are, and each ``http(s)://...`` match is inserted with its own tag that is
    styled as a link and bound to ``clickHyperlink``.

    Tagging at insertion time gives every URL occurrence exactly one tag.
    Searching the widget for each URL instead stacks several tags on an
    occurrence when a URL is repeated (one click then opens it several times)
    and also tags a URL inside any longer URL that starts with it.

    Args:
        text_widget: the ``tk.Text`` widget to append to.
        text (str): the text to display.
    """
    cursor = 0  # position in ``text`` up to which everything has been inserted
    for match in re.finditer(r"https?://\S+", text):
        # plain text between the previous URL (or the start) and this URL;
        # inserted at the end of the widget it does not pick up the link tag
        text_widget.insert(tk.END, text[cursor:match.start()])

        url = match.group()
        tag_name = f"hyperlink_{len(text_widget.tag_names())}"
        text_widget.insert(tk.END, url, tag_name)
        text_widget.tag_config(tag_name, foreground='blue', underline=True)
        # bind the URL as a default argument so each tag keeps its own link
        text_widget.tag_bind(tag_name, "<Button-1>", lambda event, link=url: clickHyperlink(link))

        cursor = match.end()

    # whatever follows the last URL (or the whole text if there was none)
    text_widget.insert(tk.END, text[cursor:])

def clickSearch(*args):
    """
    Run ``search`` on the text in the query box and show its output.

    ``search`` reports its results with ``print``, so stdout is temporarily
    redirected into a buffer and the captured text is then written to
    ``text_widget`` with clickable links.

    Args:
        *args: ignored; present so the function can serve both as a button
            command (no arguments) and as an event handler (one event).
    """
    value = query.get()

    console_output = sys.stdout
    captured = io.StringIO()
    sys.stdout = captured
    try:
        search(value)
    except Exception as error:
        # A GUI callback has nowhere to propagate to, so report the failure
        # in the results pane (stdout is still redirected at this point).
        print(f"\nSearch failed: {error}")
    finally:
        # always restore stdout, otherwise later prints in the notebook vanish
        sys.stdout = console_output

    # clear previous output, then show the new one
    text_widget.delete("1.0", tk.END)
    createHyperlinks(text_widget, captured.getvalue())

# main window
root = tk.Tk()
root.title("CU CGL Search Engine")

# frame that holds every widget and stretches with the window
mainframe = ttk.Frame(root, padding="3 3 12 12")
mainframe.grid(column=0, row=0, sticky=(tk.N, tk.W, tk.E, tk.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)

# top row: label, query entry, search button
ttk.Label(mainframe, text="Enter text").grid(column=0, row=0, sticky=tk.E)

query = tk.StringVar()
query_entry = ttk.Entry(mainframe, width=25, textvariable=query)
query_entry.grid(column=1, row=0, sticky=(tk.W, tk.E))

ttk.Button(mainframe, text="Search", command=clickSearch).grid(column=2, row=0, sticky=tk.W)

# second row: results pane spanning all three columns
text_widget = tk.Text(mainframe, wrap="word", font=("Arial", 12))
text_widget.grid(column=0, row=1, columnspan=3, sticky=(tk.N, tk.W, tk.E, tk.S))

# resize weights: the middle column and the results row take most of the extra space
for col_index, col_weight in enumerate((1, 3, 1)):
    mainframe.columnconfigure(col_index, weight=col_weight)
for row_index, row_weight in enumerate((1, 3)):
    mainframe.rowconfigure(row_index, weight=row_weight)

# uniform padding around every widget in the frame
for child in mainframe.winfo_children():
    child.grid_configure(padx=5, pady=5)

query_entry.focus()
# pressing Enter triggers the same handler as the Search button
root.bind("<Return>", clickSearch)

root.mainloop()

<i>Thank you for reading my notebook.
<br>
<br>Avishek K C<br>
2023</i>