In [None]:
# Step 1: Setting Up the Environment
!pip install python-terrier
!pip install nltk
!pip install beautifulsoup4
!pip install requests
!pip install pandas



In [None]:
# Step 2: Importing Libraries and Initializing PyTerrier
import pyterrier as pt
# Only initialise PyTerrier when it has not been started yet, so this cell can be re-run.
# NOTE(review): newer PyTerrier releases reportedly deprecate pt.started()/pt.init() —
# confirm against the installed version.
if not pt.started():
    pt.init()

import nltk
# Download the NLTK resources used by the preprocessing step below
# (the English stopword list and the 'punkt' tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import requests
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Step 3: Data Collection and Web Crawling
# Sample document collection for academic papers from arXiv
# Each entry is an arXiv abstract page; the position in this list is used
# further down to build the document id ("crawled_<position>").
urls = [
    "https://arxiv.org/abs/2201.00001",
    "https://arxiv.org/abs/2201.00002",
    "https://arxiv.org/abs/2201.00003",
]

def crawl_web_page(url, timeout=10):
    """Download an arXiv abstract page and return its indexable text.

    The returned string is the page's title, authors, abstract and
    ``<meta name="keywords">`` content joined by single spaces. Sections that
    are missing from the page contribute an empty string instead of raising.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The combined text, or ``None`` when the request fails, the response
        status is not 200, or no text at all could be extracted.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network errors and timeouts are reported like any other failed fetch,
        # so the caller's "skip on None" logic keeps the crawl going.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    def _section_text(tag_name, css_class):
        # soup.find returns None when the element is absent; fall back to ''.
        tag = soup.find(tag_name, {'class': css_class})
        return tag.get_text(strip=True) if tag is not None else ''

    # Extract the relevant sections (title, authors, abstract, and keywords)
    title = _section_text('h1', 'title')
    authors = _section_text('div', 'authors')
    abstract = _section_text('blockquote', 'abstract')
    keywords_tag = soup.find('meta', {'name': 'keywords'})
    # .get avoids a KeyError for a keywords tag that has no content attribute.
    keywords = (keywords_tag.get('content') or '') if keywords_tag else ''

    text = " ".join((title, authors, abstract, keywords))
    # A page where nothing matched yields only separators; treat it as a failed crawl.
    if not text.strip():
        return None
    return text

# Crawl every URL, keeping only the pages that produced some text.
# The docno is derived from the URL's position in `urls`, so ids stay stable
# even when an earlier page is skipped.
crawled_docs = []
for i, url in enumerate(urls):
    text = crawl_web_page(url)
    if text:
        crawled_docs.append({"docno": f"crawled_{i}", "text": text})

# Convert crawled documents to DataFrame
# Passing the columns explicitly guarantees that "docno" and "text" exist even
# when no page could be crawled; otherwise the preprocessing step fails with a KeyError.
docs_df = pd.DataFrame(crawled_docs, columns=["docno", "text"])

In [None]:
# Step 4: Preprocessing the Text
def preprocess_text(text):
    """Lower-case, tokenize, drop English stopwords and Porter-stem a string.

    Args:
        text: Raw text to normalise.

    Returns:
        The stemmed, non-stopword tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(text.lower()):
        if token in english_stop_words:
            continue
        stems.append(porter.stem(token))
    return ' '.join(stems)

# Store the normalised text next to the raw text, then show the resulting frame.
processed_column = docs_df['text'].apply(preprocess_text)
docs_df['processed_text'] = processed_column
print("Combined and Preprocessed Documents:", docs_df, sep="\n")

Combined and Preprocessed Documents:
       docno                                               text  \
0  crawled_0  Title:Modeling Advection on Directed Graphs us...   
1  crawled_1  Title:Time-Dependent Duhamel Renormalization m...   
2  crawled_2  Title:Simulating local fields in carbon nanotu...   

                                      processed_text  
0  titl : model advect direct graph use matérn ga...  
1  titl : time-depend duhamel renorm method multi...  
2  titl : simul local field carbon nanotub reinfo...  


In [None]:
# Step 5: Indexing the Documents
# Create an indexer that writes to ./index; overwrite=True is passed so an
# index left over from a previous run does not block re-running this cell.
indexer = pt.DFIndexer("./index", overwrite=True)
# Index the preprocessed text, passing each document's docno alongside it.
index_ref = indexer.index(docs_df["processed_text"], docs_df["docno"])
# Open the index from the returned reference; `index` is what the retrieval
# model in the next step is built on.
index = pt.IndexFactory.of(index_ref)
print("Index collection statistics:")
print(index.getCollectionStatistics().toString())

Index collection statistics:
Number of documents: 3
Number of terms: 181
Number of postings: 209
Number of fields: 0
Number of tokens: 306
Field names: []
Positions:   false



In [None]:
# Step 6: Implementing the Retrieval Model
# Retriever over the index built in Step 5, scoring documents with the BM25 weighting model.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [None]:
# Step 7: Search Functionality
def search(query):
    """Run a free-text query against the BM25 retriever.

    The query goes through the same preprocessing as the documents
    (lower-casing, stopword removal, stemming). Punctuation is then replaced
    by spaces: ``word_tokenize`` keeps characters such as ``?``, ``:`` or
    ``'`` as tokens, and Terrier's query parser treats several of them as
    operators or rejects them outright.

    Args:
        query: Raw user query.

    Returns:
        Whatever ``bm25.search`` returns for the sanitised query (a results
        frame that includes a ``docno`` column, as used by ``evaluate``).
    """
    query = preprocess_text(query)
    # Keep letters, digits and whitespace; everything else becomes a separator.
    sanitised = "".join(ch if ch.isalnum() or ch.isspace() else " " for ch in query)
    # Collapse the runs of whitespace left behind by the substitution.
    query = " ".join(sanitised.split())
    results = bm25.search(query)
    return results

In [None]:
# Step 8: Evaluating the Retrieval System
# Hand-written evaluation set: each entry pairs a query string with the list of
# docnos considered relevant for it.
# NOTE(review): the first two queries do not obviously match the titles of
# crawled_0 / crawled_1 shown in the Step 4 output — confirm the relevance labels.
test_queries = [
    {"query": "deep learning in medical imaging", "relevant_docs": ["crawled_0"]},
    {"query": "quantum computing algorithms", "relevant_docs": ["crawled_1"]},
    {"query": "graph theory", "relevant_docs": ["crawled_0"]},
    {"query": "carbon nanotubes simulation", "relevant_docs": ["crawled_2"]},
]

def evaluate(test_queries, search_fn=None):
    """Compute macro-averaged precision, recall and F-measure for a query set.

    For every query the retrieved docnos are compared with the labelled
    relevant docnos; the per-query scores are then averaged.

    Args:
        test_queries: Iterable of dicts with a ``"query"`` string and a
            ``"relevant_docs"`` list of docnos.
        search_fn: Optional callable taking a query string and returning an
            object whose ``['docno']`` supports ``.tolist()``. Defaults to the
            notebook's ``search`` function, so existing calls are unaffected.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f_measure"``.
        All three are ``0.0`` when ``test_queries`` is empty.
    """
    run_query = search if search_fn is None else search_fn

    def _mean(values):
        # An empty evaluation set would otherwise divide by zero.
        return sum(values) / len(values) if values else 0.0

    precision_scores = []
    recall_scores = []
    f_measure_scores = []

    for test_query in test_queries:
        results = run_query(test_query["query"])
        # Sets keep duplicate docnos from skewing either denominator.
        retrieved_docs = set(results['docno'].tolist())
        relevant_docs = set(test_query["relevant_docs"])

        true_positives = len(retrieved_docs & relevant_docs)
        precision = true_positives / len(retrieved_docs) if retrieved_docs else 0
        recall = true_positives / len(relevant_docs) if relevant_docs else 0
        if precision + recall > 0:
            f_measure = 2 * precision * recall / (precision + recall)
        else:
            f_measure = 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f_measure_scores.append(f_measure)

    return {
        "precision": _mean(precision_scores),
        "recall": _mean(recall_scores),
        "f_measure": _mean(f_measure_scores),
    }

# Score the retriever on the hand-labelled queries and show the averaged metrics.
evaluation_results = evaluate(test_queries)
print("Evaluation results:", evaluation_results, sep="\n")

Evaluation results:
{'precision': 0.75, 'recall': 0.75, 'f_measure': 0.75}
