# Group 16 Information Retrieval Search Engine Project

## 0 Load Package

In [None]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
!pip install openai



In [None]:
# Standard libraries
import string
import json
from collections import Counter, defaultdict

# Data manipulation
import pandas as pd
import numpy as np

# Natural Language Processing (NLP)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Sentence Embedding and Similarity
import torch
from sentence_transformers import SentenceTransformer, util

# Initialize Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# BM25 Algorithm
from rank_bm25 import BM25Okapi

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 1 Text Acquisition

Here, we focus only on the subcategory `cs.CV` (Computer Vision and Pattern Recognition), which includes topics such as image processing, computer vision, pattern recognition, and scene understanding. This subcategory contains approximately 150,000 papers. Compared to the previous large-scale dataset with millions of entries, it is more manageable in terms of computational cost and processing time.

In [None]:
import kagglehub

# Fetch the newest published version of the arXiv metadata dataset from Kaggle.
# `path` is the local directory kagglehub reports for the downloaded files.
path = kagglehub.dataset_download("Cornell-University/arxiv")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv?dataset_version_number=226...


100%|██████████| 1.42G/1.42G [00:19<00:00, 78.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/226


In [None]:
# Move the downloaded dataset folder (the `path` returned by kagglehub above) into /content
# so that the rest of the notebook can read it from a short, fixed location.
%mv $path /content/

In [None]:
def extract_cs_papers(file_path, output_path="1_cs_cv_subcategories.csv", target_categories=('cs.CV',)):
    """Collect arXiv records tagged with a target category and save them as CSV.

    Args:
        file_path: Path to a JSON Lines file (one JSON object per line) holding
            records with `id`, `title`, `abstract`, `authors` and a
            whitespace-separated `categories` string.
        output_path: Destination CSV file.
        target_categories: Category tag, or iterable of tags; a record is kept
            when at least one of its categories is in this collection.

    Returns:
        A DataFrame with the columns `id`, `title`, `abstract`, `authors`
        (also written to `output_path`). The columns are present even when no
        record matches.
    """
    if isinstance(target_categories, str):
        target_categories = (target_categories,)
    wanted = frozenset(target_categories)  # built once, not once per input line

    records = []

    # Read explicitly as UTF-8 so the result does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines instead of crashing in json.loads

            record = json.loads(line)
            categories_list = (record.get('categories') or '').split()
            if wanted.isdisjoint(categories_list):
                continue

            records.append({
                'id': str(record.get('id', '')),
                'title': record.get('title', ''),
                'abstract': record.get('abstract', ''),
                'authors': record.get('authors', ''),
            })

    df = pd.DataFrame(records, columns=['id', 'title', 'abstract', 'authors'])
    print(f"Total number of CS papers (CV): {len(df)}")
    display(df.head())
    df.to_csv(output_path, index=False)
    print(f"DataFrame saved as {output_path}")
    return df

import os

# The dataset folder was moved into /content under its original name, which is the
# dataset version number. Derive it from `path` instead of hard-coding "226" so the
# cell keeps working when Kaggle publishes a newer version.
dataset_dir = os.path.join("/content", os.path.basename(os.path.normpath(path)))
file_path = os.path.join(dataset_dir, "arxiv-metadata-oai-snapshot.json")
df = extract_cs_papers(file_path)

Total number of CS papers (CV): 151541


Unnamed: 0,id,title,abstract,authors
0,704.1267,Text Line Segmentation of Historical Documents...,There is a huge amount of historical documen...,"Laurence Likforman-Sulem, Abderrazak Zahour, B..."
1,704.3635,Rough Sets Computations to Impute Missing Data,Many techniques for handling missing data ha...,Fulufhelo Vincent Nelwamondo and Tshilidzi Mar...
2,705.0199,The Parameter-Less Self-Organizing Map algorithm,The Parameter-Less Self-Organizing Map (PLSO...,"Erik Berglund, Joaquin Sitte"
3,705.0214,Riemannian level-set methods for tensor-valued...,We present a novel approach for the derivati...,"Mourad Zerai, Maher Moakher"
4,705.0449,Multiresolution Approximation of Polygonal Cur...,We propose a new algorithm to the problem of...,"Pierre-Fran\c{c}ois Marteau (VALORIA), Gilbas ..."


DataFrame saved as 1_cs_cv_subcategories.csv


## 2 Text Transformation and Index Creation

Here, we focus on processing the `1_cs_cv_subcategories.csv` file. The main operations are lowercasing; removal of punctuation, stopwords, frequent words, and rare words; and lemmatization. The processed results are saved in the intermediate file `2_lemmatized_data.csv`. Afterward, tokenization and inverted indexing are performed, and the resulting index is saved in the `3_inverted_index.json` file.


### 2.1 Lower Casing

In [None]:
# Add lower-cased copies of the text columns; the original columns stay as they are
for source_col in ('title', 'abstract'):
    df[f'{source_col}_lower'] = df[source_col].str.lower()

# print(df[['title_lower', 'abstract_lower']].head())

### 2.2 Removal of Punctuations

In [None]:
# Remove punctuation from each column
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Mapping a character to None tells str.translate to drop it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Punctuation-free versions of the lower-cased title and abstract
for field in ('title', 'abstract'):
    df[f'{field}_wo_punct'] = df[f'{field}_lower'].apply(remove_punctuation)

# print(df[['title_wo_punct', 'abstract_wo_punct']].head())

### 2.3 Removal of stopwords

In [None]:
from nltk.corpus import stopwords

# Preview NLTK's English stop-word list as one comma-separated string
english_stopword_list = stopwords.words('english')
", ".join(english_stopword_list)

"a, about, above, after, again, against, ain, all, am, an, and, any, are, aren, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can, couldn, couldn't, d, did, didn, didn't, do, does, doesn, doesn't, doing, don, don't, down, during, each, few, for, from, further, had, hadn, hadn't, has, hasn, hasn't, have, haven, haven't, having, he, he'd, he'll, her, here, hers, herself, he's, him, himself, his, how, i, i'd, if, i'll, i'm, in, into, is, isn, isn't, it, it'd, it'll, it's, its, itself, i've, just, ll, m, ma, me, mightn, mightn't, more, most, mustn, mustn't, my, myself, needn, needn't, no, nor, not, now, o, of, off, on, once, only, or, other, our, ours, ourselves, out, over, own, re, s, same, shan, shan't, she, she'd, she'll, she's, should, shouldn, shouldn't, should've, so, some, such, t, than, that, that'll, the, their, theirs, them, themselves, then, there, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, 

In [None]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is listed in STOPWORDS.

    The input is converted with `str()` first; surviving tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Stop-word-free versions of the punctuation-free title and abstract
for field in ('title', 'abstract'):
    df[f'{field}_wo_stop'] = df[f'{field}_wo_punct'].apply(remove_stopwords)

# print(df[['title_wo_stop', 'abstract_wo_stop']].head())

### 2.4 Removal of Frequent Words

In [None]:
from collections import Counter

# Corpus-wide term frequencies over the stop-word-filtered titles and abstracts.
# Documents are visited row by row (title first, then abstract) so that terms enter
# the counter in the same order as before; tie order in most_common() depends on it.
cnt = Counter()
for title_text, abstract_text in zip(df['title_wo_stop'], df['abstract_wo_stop']):
    cnt.update(title_text.split())
    cnt.update(abstract_text.split())

print(cnt.most_common(10))

[('image', 153585), ('model', 134192), ('learning', 133541), ('data', 122965), ('models', 117645), ('images', 116089), ('methods', 103113), ('method', 102106), ('performance', 91803), ('using', 84790)]


In [None]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])

def remove_freqwords(text):
    """Strip every token contained in FREQWORDS from `text`.

    The input is converted with `str()` and split on whitespace; the remaining
    tokens are joined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(filter(lambda token: token not in FREQWORDS, tokens))

# Remove the most frequent corpus terms from both text fields
for field in ("title", "abstract"):
    df[f"{field}_wo_stopfreq"] = df[f"{field}_wo_stop"].apply(remove_freqwords)

# print(df[['title_wo_stopfreq', 'abstract_wo_stopfreq']].head())

### 2.5 Removal of Rare Words

In [None]:
n_rare_words = 10

# most_common() is ordered by descending count, so walking it backwards from the end
# yields the n_rare_words least frequent terms.
RAREWORDS = {word for word, _count in cnt.most_common()[:-n_rare_words - 1:-1]}
def remove_rarewords(text):
    """Strip every token contained in RAREWORDS from `text`.

    The input is converted with `str()` and split on whitespace; the remaining
    tokens are joined with single spaces.
    """
    survivors = (token for token in str(text).split() if not token in RAREWORDS)
    return " ".join(survivors)

# Remove the rarest corpus terms from both text fields
for field in ("title", "abstract"):
    df[f"{field}_wo_stopfreqrare"] = df[f"{field}_wo_stopfreq"].apply(remove_rarewords)

# print(df[['title_wo_stopfreqrare', 'abstract_wo_stopfreqrare']].head())

### 2.6 Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and rejoin them with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatized versions of the fully filtered title and abstract
for field in ("title", "abstract"):
    df[f"{field}_lemmatized"] = df[f"{field}_wo_stopfreqrare"].apply(lemmatize_words)

# print(df[['title_lemmatized', 'abstract_lemmatized']].head())

In [None]:
# Persist the processed text together with the original fields needed for display
export_columns = ['title_lemmatized', 'abstract_lemmatized', 'title', 'abstract', 'authors', 'id']
df[export_columns].to_csv('2_lemmatized_data.csv', index=False)

print("Lemmatization complete! Intermediate file saved as '2_lemmatized_data.csv'")

Lemmatization complete! Intermediate file saved as '2_lemmatized_data.csv'


### 2.7 Tokenization

In [None]:
# Load intermediate file.
# arXiv ids are identifiers, not numbers, so read them as text. With dtype inference,
# numeric-looking ids such as "0704.1267" or "2403.14800" are parsed as floats (dropping
# leading and trailing zeros) while the remaining ids stay strings, which is what
# triggers pandas' mixed-type DtypeWarning for this column.
df = pd.read_csv('/content/2_lemmatized_data.csv', dtype={'id': str})

  df = pd.read_csv('/content/2_lemmatized_data.csv')


In [None]:
def preprocess(text):
    """Split `text` into a list of tokens with NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

def batch_process(df, batch_size=1000):
    """Tokenize every document and build an inverted index over the tokens.

    Args:
        df: DataFrame with the columns `id`, `title_lemmatized` and
            `abstract_lemmatized`.
        batch_size: Number of rows handled between two progress messages.
            Must be a positive integer.

    Returns:
        A tuple `(processed_docs, inverted_index)`:
        - `processed_docs`: list of `{'doc_id': str, 'tokens': list}` in row order;
        - `inverted_index`: `defaultdict(set)` mapping each token to the set of
          document ids (as strings) whose title or abstract contains it.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def _as_text(value):
        # pandas reads empty CSV fields back as NaN; map missing values to an empty
        # string so the literal token "nan" never ends up in the index.
        return "" if pd.isna(value) else str(value)

    inverted_index = defaultdict(set)
    processed_docs = []

    total_docs = len(df)
    print(f"Total documents: {total_docs}")

    for start in range(0, total_docs, batch_size):
        end = min(start + batch_size, total_docs)
        batch = df.iloc[start:end]

        # Iterating the three needed columns directly avoids building a Series per row.
        rows = zip(batch['id'], batch['title_lemmatized'], batch['abstract_lemmatized'])
        for raw_id, title, abstract in rows:
            # Normalise ids to text so the index never mixes floats and strings.
            doc_id = str(raw_id)

            content = f"{_as_text(title)} {_as_text(abstract)}"
            tokens = preprocess(content)

            processed_docs.append({'doc_id': doc_id, 'tokens': tokens})

            for token in set(tokens):
                inverted_index[token].add(doc_id)

        print(f"Processed {end}/{total_docs} documents")

    return processed_docs, inverted_index

In [None]:
processed_docs, inverted_index = batch_process(df, batch_size=1000)

Total documents: 151541
Processed 1000/151541 documents
Processed 2000/151541 documents
Processed 3000/151541 documents
Processed 4000/151541 documents
Processed 5000/151541 documents
Processed 6000/151541 documents
Processed 7000/151541 documents
Processed 8000/151541 documents
Processed 9000/151541 documents
Processed 10000/151541 documents
Processed 11000/151541 documents
Processed 12000/151541 documents
Processed 13000/151541 documents
Processed 14000/151541 documents
Processed 15000/151541 documents
Processed 16000/151541 documents
Processed 17000/151541 documents
Processed 18000/151541 documents
Processed 19000/151541 documents
Processed 20000/151541 documents
Processed 21000/151541 documents
Processed 22000/151541 documents
Processed 23000/151541 documents
Processed 24000/151541 documents
Processed 25000/151541 documents
Processed 26000/151541 documents
Processed 27000/151541 documents
Processed 28000/151541 documents
Processed 29000/151541 documents
Processed 30000/151541 docum

In [None]:
# Sets are not JSON-serialisable and have no stable order, so each posting list is
# written as a sorted list (sorted by string form), making the file reproducible.
# The file is opened as UTF-8 explicitly because ensure_ascii=False writes non-ASCII
# tokens verbatim.
with open('3_inverted_index.json', 'w', encoding='utf-8') as f:
    json.dump(
        {token: sorted(posting, key=str) for token, posting in inverted_index.items()},
        f,
        ensure_ascii=False,
        indent=4,
    )

print("Processing and export complete!")

Processing and export complete!


In [None]:
# Three parallel lists in document order: token lists, string ids, space-joined token text
tokenized_corpus, doc_ids, doc_texts = [], [], []
for entry in processed_docs:
    entry_tokens = entry['tokens']
    tokenized_corpus.append(entry_tokens)
    doc_ids.append(str(entry['doc_id']))
    doc_texts.append(" ".join(entry_tokens))

## 3 Retrieval and Ranking

Here, we first run BM25 alone for retrieval and ranking, then use a hybrid method that combines it with MiniLM to retrieve results again. The resulting dataframe shows how the ranking changes between the two methods. Try adjusting `alpha` and the test queries to understand the differences. Code for printing the retrieval time will be added here later.

### 3.1 BM25

In [None]:
# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Test Query
query = "Applying transfer learning to image analysis"
tokenized_query = query.lower().split()
scores = bm25.get_scores(tokenized_query)

# Rank all (id, score) pairs by score, best first, and keep the 10 best
ranked_pairs = sorted(zip(doc_ids, scores), key=lambda pair: pair[1], reverse=True)
top_n = ranked_pairs[:10]
results = pd.DataFrame(top_n, columns=["id", "bm25_score"])
results["bm25_rank"] = range(1, len(results) + 1)

# Both sides of the join must use string ids
df["id"] = df["id"].astype(str)
results["id"] = results["id"].astype(str)

# Attach the original paper metadata to each hit and order by rank
bm25_final_df = pd.merge(results, df, on="id", how="left").sort_values("bm25_rank")

# Display the result
display_columns = ["bm25_rank", "bm25_score", "title", "abstract", 'id']
bm25_final_df = bm25_final_df[display_columns]
display(bm25_final_df)

Unnamed: 0,bm25_rank,bm25_score,title,abstract,id
0,1,17.5367,Persistence Diagrams with Linear Machine Learn...,Persistence diagrams have been widely recogn...,1706.10082
1,2,14.947036,Knowledge Transfer for Melanoma Screening with...,Knowledge transfer impacts the performance o...,1703.07479
2,3,12.62425,Deep Active Learning: A Reality Check,We conduct a comprehensive evaluation of sta...,2403.148
3,4,12.462598,Character-Aware Models Improve Visual Text Ren...,Current image generation models struggle to ...,2212.10562
4,5,11.992452,Brief Introduction to Contrastive Learning Pre...,To improve performance in visual feature rep...,2210.03163
5,6,11.982047,Shortcut Learning in Deep Neural Networks,Deep learning has triggered the current rise...,2004.0778
6,7,11.440466,Transferability analysis of data-driven additi...,Data-driven research in Additive Manufacturi...,2309.06286
7,8,11.145522,Osteosarcoma Tumor Detection using Transfer Le...,The field of clinical image analysis has bee...,2305.0966
8,9,11.101455,Breast Cancer Diagnosis in Two-View Mammograph...,Some recent studies have described deep conv...,2110.01606
9,10,11.086021,"Deep Learning Meets OBIA: Tasks, Challenges, S...",Deep learning has gained significant attenti...,2408.01607


### 3.2 Hybrid Model

In [None]:
def hybrid_search(query, doc_ids, doc_texts, bm25, alpha, top_k, encoder=None):
    """Rank documents by a weighted mix of BM25 and embedding cosine similarity.

    BM25 selects `max(top_k * 3, 100)` candidates; only those are embedded and
    re-scored. Both score lists are min-max normalised over the candidates and
    combined as `alpha * bm25 + (1 - alpha) * cosine`.

    Args:
        query: Raw query string. It is lower-cased and whitespace-split for BM25
            and passed unchanged to the encoder.
            NOTE(review): no other preprocessing is applied to the query here;
            confirm this matches how the BM25 corpus was tokenized.
        doc_ids: Document ids, aligned by position with `doc_texts` and with the
            corpus `bm25` was built from.
        doc_texts: Text of each document, used for the embedding step.
        bm25: Object exposing `get_scores(tokens)` (e.g. `BM25Okapi`).
        alpha: Weight of the normalised BM25 score; cosine gets `1 - alpha`.
        top_k: Number of results to return.
        encoder: Object exposing `encode(texts, convert_to_tensor=True)`.
            Defaults to the notebook-level `model`, so the SentenceTransformer
            is not reloaded on every query.

    Returns:
        List of `(doc_id, bm25_score, hybrid_score)` tuples, best hybrid score
        first, at most `top_k` long. Empty when there are no documents.
    """
    import heapq

    if encoder is None:
        encoder = model  # reuse the already loaded notebook-level model

    # Step 1: BM25 scoring
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_limit = max(top_k * 3, 100)

    # Select candidates by position instead of by id: this avoids a linear
    # doc_ids.index() scan per candidate and stays correct if an id occurs twice.
    n_docs = min(len(doc_ids), len(bm25_scores))
    candidates = heapq.nlargest(bm25_limit, range(n_docs), key=lambda i: bm25_scores[i])
    if not candidates:
        return []  # min()/max() below are undefined for an empty candidate set

    # Step 2: MiniLM encoding
    candidate_texts = [doc_texts[i] for i in candidates]
    query_embedding = encoder.encode(query, convert_to_tensor=True)
    doc_embeddings = encoder.encode(candidate_texts, convert_to_tensor=True)
    # Embeddings live on the encoder's device; bring the similarities to the CPU so
    # they can be combined with the CPU tensor of BM25 scores.
    cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0].detach().cpu()

    # Step 3: Normalize & Combine
    def min_max_norm(tensor):
        # The small epsilon keeps the division defined when all scores are equal.
        return (tensor - tensor.min()) / (tensor.max() - tensor.min() + 1e-8)

    bm25_tensor = torch.tensor([bm25_scores[i] for i in candidates])
    bm25_norm = min_max_norm(bm25_tensor)
    cosine_norm = min_max_norm(cosine_scores)

    hybrid_scores = alpha * bm25_norm + (1 - alpha) * cosine_norm

    full_ranking = [
        (doc_ids[i], float(bm25_scores[i]), float(hybrid_score))
        for i, hybrid_score in zip(candidates, hybrid_scores.tolist())
    ]

    final_ranking = sorted(full_ranking, key=lambda item: item[2], reverse=True)[:top_k]

    return final_ranking

In [None]:
# Test Query
query = "Transfer learning approaches for image processing"
alpha = 0.3  # weight of the BM25 score; the MiniLM similarity gets 1 - alpha
top_k = 100

# Get hybrid search results
hybrid_results = hybrid_search(query, doc_ids, doc_texts, bm25, alpha, top_k)

# Create DataFrame with id, bm25_score, and hybrid_score
results = pd.DataFrame(hybrid_results, columns=["id", "bm25_score", "hybrid_score"])

# BM25 rank over the WHOLE corpus: 1 + number of documents with a strictly higher BM25
# score (ties share the best rank). Ranking only the returned rows would renumber the
# top_k survivors 1..top_k and understate how far the hybrid score moved a document.
corpus_bm25_scores = np.asarray(bm25.get_scores(query.lower().split()))
results["bm25_rank"] = [
    int((corpus_bm25_scores > score).sum()) + 1 for score in results["bm25_score"]
]

# Add Hybrid rank (by hybrid_score, descending order)
results = results.sort_values("hybrid_score", ascending=False).reset_index(drop=True)
results["hybrid_rank"] = results.index + 1

# Ensure ID is a string for consistent merging
df["id"] = df["id"].astype(str)
results["id"] = results["id"].astype(str)

# Merge hybrid results with original dataset
hybrid_final_df = pd.merge(results, df, on="id", how="left")

# Sort by hybrid_rank for final output
hybrid_final_df = hybrid_final_df.sort_values("hybrid_rank").reset_index(drop=True)

# Display the result
hybrid_final_df = hybrid_final_df[["hybrid_rank", "hybrid_score", 'bm25_rank', "title", "abstract", 'id']]
display(hybrid_final_df.head(10))

Unnamed: 0,hybrid_rank,hybrid_score,bm25_rank,title,abstract,id
0,1,0.874031,1,Knowledge Transfer for Melanoma Screening with...,Knowledge transfer impacts the performance o...,1703.07479
1,2,0.740188,7,Image Style Transfer: from Artistic to Photore...,The rapid advancement of deep learning has s...,2203.06328
2,3,0.726323,20,Filter Style Transfer between Photos,"Over the past few years, image-to-image styl...",2007.07925
3,4,0.715977,28,Understanding the Mechanisms of Deep Transfer ...,The ability to automatically learn task spec...,1704.0604
4,5,0.700888,98,Domain Adaptive Transfer Learning with Special...,Transfer learning is a widely used method to...,1811.07056
5,6,0.696288,13,VisTabNet: Adapting Vision Transformers for Ta...,Although deep learning models have had great...,2501.00057
6,7,0.695801,15,Unlabeled Data Deployment for Classification o...,Convolutional neural networks (CNNs) are ext...,2002.03321
7,8,0.692593,26,Style Transfer: From Stitching to Neural Networks,This article compares two style transfer met...,2409.00606
8,9,0.688566,67,Lifting Layers: Analysis and Applications,The great advances of learning-based approac...,1803.0866
9,10,0.68854,100,Supervised Transfer Learning at Scale for Medi...,Transfer learning is a standard technique to...,2101.05913
