In [1]:
!pip install nltk rank_bm25



In [2]:
# Import libraries
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
!wget https://zenodo.org/records/11212056/files/CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1
!unzip CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1
!mkdir dataset
!mv *.txt dataset/

--2025-09-07 12:15:50--  https://zenodo.org/records/11212056/files/CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8166779 (7.8M) [application/octet-stream]
Saving to: ‘CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1.1’


2025-09-07 12:15:51 (7.80 MB/s) - ‘CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1.1’ saved [8166779/8166779]

Archive:  CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1
  inflating: S_RES_0001_1946_EN_GOLD.txt  
  inflating: S_RES_0002_1946_EN_GOLD.txt  
  inflating: S_RES_0003_1946_EN_GOLD.txt  
  inflating: S_RES_0004_1946_EN_GOLD.txt  
  inflating: S_RES_0005_1946_EN_GOLD.txt  
  inflating: S_RES_0006_1946_EN_GOLD.txt  
  inflating: S_RES_0007_1946_EN_GOLD.txt  
  inflating: S_RES_0008_1946_EN_GOLD.txt  
  inflating: S_RES_0009_1946_EN_GOLD.txt  
  inflating: S_RE

In [4]:
# Get all text files from dataset directory
dataset_path = 'dataset/'
txt_files = [f for f in os.listdir(dataset_path) if f.endswith('.txt')]
print(f"\nFound {len(txt_files)} text files")

# Prepare stopwords (excluding critical legal terms)
stop_words = set(stopwords.words('english')) - {'not', 'shall', 'no', 'without'}

# Storage for processed documents
processed_docs = []
doc_filenames = []


Found 2722 text files


In [5]:
# Process each document
for filename in txt_files:
    filepath = os.path.join(dataset_path, filename)

    # Read file
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()

    # Boilerplate removal - remove ALL matching patterns from full text
    text = re.sub(r'S/RES/\d+|Distr.: General|Page \d+ of \d+|© \d+|UN DOCUMENT', '', text, flags=re.IGNORECASE)

    # Cleaning - collapse whitespace and convert to lowercase
    text = re.sub(r'\n\s*\n+', ' ', text).strip()
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Filter tokens: only alphabetic, remove stopwords
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Store processed document
    processed_docs.append(filtered_tokens)
    doc_filenames.append(filename)

    print(f"Processed {filename}: {len(filtered_tokens)} tokens")

Processed S_RES_0895_1994_EN_GOLD.txt: 126 tokens
Processed S_RES_0307_1971_EN_GOLD.txt: 207 tokens
Processed S_RES_2397_2017_EN.txt: 2536 tokens
Processed S_RES_0696_1991_EN_GOLD.txt: 171 tokens
Processed S_RES_1086_1996_EN.txt: 359 tokens
Processed S_RES_2250_2015_EN.txt: 1143 tokens
Processed S_RES_0999_1995_EN.txt: 614 tokens
Processed S_RES_2382_2017_EN.txt: 1772 tokens
Processed S_RES_1206_1998_EN.txt: 369 tokens
Processed S_RES_1260_1999_EN.txt: 635 tokens
Processed S_RES_1365_2001_EN.txt: 371 tokens
Processed S_RES_1834_2008_EN.txt: 703 tokens
Processed S_RES_1829_2008_EN.txt: 525 tokens
Processed S_RES_2100_2013_EN.txt: 2732 tokens
Processed S_RES_1458_2003_EN.txt: 227 tokens
Processed S_RES_2562_2021_EN.txt: 489 tokens
Processed S_RES_2244_2015_EN.txt: 1424 tokens
Processed S_RES_2174_2014_EN.txt: 657 tokens
Processed S_RES_2057_2012_EN.txt: 1797 tokens
Processed S_RES_1789_2007_EN.txt: 597 tokens
Processed S_RES_1993_2011_EN.txt: 388 tokens
Processed S_RES_1239_1999_EN.txt: 

In [6]:
# Build BM25 index
print(f"\nBuilding BM25 index with {len(processed_docs)} documents...")
bm25 = BM25Okapi(processed_docs)


Building BM25 index with 2722 documents...


In [7]:
# Define queries
queries = ["peacekeeping force", "sanctions against iran", "ceasefire agreement"]

# Process and search each query
print("\n" + "="*60)
print("BM25 SEARCH RESULTS")
print("="*60)

for query_text in queries:
    print(f"\nQuery: '{query_text}'")
    print("-" * 40)

    # Tokenize and preprocess query same way as documents
    query_tokens = word_tokenize(query_text.lower())
    query_tokens = [word for word in query_tokens if word.isalpha() and word not in stop_words]

    # Get BM25 scores for all documents
    scores = bm25.get_scores(query_tokens)

    # Get top 3 results
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]

    for rank, idx in enumerate(top_indices, 1):
        filename = doc_filenames[idx]
        score = scores[idx]
        print(f"{rank}. {filename} (Score: {score:.4f})")

print("\n" + "="*60)
print("Search completed successfully!")


BM25 SEARCH RESULTS

Query: 'peacekeeping force'
----------------------------------------
1. S_RES_1327_2000_EN.txt (Score: 4.3339)
2. S_RES_1353_2001_EN.txt (Score: 4.2923)
3. S_RES_2378_2017_EN.txt (Score: 4.2598)

Query: 'sanctions against iran'
----------------------------------------
1. S_RES_2231_2015_EN.txt (Score: 13.0834)
2. S_RES_1929_2010_EN.txt (Score: 11.1259)
3. S_RES_1803_2008_EN.txt (Score: 11.0215)

Query: 'ceasefire agreement'
----------------------------------------
1. S_RES_2694_2023_EN.txt (Score: 5.8207)
2. S_RES_1279_1999_EN.txt (Score: 5.7547)
3. S_RES_1291_2000_EN.txt (Score: 5.7349)

Search completed successfully!
