In [1]:
!pip -q install nltk rank_bm25

In [2]:
import os, re, glob, json, nltk, random
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi

# NLTK data packages needed by the tokenizers, stop-word list and lemmatizer
# imported above.
NLTK_RESOURCES = ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4")

# nltk.download() signals failure through its boolean return value; with
# quiet=True nothing is printed, so check it explicitly instead of ignoring it.
_failed_downloads = [res for res in NLTK_RESOURCES if not nltk.download(res, quiet=True)]
if _failed_downloads:
    print(f"WARNING: could not download NLTK resources: {_failed_downloads}")

True

In [3]:
# Remove any `dataset` directory left over from a previous run.
!rm -rf dataset

In [4]:
!wget -q -O CR.zip https://zenodo.org/records/11212056/files/CR-UNSC_2024-05-19_EN_TXT_BEST.zip?download=1
!unzip -q CR.zip -d dataset/ && rm CR.zip
TXT_FILES = glob.glob("dataset/*.txt")

In [5]:
# English stop words, except for the four words listed below, which are
# deliberately kept so they survive preprocessing.
STOP = {
    word
    for word in stopwords.words("english")
    if word not in ("not", "shall", "no", "without")
}
# Single lemmatizer instance shared by every preprocess() call.
LEMMA = WordNetLemmatizer()

In [6]:
def remove_uscouncil_templates(text: str) -> str:
    """Strip recurring boilerplate from the raw text of a resolution.

    Three patterns are removed, in this order:

    1. The document header: everything from an ``S/RES/<number>`` symbol up to
       the end of the line containing ``Distr.: General``.
    2. Page footers of the form ``Page <n> of <m> ...`` (to end of line).
    3. Copyright lines: a copyright sign followed, on the same line, by a
       year 20xx (to end of line).

    All patterns are case-insensitive and only match text that is terminated
    by a newline.

    Args:
        text: Raw document text.

    Returns:
        ``text`` with every match of the patterns above deleted.
    """
    # Header block. re.S lets the lazy `.*?` cross line breaks, so a match
    # starts at an "S/RES/<n>" and runs to the end of the line holding the
    # nearest following "Distr.: General" - everything in between is dropped.
    text = re.sub(r"S/RES/\d+.*?Distr\.: General.*?\n", "", text, flags=re.S | re.I)
    # Page footers.
    text = re.sub(r"Page \d+ of \d+.*?\n", "", text, flags=re.I)
    # Copyright line. The sign is spelled as the escape U+00A9 so the pattern
    # cannot be corrupted by a wrong file encoding; the previous literal was a
    # mis-encoded two-character sequence that never matched a real sign.
    text = re.sub(r"\u00a9.*?20\d{2}.*?\n", "", text, flags=re.I)
    return text


def preprocess(text: str) -> list[str]:
    """Turn raw text into a flat list of lemmas: split -> tokenise -> clean -> lemmatise.

    Boilerplate is stripped first, then the text is split into sentences.
    Each sentence is lower-cased, whitespace-normalised and word-tokenised;
    only purely alphabetic tokens that are not in ``STOP`` are kept, and
    those are lemmatised with ``LEMMA``.
    """
    body = remove_uscouncil_templates(text)

    def _lemmas(sentence: str):
        # Collapse whitespace runs and lower-case before tokenising.
        normalised = re.sub(r"\s+", " ", sentence.strip().lower())
        for word in word_tokenize(normalised):
            if word.isalpha() and word not in STOP:
                yield LEMMA.lemmatize(word)

    return [lemma for sentence in sent_tokenize(body) for lemma in _lemmas(sentence)]

In [7]:
# Tokenise every file once; processed_docs[i] and doc_ids[i] describe the same file.
if not TXT_FILES:
    # Fail early with a clear message instead of an opaque error from the indexer.
    raise RuntimeError("TXT_FILES is empty - run the download cell before building the index")

processed_docs, doc_ids = [], []
for fp in TXT_FILES:
    with open(fp, encoding="utf-8") as f:
        processed_docs.append(preprocess(f.read()))
    doc_ids.append(os.path.basename(fp))

bm25 = BM25Okapi(processed_docs)
print("🔍 BM25 index built")

🔍 BM25 index built


In [8]:
# Sample queries run against the index below.
QUERIES = [
    "peacekeeping force",
    "sanctions against iran",
    "ceasefire agreement",
]

def search(query: str, k: int = 3):
    """Return the ``k`` best BM25 matches for ``query``.

    The query goes through the same ``preprocess`` pipeline as the indexed
    documents, so both sides use the same vocabulary.

    Args:
        query: Free-text query.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(doc_id, score)`` tuples, best score first. Empty when
        ``k`` is not positive or when preprocessing leaves no query terms
        (e.g. the query holds only stop words or non-alphabetic tokens).
    """
    if k <= 0:
        return []
    qtok = preprocess(query)
    if not qtok:
        # No terms means nothing to rank on; previously the first k documents
        # were returned anyway, which looked like real hits.
        return []
    scores = bm25.get_scores(qtok)
    top = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
    return [(doc_ids[i], s) for i, s in top]

# Run every sample query and list its top hits between two ruler lines.
ruler = "=" * 60
print("\n" + ruler)
for query in QUERIES:
    print(f"\nQuery: {query!r}")
    hits = search(query)
    for position, (name, score) in enumerate(hits, start=1):
        print(f"{position}. {name}  (score {score:.4f})")
print(ruler)



Query: 'peacekeeping force'
1. S_RES_1327_2000_EN.txt  (score 6.6788)
2. S_RES_1311_2000_EN.txt  (score 6.6335)
3. S_RES_1096_1997_EN.txt  (score 6.6281)

Query: 'sanctions against iran'
1. S_RES_2231_2015_EN.txt  (score 13.0989)
2. S_RES_1929_2010_EN.txt  (score 11.1211)
3. S_RES_1803_2008_EN.txt  (score 11.0169)

Query: 'ceasefire agreement'
1. S_RES_2694_2023_EN.txt  (score 5.2597)
2. S_RES_1279_1999_EN.txt  (score 5.1727)
3. S_RES_1291_2000_EN.txt  (score 5.1510)


In [15]:
def compare(idx: int, max_chars=None):
    """Print one document before and after preprocessing.

    Output, in order: the file name, the boilerplate-stripped text with
    whitespace collapsed, a separator line, and the space-joined lemmas
    returned by ``preprocess``.

    Args:
        idx: Position of the file in ``TXT_FILES``.
        max_chars: If given, print at most this many leading characters of
            each of the two views; the default ``None`` prints them in full.
    """
    fp = TXT_FILES[idx]
    with open(fp, encoding="utf-8") as f:
        raw_text = f.read()

    clean = remove_uscouncil_templates(raw_text)
    clean = re.sub(r"\s+", " ", clean.strip())

    # lemmatised version
    lemmas = " ".join(preprocess(raw_text))

    print(f"name: {os.path.basename(fp)}")
    # Slicing with [:None] yields the whole string, so the default is unchanged.
    print(clean[:max_chars])
    # Em dash written as an escape; the previous literal was mis-encoded and
    # printed three junk characters per repetition.
    print("\u2014" * 80)
    print(lemmas[:max_chars])

# Show the first entry of TXT_FILES, cleaned text vs. lemmas.
compare(idx=0)

name: S_RES_1441_2002_EN.txt
United Nations 8 November 2002 Resolution 1441 (2002) Adopted by the Security Council at its 4644th meeting, on 8 November 2002 The Security Council, Recalling all its previous relevant resolutions, in particular its resolutions 661 (1990) of 6 August 1990, 678 (1990) of 29 November 1990, 686 (1991) of 2 March 1991, 687 (1991) of 3 April 1991, 688 (1991) of 5 April 1991, 707 (1991) of 15 August 1991, 715 (1991) of 11 October 1991, 986 (1995) of 14 April 1995, and 1284 (1999) of 17 December 1999, and all the relevant statements of its President, Recalling also its resolution 1382 (2001) of 29 November 2001 and its intention to implement it fully, Recognizing the threat Iraq‚Äôs non-compliance with Council resolutions and proliferation of weapons of mass destruction and long-range missiles poses to international peace and security, Recalling that its resolution 678 (1990) authorized Member States to use all necessary means to uphold and implement its resoluti