**Load Dataset**

In [None]:
import pandas as pd

# Load the articles CSV into a DataFrame, decoding the file as Latin-1.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab upload
# location) — confirm it exists before running this notebook elsewhere.
df =pd.read_csv('/content/Articles.csv', encoding='latin1')

In [None]:
# Preview the first rows using the notebook's rich display.
display(df.head())

# Print column dtypes and non-null counts.
df.info()

# Per-column count of missing values; rendered as the cell result because it
# is the last expression in the cell.
df.isnull().sum()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2692 entries, 0 to 2691
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Article   2692 non-null   object
 1   Date      2692 non-null   object
 2   Heading   2692 non-null   object
 3   NewsType  2692 non-null   object
dtypes: object(4)
memory usage: 84.3+ KB


Unnamed: 0,0
Article,0
Date,0
Heading,0
NewsType,0


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


# Download the NLTK data packages for the tokenizer, stopword list and
# WordNet lemmatizer imported above.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# English stopword set and WordNet lemmatizer; preprocess_text reads both as
# module-level globals.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Sanity check on a single word: tokenize it, then lemmatize it as a verb.
word="running"
tokens = word_tokenize(word)
print("Tokens:", tokens)
lemmatized_word = lemmatizer.lemmatize(word, pos=wordnet.VERB)
print("Lemmatized word:", lemmatized_word)
# Sanity check on a sentence: lowercase and tokenize, then drop stopwords and
# any token that is not purely alphanumeric.
string="A quick brown fox jumps over the lazy dog"
tokens=word_tokenize(string.lower())
print("Tokens in lower case:",tokens)
tokens_without_stopwords=[word for word in tokens if word.isalnum() and word not in stop_words]
print("Tokens without stopwords:",tokens_without_stopwords)

Tokens: ['running']
Lemmatized word: run
Tokens in lower case: ['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Tokens without stopwords: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


In [None]:
def preprocess_text(text):
  """Turn raw text into a list of normalized tokens.

  The text is lowercased and split with ``word_tokenize``. Tokens that are
  not alphanumeric, or that appear in the global ``stop_words`` set, are
  discarded; every remaining token is lemmatized as a verb with the global
  ``lemmatizer``.

  Args:
    text: The string to process.

  Returns:
    List of lemmatized tokens, in their original order.
  """
  lowered = text.lower()
  kept = []
  for token in word_tokenize(lowered):
    # Skip punctuation / mixed-symbol tokens.
    if not token.isalnum():
      continue
    # Skip stopwords.
    if token in stop_words:
      continue
    kept.append(lemmatizer.lemmatize(token, pos='v'))
  return kept

# Tokenize every article and store the resulting token lists in a new
# 'preprocessed_text' column, then preview the result.
df['preprocessed_text']=df['Article'].apply(preprocess_text)
display(df.head())

Unnamed: 0,Article,Date,Heading,NewsType,preprocessed_text
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,"[karachi, sindh, government, decide, bring, pu..."
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,"[hong, kong, asian, market, start, 2015, upswi..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,"[hong, kong, hong, kong, share, open, percent,..."
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,"[hong, kong, asian, market, tumble, tuesday, f..."
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,"[new, york, us, oil, price, monday, slip, 50, ..."


In [None]:
!pip install rank-bm25

from rank_bm25 import BM25Okapi

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
def calculate_precision_recall(retrieved_docs, relevant_docs):
    """Compute set-based precision and recall for one result list.

    Args:
        retrieved_docs: Iterable of ``(doc_idx, score)`` pairs; only the
            document indices are used.
        relevant_docs: Document indices judged relevant. Any iterable is
            accepted; it is converted to a set internally.

    Returns:
        Tuple ``(precision, recall)``. A value is ``0.0`` when its
        denominator (number of retrieved, respectively relevant, documents)
        is zero, instead of raising ``ZeroDivisionError``.
    """
    retrieved_set = {doc_idx for doc_idx, _ in retrieved_docs}
    # Coerce so that lists/tuples of relevant ids work with set intersection.
    relevant_set = set(relevant_docs)
    n_hits = len(retrieved_set & relevant_set)

    precision = n_hits / len(retrieved_set) if retrieved_set else 0.0
    recall = n_hits / len(relevant_set) if relevant_set else 0.0
    return precision, recall

# Average Precision
def average_precision(top_docs, relevant_docs):
    """Average precision (AP) of a ranked result list.

    AP is the sum of precision@rank over every rank at which a relevant
    document appears, divided by the total number of relevant documents.
    Relevant documents that were not retrieved therefore contribute a
    precision of 0, so a ranking that misses relevant documents cannot
    score 1.0.

    Args:
        top_docs: Ranked iterable of ``(doc_idx, score)`` pairs, best first.
        relevant_docs: Document indices judged relevant (any iterable).

    Returns:
        AP as a float in ``[0, 1]``; ``0.0`` when there are no relevant
        documents or none of them was retrieved.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0

    precision_sum = 0.0
    hits = 0
    for rank, (doc_idx, _score) in enumerate(top_docs, start=1):
        if doc_idx in relevant_set:
            hits += 1
            # Precision at this rank: relevant documents seen so far / rank.
            precision_sum += hits / rank

    # Normalize by all relevant documents, not only the ones retrieved.
    return precision_sum / len(relevant_set)

# Reciprocal Rank
def reciprocal_rank(top_docs, relevant_docs):
    """Return 1/rank of the first relevant document in ``top_docs``.

    Ranks are 1-based. If no document in ``top_docs`` belongs to
    ``relevant_docs`` the result is 0.
    """
    inverse_ranks_of_hits = (
        1 / position
        for position, (candidate, _) in enumerate(top_docs, start=1)
        if candidate in relevant_docs
    )
    # The generator is lazy, so only the first hit is ever evaluated.
    return next(inverse_ranks_of_hits, 0)

In [None]:
# Build the BM25Okapi model from the tokenized articles (one token list per
# DataFrame row); ret_doc reads `bm25` as a module-level global.
doc= df['preprocessed_text'].tolist()
bm25= BM25Okapi(doc)

def ret_doc(query, relevant_docs, top_k=5):
  """Rank the corpus against ``query`` with BM25 and print metrics and hits.

  The query is run through ``preprocess_text``, every document is scored
  with the module-level ``bm25`` model, and the ``top_k`` highest-scoring
  documents are evaluated against ``relevant_docs`` and printed together
  with their heading and the first 200 characters of the article.

  Args:
    query: Free-text query string.
    relevant_docs: Document indices (row positions in ``df``) judged
      relevant for this query.
    top_k: Number of top-ranked documents to evaluate and print. Defaults
      to 5, the value that was previously hard-coded.

  Returns:
    None. All results are printed.

  Raises:
    ValueError: If ``top_k`` is smaller than 1.
  """
  if top_k < 1:
    raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

  query_tokens=preprocess_text(query)
  doc_scores=bm25.get_scores(query_tokens)
  # Pair each score with its document position and order best-first.
  ranked_docs=sorted(enumerate(doc_scores),key=lambda pair:pair[1],reverse=True)
  top_docs=ranked_docs[:top_k]

  # EVALUATION
  precision, recall = calculate_precision_recall(top_docs, relevant_docs)
  ap = average_precision(top_docs, relevant_docs)
  rr = reciprocal_rank(top_docs, relevant_docs)

  print(f"\n=== Evaluation Metrics for Query: '{query}' ===")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"Average Precision (AP): {ap:.4f}")
  print(f"Reciprocal Rank (RR): {rr:.4f}\n")

  print("=== Top Retrieved Documents ===\n")
  for doc_index,score in top_docs:
    # Fetch the row once instead of once per printed field.
    row=df.iloc[doc_index]
    print(f"Doc Index: {doc_index},Score:{score}, Heading: {row['Heading']}")
    print(f"Article: {row['Article'][:200]}...")
    print("\n---\n")


In [None]:
# Example evaluation run for a single-term query.
# NOTE(review): these ids appear to be exactly the top-5 BM25 hits for this
# query (the recorded run scores 1.0 on every metric), which would make the
# evaluation circular — confirm the relevance judgments were made independently.
query = "oil"
relevant_docs = {2630, 2491, 415, 294, 538}  # Example: manually specified relevant documents for the query
ret_doc(query, relevant_docs)


=== Evaluation Metrics for Query: 'oil' ===
Precision: 1.0000
Recall: 1.0000
Average Precision (AP): 1.0000
Reciprocal Rank (RR): 1.0000

=== Top Retrieved Documents ===

Doc Index: 2630,Score:3.367028311931764, Heading: Iran US sanctions stop American oil firms proj
Article: strong>TEHRAN: Iran has imposed no restrictions on U.S. oil firms willing to participate in energy projects in the country but American sanctions make such cooperation impossible, Iran's deputy oil mi...

---

Doc Index: 2491,Score:3.347026536408691, Heading: Pakistan imports 605333 tones palm oil Malaysi
Article: strong>ISLAMABAD: Pakistan imported about 605,333 tones of palm oil from Malaysia during the period of January-September, which were used primarily in the food sector.</strongPakistan continued to be ...

---

Doc Index: 415,Score:3.3363082834671354, Heading: Oil surges on Saudi Arabia Iran tensi
Article: strong>SINGAPORE: Oil prices surged during the start of 2016 trading as relations between top crude