In [4]:
import os

# Check current directory
print("Current directory:", os.getcwd())
print("\n--- Files in current directory ---")
print(os.listdir())

# Check inside homework_03_IR folder
print("\n--- Files inside homework_03_IR ---")
print(os.listdir("homework_03_IR"))

Current directory: /content

--- Files in current directory ---
['.config', '.ipynb_checkpoints', 'homework_03_IR', 'sample_data']

--- Files inside homework_03_IR ---
['Articles.csv']


In [6]:
import pandas as pd
csv_file_name = "homework_03_IR/Articles.csv"
try:
    df = pd.read_csv(csv_file_name, encoding='utf-8')
except:
    try:
        df = pd.read_csv(csv_file_name, encoding='latin-1')
    except:
        df = pd.read_csv(csv_file_name, encoding='ISO-8859-1')

print("CSV file loaded successfully!")
print(f"Total rows: {len(df)}")
print(f"Columns: {list(df.columns)}")
print("\n--- First 5 rows ---")
print(df.head())

CSV file loaded successfully!
Total rows: 2692
Columns: ['Article', 'Date', 'Heading', 'NewsType']

--- First 5 rows ---
                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   
2  HONG KONG:  Hong Kong shares opened 0.66 perce...  1/5/2015   
3  HONG KONG: Asian markets tumbled Tuesday follo...  1/6/2015   
4  NEW YORK: US oil prices Monday slipped below $...  1/6/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  
2           hong kong stocks open 0.66 percent lower  business  
3             asian stocks sink euro near nine year   business  
4                 us oil prices slip below 50 a barr  business  


In [7]:
!pip install -q nltk

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

print("Libraries installed!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Libraries installed!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [8]:
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocess the text: lowercase, remove punctuation, tokenize,
    remove stopwords, and apply stemming
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters (keep only letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and apply stemming
    processed_tokens = [
        stemmer.stem(word) for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    return processed_tokens

# Test on first article
print("--- Original Text (first 200 chars) ---")
print(df['Article'][0][:200])

print("\n--- Preprocessed Tokens (first 20) ---")
processed = preprocess_text(df['Article'][0])
print(processed[:20])

--- Original Text (first 200 chars) ---
KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources s

--- Preprocessed Tokens (first 20) ---
['karachi', 'sindh', 'govern', 'decid', 'bring', 'public', 'transport', 'fare', 'per', 'cent', 'due', 'massiv', 'reduct', 'petroleum', 'product', 'price', 'feder', 'govern', 'geo', 'news']


In [9]:
import time

print("Starting preprocessing of all documents...")
start_time = time.time()

# Apply preprocessing to all articles
df['processed_tokens'] = df['Article'].apply(preprocess_text)
df['processed_text'] = df['processed_tokens'].apply(lambda x: ' '.join(x))

end_time = time.time()

print(f" Preprocessing completed in {end_time - start_time:.2f} seconds!")
print(f"Total documents processed: {len(df)}")
print("\n--- Sample processed document ---")
print(f"Original: {df['Article'][0][:100]}...")
print(f"Processed: {df['processed_text'][0][:100]}...")

Starting preprocessing of all documents...
 Preprocessing completed in 16.24 seconds!
Total documents processed: 2692

--- Sample processed document ---
Original: KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to ...
Processed: karachi sindh govern decid bring public transport fare per cent due massiv reduct petroleum product ...


In [10]:
import time

print("Starting preprocessing of all documents...")
start_time = time.time()

# Apply preprocessing to all articles
df['processed_tokens'] = df['Article'].apply(preprocess_text)

# Also keep processed text as string (needed for TF-IDF later)
df['processed_text'] = df['processed_tokens'].apply(lambda x: ' '.join(x))

end_time = time.time()

print(f"Preprocessing completed in {end_time - start_time:.2f} seconds!")
print(f"Total documents processed: {len(df)}")
print("\n--- Sample processed document ---")
print(f"Original: {df['Article'][0][:100]}...")
print(f"Processed: {df['processed_text'][0][:100]}...")

Starting preprocessing of all documents...
Preprocessing completed in 13.22 seconds!
Total documents processed: 2692

--- Sample processed document ---
Original: KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to ...
Processed: karachi sindh govern decid bring public transport fare per cent due massiv reduct petroleum product ...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

print("Building TF-IDF model...")

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.8
)
# Fit and transform the processed documents
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

print(f"TF-IDF model built successfully!")
print(f"Matrix shape: {tfidf_matrix.shape}")
print(f"  - {tfidf_matrix.shape[0]} documents")
print(f"  - {tfidf_matrix.shape[1]} unique terms")
print(f"Matrix size in memory: {tfidf_matrix.data.nbytes / (1024*1024):.2f} MB")

Building TF-IDF model...
TF-IDF model built successfully!
Matrix shape: (2692, 5000)
  - 2692 documents
  - 5000 unique terms
Matrix size in memory: 2.16 MB


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search_documents(query, top_k=5):
    """
    Search for documents relevant to the query using TF-IDF and cosine similarity

    Args:
        query: Search query string
        top_k: Number of top results to return

    Returns:
        List of tuples (document_index, similarity_score, heading, article_snippet)
    """
    # Preprocess the query
    processed_query_tokens = preprocess_text(query)
    processed_query = ' '.join(processed_query_tokens)

    # Transform query to TF-IDF vector
    query_vector = tfidf_vectorizer.transform([processed_query])

    # Calculate cosine similarity between query and all documents
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get top k document indices
    top_indices = np.argsort(similarity_scores)[-top_k:][::-1]

    # Prepare results
    results = []
    for idx in top_indices:
        score = similarity_scores[idx]
        heading = df['Heading'].iloc[idx]
        article_snippet = df['Article'].iloc[idx][:200] + "..."
        results.append((idx, score, heading, article_snippet))

    return results

# Test the search function
test_query = "oil prices economy"
print(f"Searching for: '{test_query}'")
print("="*80)

results = search_documents(test_query, top_k=5)

for rank, (doc_id, score, heading, snippet) in enumerate(results, 1):
    print(f"\n Rank {rank} | Score: {score:.4f} | Doc ID: {doc_id}")
    print(f"Heading: {heading}")
    print(f"Snippet: {snippet}")
    print("-"*80)

Searching for: 'oil prices economy'

 Rank 1 | Score: 0.4228 | Doc ID: 271
Heading: oil prices swing wildly in china uncertainty
Snippet: London: Fragile financial markets are grappling with wild swings in world oil prices, unnerved by uncertainty over global supplies and the demand outlook from China.By the close of business Monday, th...
--------------------------------------------------------------------------------

 Rank 2 | Score: 0.3741 | Doc ID: 520
Heading: Oil prices stable as market seen bottoming but oversupply linger
Snippet: strong>SINGAPORE: Oil prices were stable in early trading on Monday, with global oversupply and slowing economic growth weighing on markets but prospects of falling production lending some support.</s...
--------------------------------------------------------------------------------

 Rank 3 | Score: 0.3695 | Doc ID: 765
Heading: POL prices to remain unchanged for July
Snippet: strong>ISLAMABAD: The prices of petroleum products would remain unchanged

In [13]:
# Test multiple queries
test_queries = [
    "Pakistan government policy",
    "stock market investment",
    "technology innovation"
]

for query in test_queries:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print('='*80)

    results = search_documents(query, top_k=3)

    for rank, (doc_id, score, heading, snippet) in enumerate(results, 1):
        print(f"\nRank {rank} | Score: {score:.4f}")
        print(f"Heading: {heading}")
        print(f"Snippet: {snippet[:150]}...")


Query: 'Pakistan government policy'

Rank 1 | Score: 0.3338
Heading: SBP keeps policy rate unchanged at 575
Snippet: strong>KARACHI: State Bank of Pakistan (SBP) on Saturday announced its monetary policy for the next two months and kept the policy rate unchanged at 5...

Rank 2 | Score: 0.3280
Heading: Trade Policy Framework 2015 18 to be announced 
Snippet: ISLAMABAD: The Ministry of Commerce has finalized new Strategic Trade Policy Framework 2015-18 which would be announced soon after the approval of the...

Rank 3 | Score: 0.2845
Heading: PML N govt turned around Pakistans economy in 3 years PM
Snippet: strong>DAVOS: Prime Minister Nawaz Sharif Wednesday said his government had achieved economic stability and turned around the countrys economy in a s...

Query: 'stock market investment'

Rank 1 | Score: 0.3846
Heading: Pakistan stocks fall 2 pct Brexit volatility
Snippet: strong>ISLAMABAD: Pakistani shares fell 2 percent in early trading on Friday, hurt by a global sell-off in ris

In [14]:
import pickle

# Save the TF-IDF vectorizer and matrix
print("Saving model components...")

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# Save TF-IDF matrix
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Save preprocessed dataframe
df.to_pickle('preprocessed_data.pkl')

print("Model saved successfully!")
print("\nSaved files:")
print("  - tfidf_vectorizer.pkl")
print("  - tfidf_matrix.pkl")
print("  - preprocessed_data.pkl")

Saving model components...
Model saved successfully!

Saved files:
  - tfidf_vectorizer.pkl
  - tfidf_matrix.pkl
  - preprocessed_data.pkl


In [15]:
import time
import sys

print("="*80)
print("EVALUATION METRICS")
print("="*80)

# 1. Memory Usage
print("\n MEMORY USAGE:")
print(f"  - TF-IDF Matrix: {tfidf_matrix.data.nbytes / (1024*1024):.2f} MB")
print(f"  - Dataframe: {sys.getsizeof(df) / (1024*1024):.2f} MB")
print(f"  - Total Documents: {len(df)}")
print(f"  - Vocabulary Size: {len(tfidf_vectorizer.vocabulary_)}")

# 2. Query Speed Test
print("\n QUERY SPEED TEST:")
test_queries = [
    "oil prices economy",
    "Pakistan government",
    "stock market",
    "technology innovation",
    "sports cricket"
]

query_times = []
for query in test_queries:
    start = time.time()
    results = search_documents(query, top_k=5)
    end = time.time()
    query_time = (end - start) * 1000  # Convert to milliseconds
    query_times.append(query_time)
    print(f"  Query: '{query}' → {query_time:.2f} ms")

avg_time = sum(query_times) / len(query_times)
print(f"\n  Average Query Time: {avg_time:.2f} ms")

# 3. System Statistics
print("\n SYSTEM STATISTICS:")
print(f"  - Total Documents: {len(df)}")
print(f"  - Index Size: {tfidf_matrix.nnz} non-zero entries")
print(f"  - Sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")

EVALUATION METRICS

 MEMORY USAGE:
  - TF-IDF Matrix: 2.16 MB
  - Dataframe: 15.59 MB
  - Total Documents: 2692
  - Vocabulary Size: 5000

 QUERY SPEED TEST:
  Query: 'oil prices economy' → 13.78 ms
  Query: 'Pakistan government' → 11.54 ms
  Query: 'stock market' → 11.18 ms
  Query: 'technology innovation' → 12.74 ms
  Query: 'sports cricket' → 12.03 ms

  Average Query Time: 12.25 ms

 SYSTEM STATISTICS:
  - Total Documents: 2692
  - Index Size: 282867 non-zero entries
  - Sparsity: 97.90%


In [None]:
def interactive_search():
    """
    Interactive search interface
    """
    print("\n" + "="*80)
    print(" INFORMATION RETRIEVAL SYSTEM - INTERACTIVE SEARCH")
    print("="*80)
    print("Enter your search query (or type 'quit' to exit)")
    print("-"*80)

    while True:
        query = input("\n Search Query: ").strip()

        if query.lower() in ['quit', 'exit', 'q']:
            print("\n Thank you for using the IR system!")
            break

        if not query:
            print(" Please enter a valid query!")
            continue

        print(f"\nSearching for: '{query}'...")
        print("-"*80)

        # Perform search
        start_time = time.time()
        results = search_documents(query, top_k=5)
        search_time = (time.time() - start_time) * 1000

        # Display results
        if results[0][1] == 0:  # If top result has 0 similarity
            print(" No relevant documents found!")
        else:
            for rank, (doc_id, score, heading, snippet) in enumerate(results, 1):
                print(f"\n Rank {rank} | Similarity Score: {score:.4f} | Doc ID: {doc_id}")
                print(f"Heading: {heading}")
                print(f"Snippet: {snippet[:200]}...")
                print("-"*80)

        print(f"\n Search completed in {search_time:.2f} ms")

# Run the interactive search
interactive_search()


 INFORMATION RETRIEVAL SYSTEM - INTERACTIVE SEARCH
Enter your search query (or type 'quit' to exit)
--------------------------------------------------------------------------------

 Search Query: cricket match

Searching for: 'cricket match'...
--------------------------------------------------------------------------------

 Rank 1 | Similarity Score: 0.3804 | Doc ID: 1656
Heading: Scotland claims top prize in ICC Annual Awards 2015
Snippet: DUBAI: Cricket Scotland has taken the top prize in the annual ICC Development Programme Awards 2015, claiming the Best Overall Cricket Development Programme Award.Cricket Scotland´s Development Progra...
--------------------------------------------------------------------------------

 Rank 2 | Similarity Score: 0.3782 | Doc ID: 1746
Heading: ICC President discusses cricket with Punjab CM
Snippet: LAHORE: International Cricket Council (ICC) President Zaheer Abbas here Saturday called on Punjab Chief Minister Muhammad Shehbaz Sharif and discussed