In [19]:
# Install all required packages
!pip install scikit-learn rank-bm25 nltk pandas joblib tqdm chardet --quiet

print("Packages installed")

Packages installed


In [20]:
import nltk

# Download ALL required NLTK data and report failures instead of assuming success.
print("Downloading NLTK language data...")
print("This may take a minute...")

required_packages = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
]

# nltk.download() signals a failed download through its boolean return value
# rather than by raising, so the results have to be inspected explicitly.
failed_packages = [
    package
    for package in required_packages
    if not nltk.download(package, quiet=False)
]

if failed_packages:
    print(f"Failed to download NLTK data: {', '.join(failed_packages)}")
else:
    print("All NLTK data downloaded successfully!")

Downloading NLTK language data...
This may take a minute...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...


All NLTK data downloaded successfully!


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [21]:
from google.colab import files
import os

# Only prompt for an upload when Articles.csv is not already in the working directory
if os.path.exists("Articles.csv"):
    print("Articles.csv already exists")
    print(f"File size: {os.path.getsize('Articles.csv')} bytes")
else:
    print("Please upload your Articles.csv file")
    uploaded = files.upload()

Articles.csv already exists
File size: 5071129 bytes


In [22]:
import pandas as pd
import chardet

print("Checking file encoding...")
with open('Articles.csv', 'rb') as f:
    # Only the first 100,000 bytes are sampled for detection.
    raw_data = f.read(100000)
result = chardet.detect(raw_data)
print(f" Detected encoding: {result['encoding']} ({result['confidence']:.2%} confidence)")

# Only encoding-related failures should trigger a retry with another codec:
# UnicodeError for undecodable bytes, LookupError for a codec name Python does
# not know. Anything else (missing file, malformed CSV, Ctrl-C) must propagate.
ENCODING_ERRORS = (UnicodeError, LookupError)

working_encoding = None

# Test reading with the detected encoding. chardet reports None when it cannot
# guess; in that case skip straight to the fallbacks.
if result['encoding']:
    try:
        df = pd.read_csv("Articles.csv", encoding=result['encoding'])
    except ENCODING_ERRORS as err:
        print(f" Could not read with {result['encoding']}: {err}")
    else:
        print(" File read successfully!")
        print(f" Rows: {len(df)}, Columns: {df.columns.tolist()}")
        print("\n Sample:")
        print(df.head(2))
        working_encoding = result['encoding']

if working_encoding is None:
    print("Trying alternative encodings...")
    # Strict codecs go first. latin-1 maps every possible byte to a character,
    # so it never fails to decode and has to be the last resort; candidates
    # listed after it would never be reached.
    for enc in ['utf-8', 'cp1252', 'latin-1']:
        try:
            df = pd.read_csv("Articles.csv", encoding=enc)
        except ENCODING_ERRORS:
            continue
        print(f"Works with: {enc}")
        working_encoding = enc
        break

print(f"\n Use this encoding in next step: '{working_encoding}'")

Checking file encoding...
 Detected encoding: Windows-1252 (73.00% confidence)
 File read successfully!
 Rows: 2692, Columns: ['Article', 'Date', 'Heading', 'NewsType']

 Sample:
                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  

 Use this encoding in next step: 'Windows-1252'


In [23]:
import pandas as pd
import numpy as np
import re
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# Make sure NLTK data is available.
# Every resource is checked independently so that a missing stopwords corpus is
# fetched even when the tokenizer data is already installed.
required_nltk_resources = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab/english': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
}

missing_packages = []
for resource_path, package in required_nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        missing_packages.append(package)

if missing_packages:
    print(" Downloading missing NLTK data...")
    for package in missing_packages:
        nltk.download(package, quiet=True)

stop_words = set(stopwords.words('english'))

# Text cleaning functions
def clean_text(text):
    """Normalize a raw value into lowercase, single-spaced text.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Everything else is converted with ``str()``, lowercased, has each
    run of whitespace collapsed to one space, and is stripped at both ends.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    return re.sub(r"\s+", " ", lowered).strip()

def tokenize_text(text):
    """Tokenize text with ``word_tokenize`` and drop uninformative tokens.

    A token is kept only when it contains at least one ASCII letter or digit
    (which removes pure punctuation) and is not in the module-level
    ``stop_words`` set. Token order is preserved.
    """
    return [
        token
        for token in word_tokenize(text)
        if re.search(r"[a-zA-Z0-9]", token) and token not in stop_words
    ]

# Search System Class
class ArticleSearch:
    """Keyword search over a list of documents using BM25 and TF-IDF.

    Construction cleans and tokenizes every document. The indexes themselves
    are built on demand with ``create_bm25()`` / ``create_tfidf()`` and must
    exist before the matching ``search_*`` method is called.
    """

    def __init__(self, documents, doc_ids=None):
        """Clean and tokenize ``documents``.

        Args:
            documents: sequence of raw document texts.
            doc_ids: identifiers parallel to ``documents``; when omitted (or
                empty) the positions ``0..len(documents)-1`` are used.
        """
        self.documents = documents
        self.doc_ids = doc_ids or list(range(len(documents)))
        self.clean_docs = [clean_text(d) for d in documents]
        self.tokenized_docs = [tokenize_text(d) for d in self.clean_docs]
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None
        self.bm25_index = None

    def create_tfidf(self):
        """Fit a TfidfVectorizer on the cleaned documents and store the matrix."""
        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.clean_docs)

    def create_bm25(self):
        """Build a BM25Okapi index from the tokenized documents."""
        self.bm25_index = BM25Okapi(self.tokenized_docs)

    def save_index(self, filename):
        """Persist documents, preprocessed text and both indexes with joblib."""
        joblib.dump({
            'documents': self.documents,
            'doc_ids': self.doc_ids,
            'clean_docs': self.clean_docs,
            'tokenized_docs': self.tokenized_docs,
            'tfidf_vectorizer': self.tfidf_vectorizer,
            'tfidf_matrix': self.tfidf_matrix,
            'bm25_index': self.bm25_index
        }, filename)
        print(f" Index saved as {filename}")

    @classmethod
    def load_index(cls, filename):
        """Restore a search system previously written by ``save_index``.

        Returns:
            An instance populated entirely from the saved state.
        """
        # SECURITY: joblib.load unpickles arbitrary objects and can execute
        # code while doing so; only load index files you created yourself.
        data = joblib.load(filename)
        # Bypass __init__ on purpose: it would clean and tokenize the whole
        # corpus again only for the result to be replaced by the saved copies.
        system = cls.__new__(cls)
        system.documents = data['documents']
        system.doc_ids = data['doc_ids']
        system.clean_docs = data['clean_docs']
        system.tokenized_docs = data['tokenized_docs']
        system.tfidf_vectorizer = data['tfidf_vectorizer']
        system.tfidf_matrix = data['tfidf_matrix']
        system.bm25_index = data['bm25_index']
        return system

    def _top_results(self, scores, num_results):
        """Return result dicts for the ``num_results`` highest entries of ``scores``."""
        top_indices = np.argsort(scores)[::-1][:num_results]
        return [
            {
                'id': self.doc_ids[i],
                'score': float(scores[i]),
                'content': self.documents[i]
            }
            for i in top_indices
        ]

    def search_bm25(self, query, num_results=10):
        """Rank documents against ``query`` with BM25.

        Args:
            query: free-text query; cleaned and tokenized like the documents.
            num_results: maximum number of results to return.

        Returns:
            A list of dicts with keys ``id``, ``score`` and ``content``,
            ordered by descending score. Documents are not filtered by score,
            so non-matching documents may appear with a score of 0.

        Raises:
            RuntimeError: if ``create_bm25()`` has not been called (and the
                index was not restored by ``load_index``).
        """
        if self.bm25_index is None:
            raise RuntimeError("BM25 index not built; call create_bm25() first")
        query_tokens = tokenize_text(clean_text(query))
        scores = self.bm25_index.get_scores(query_tokens)
        return self._top_results(scores, num_results)

    def search_tfidf(self, query, num_results=10):
        """Rank documents against ``query`` by TF-IDF cosine similarity.

        Args:
            query: free-text query; cleaned like the documents.
            num_results: maximum number of results to return.

        Returns:
            A list of dicts with keys ``id``, ``score`` and ``content``,
            ordered by descending similarity.

        Raises:
            RuntimeError: if ``create_tfidf()`` has not been called (and the
                index was not restored by ``load_index``).
        """
        if self.tfidf_vectorizer is None or self.tfidf_matrix is None:
            raise RuntimeError("TF-IDF index not built; call create_tfidf() first")
        query_vector = self.tfidf_vectorizer.transform([clean_text(query)])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        return self._top_results(similarities, num_results)

# --- MAIN EXECUTION ---
print(" Loading your articles...")

# Reuse the encoding established by the encoding-check cell when it has been
# run, instead of overriding it with a hard-coded value. 'latin-1' is only the
# fallback for when no encoding has been determined yet.
working_encoding = globals().get('working_encoding') or 'latin-1'

df = pd.read_csv("Articles.csv", encoding=working_encoding)
print(f" Found {len(df)} articles")

# Check columns
print("\n Available columns:")
for i, col in enumerate(df.columns):
    print(f"  {i}: {col}")

# Use first column for text (change index if needed)
text_column = df.columns[0]
print(f"\n Using column '{text_column}' for article content")

# Prepare data: missing articles become empty strings so every row gets an ID
doc_texts = df[text_column].fillna("").astype(str).tolist()
document_ids = [f"doc_{i}" for i in range(len(doc_texts))]  # Create IDs

print(f"\n Loaded {len(doc_texts)} documents")
print(f" Average document length: {np.mean([len(d) for d in doc_texts]):.0f} characters")

# Build search system (constructing ArticleSearch cleans and tokenizes the corpus)
print("\n Building search indexes...")
print("  Creating BM25 index...")
search_engine = ArticleSearch(doc_texts, document_ids)
search_engine.create_bm25()
print("  Creating TF-IDF index...")
search_engine.create_tfidf()

# Save the index; the filename is defined once so the summary cannot drift from it
index_filename = "article_search_index.pkl"
search_engine.save_index(index_filename)

print("\n" + "="*50)
print(" SEARCH SYSTEM READY!")
print("="*50)
print(f" Index saved: {index_filename}")
print(f" Documents indexed: {len(doc_texts)}")
print(" Methods available: BM25 and TF-IDF")

 Loading your articles...
 Found 2692 articles

 Available columns:
  0: Article
  1: Date
  2: Heading
  3: NewsType

 Using column 'Article' for article content

 Loaded 2692 documents
 Average document length: 1810 characters

 Building search indexes...
  Creating BM25 index...
  Creating TF-IDF index...
 Index saved as article_search_index.pkl

 SEARCH SYSTEM READY!
 Index saved: article_search_index.pkl
 Documents indexed: 2692
 Methods available: BM25 and TF-IDF


In [24]:
working_encoding = 'latin-1'  # Hard-coded encoding; change to the value the encoding-check cell printed ("Use this encoding in next step")

In [25]:
# Smoke test: reload the saved index and run a handful of BM25 queries against it
search_engine = ArticleSearch.load_index("article_search_index.pkl")

# One-word queries, two hits each, are enough to see the index responding
test_queries = ["technology", "science", "health", "education"]

for query in test_queries:
    print(f"\n '{query}':")
    results = search_engine.search_bm25(query, num_results=2)
    for rank, hit in enumerate(results, start=1):
        # Show only the first 100 characters, flattened onto one line
        snippet = hit['content'][:100].replace('\n', ' ') + "..."
        print(f"   {rank}. [ID:{hit['id']}] Score:{hit['score']:.3f}")
        print(f"      {snippet}")

print("\n System is working!")


 'technology':
   1. [ID:doc_2650] Score:6.902
      SAN FRANCISCO: Alphabet on Thursday filed a lawsuit accusing Uber and its self-driving vehicle unit ...
   2. [ID:doc_1807] Score:6.705
      strong>SYDNEY: Australia has resorted to guided missile technology to reduce injuries to their fast ...

 'science':
   1. [ID:doc_1666] Score:7.083
      strong>MELBOURNE: Australia fast bowler John Hastings has been withdrawn from a one-day tri-series a...
   2. [ID:doc_490] Score:7.038
      strong>WASHINGTON: Federal Minister for Planning, Ahsan Iqbal on Saturday said the next 10 years wil...

 'health':
   1. [ID:doc_549] Score:6.792
      strong>NEW DELHI :India´s top cigarette maker ITC Ltd, part-owned by British American Tobacco , said...
   2. [ID:doc_603] Score:5.923
      strong>Karachi: K-Electric has won two national awards in recognition of its commitment to promote a...

 'education':
   1. [ID:doc_961] Score:7.676
      strong>ISLAMABAD: Sindh Chief Minister Syed Murad Ali Shah