# In [4]:
import os
import json
import pickle
import numpy as np
import re
from collections import defaultdict, Counter
from typing import List, Dict, Set, Tuple
import math
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

class SupremeCourtTFIDFSystem:
    """
    TF-IDF Vectorization and Search System for Supreme Court Documents
    Integrates with existing Boolean and N-gram systems
    """
    
    def __init__(self, corpus_folder: str):
        """
        Initialize TF-IDF System
        
        Args:
            corpus_folder: Path to the cleaned_corpus folder
        """
        self.corpus_folder = corpus_folder
        self.index_folder = os.path.join(corpus_folder, "tfidf_index")
        
        # Create index folder if it doesn't exist
        if not os.path.exists(self.index_folder):
            os.makedirs(self.index_folder)
        
        # Data structures
        self.documents = {}  # doc_id -> document info
        self.doc_texts = {}  # doc_id -> full text
        self.doc_tokens = {}  # doc_id -> list of tokens
        
        # TF-IDF matrices
        self.tfidf_matrix = None  # Sparse TF-IDF matrix
        self.feature_names = []  # Vocabulary
        self.vectorizer = None   # Scikit-learn vectorizer
        
        # Statistics
        self.stats = {
            'total_documents': 0,
            'vocabulary_size': 0,
            'total_terms': 0
        }
        
        # Document mapping
        self.doc_id_to_index = {}
        self.index_to_doc_id = {}
        self.doc_names = []
    
    def load_documents(self):
        """Load documents from the corpus"""
        print("üìÇ Loading documents...")
        
        # Try to find document tokens file
        doc_tokens_file = os.path.join(self.corpus_folder, "document_tokens.json")
        
        if not os.path.exists(doc_tokens_file):
            print(f"‚ùå Document tokens file not found: {doc_tokens_file}")
            
            # Try alternative locations
            alt_locations = [
                os.path.join(self.corpus_folder, "..", "document_tokens.json"),
                os.path.join(self.corpus_folder, "statistics", "document_tokens.json"),
            ]
            
            for location in alt_locations:
                if os.path.exists(location):
                    doc_tokens_file = location
                    print(f"‚úÖ Found at: {doc_tokens_file}")
                    break
        
        if not os.path.exists(doc_tokens_file):
            print("‚ùå Could not find document tokens file")
            return False
        
        # Load document tokens
        try:
            with open(doc_tokens_file, 'r', encoding='utf-8') as f:
                doc_data = json.load(f)
            
            doc_id = 0
            for doc_name, doc_info in doc_data.items():
                doc_id += 1
                doc_key = f"doc_{doc_id:05d}"
                
                tokens = doc_info.get('tokens', [])
                token_count = doc_info.get('token_count', 0)
                
                if tokens and token_count > 0:
                    self.documents[doc_key] = {
                        'name': doc_name,
                        'token_count': token_count
                    }
                    
                    # Store tokens as text for vectorization
                    self.doc_texts[doc_key] = " ".join(tokens)
                    self.doc_tokens[doc_key] = tokens
            
            self.stats['total_documents'] = len(self.documents)
            self.stats['total_terms'] = sum(doc['token_count'] for doc in self.documents.values())
            
            print(f"‚úÖ Loaded {self.stats['total_documents']} documents")
            print(f"üìä Total terms: {self.stats['total_terms']:,}")
            
            return True
            
        except Exception as e:
            print(f"‚ùå Error loading documents: {e}")
            return False
    
    def build_tfidf_index(self, max_features: int = 10000, **kwargs):
        """
        Build TF-IDF index using scikit-learn
        
        Args:
            max_features: Maximum number of features to keep
            **kwargs: Additional parameters for TfidfVectorizer
        """
        print("üî® Building TF-IDF Index...")
        
        if not self.doc_texts:
            print("‚ùå No documents loaded. Loading documents first...")
            if not self.load_documents():
                return False
        
        # Prepare document texts
        doc_ids = sorted(self.doc_texts.keys())
        doc_names = [self.documents[doc_id]['name'] for doc_id in doc_ids]
        doc_texts = [self.doc_texts[doc_id] for doc_id in doc_ids]
        
        # Configure TF-IDF vectorizer
        vectorizer_kwargs = {
            'max_features': max_features,
            'min_df': 2,  # Ignore terms that appear in less than 2 documents
            'max_df': 0.95,  # Ignore terms that appear in more than 95% of documents
            'stop_words': 'english',
            'ngram_range': (1, 2),  # Use unigrams and bigrams
            'sublinear_tf': True,  # Use 1 + log(tf)
            'norm': 'l2',  # Normalize vectors to unit length
            'use_idf': True,  # Use IDF weighting
            'smooth_idf': True,  # Smooth IDF weights
            **kwargs
        }
        
        print(f"üìä Vectorizer parameters: {vectorizer_kwargs}")
        
        # Create and fit vectorizer
        self.vectorizer = SklearnTfidfVectorizer(**vectorizer_kwargs)
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)
        
        # Get feature names
        self.feature_names = self.vectorizer.get_feature_names_out()
        
        # Update statistics
        self.stats['vocabulary_size'] = len(self.feature_names)
        
        # Store document mapping
        self.doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        self.index_to_doc_id = {idx: doc_id for idx, doc_id in enumerate(doc_ids)}
        self.doc_names = doc_names
        
        # Save index
        self.save_index()
        
        print(f"\n‚úÖ TF-IDF Index built successfully!")
        print(f"   Documents: {self.stats['total_documents']:,}")
        print(f"   Vocabulary: {self.stats['vocabulary_size']:,}")
        print(f"   TF-IDF matrix shape: {self.tfidf_matrix.shape}")
        print(f"   Sparsity: {(1 - self.tfidf_matrix.nnz / (self.tfidf_matrix.shape[0] * self.tfidf_matrix.shape[1])) * 100:.1f}%")
        
        # Show sample features
        sample_features = self.feature_names[:10]
        print(f"   Sample features: {', '.join(sample_features)}")
        
        return True
    
    def save_index(self):
        """Save TF-IDF index to disk"""
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå No index to save")
            return
        
        # Save vectorizer and matrix
        index_data = {
            'vectorizer': self.vectorizer,
            'tfidf_matrix': self.tfidf_matrix,
            'documents': self.documents,
            'doc_texts': self.doc_texts,
            'doc_tokens': self.doc_tokens,
            'feature_names': self.feature_names,
            'stats': self.stats,
            'doc_id_to_index': self.doc_id_to_index,
            'index_to_doc_id': self.index_to_doc_id,
            'doc_names': self.doc_names
        }
        
        index_file = os.path.join(self.index_folder, "tfidf_index.pkl")
        with open(index_file, 'wb') as f:
            pickle.dump(index_data, f)
        
        # Also save a human-readable version
        readable_file = os.path.join(self.index_folder, "tfidf_stats.json")
        readable_data = {
            'stats': self.stats,
            'matrix_shape': self.tfidf_matrix.shape,
            'sample_features': self.feature_names[:50].tolist() if hasattr(self.feature_names, 'tolist') else self.feature_names[:50],
            'document_count': len(self.documents)
        }
        
        with open(readable_file, 'w', encoding='utf-8') as f:
            json.dump(readable_data, f, indent=2, ensure_ascii=False)
        
        print(f"\nüíæ Index saved to: {index_file}")
        print(f"üìä Statistics saved to: {readable_file}")
    
    def load_index(self):
        """Load TF-IDF index from disk"""
        index_file = os.path.join(self.index_folder, "tfidf_index.pkl")
        
        if not os.path.exists(index_file):
            print(f"‚ùå Index not found at: {index_file}")
            print("Building new index...")
            return self.build_tfidf_index()
        
        try:
            print(f"üìÇ Loading TF-IDF index from: {index_file}")
            with open(index_file, 'rb') as f:
                index_data = pickle.load(f)
            
            self.vectorizer = index_data['vectorizer']
            self.tfidf_matrix = index_data['tfidf_matrix']
            self.documents = index_data['documents']
            self.doc_texts = index_data['doc_texts']
            self.doc_tokens = index_data['doc_tokens']
            self.feature_names = index_data['feature_names']
            self.stats = index_data['stats']
            self.doc_id_to_index = index_data['doc_id_to_index']
            self.index_to_doc_id = index_data['index_to_doc_id']
            self.doc_names = index_data['doc_names']
            
            print(f"‚úÖ TF-IDF Index loaded successfully!")
            print(f"   Documents: {self.stats['total_documents']:,}")
            print(f"   Vocabulary: {self.stats['vocabulary_size']:,}")
            print(f"   Matrix shape: {self.tfidf_matrix.shape}")
            
            return True
            
        except Exception as e:
            print(f"‚ùå Error loading index: {e}")
            import traceback
            traceback.print_exc()
            return self.build_tfidf_index()
    
    def search_similar_documents(self, query: str, top_k: int = 10) -> List[Dict]:
        """
        Search for documents similar to query using cosine similarity
        
        Args:
            query: Search query string
            top_k: Number of top results to return
            
        Returns:
            List of similar documents with similarity scores
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå TF-IDF index not loaded")
            return []
        
        # Transform query to TF-IDF vector
        query_vector = self.vectorizer.transform([query])
        
        # Calculate cosine similarity with all documents
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        # Get top K similar documents
        top_indices = similarities.argsort()[::-1][:top_k]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):
            doc_id = self.index_to_doc_id[idx]
            doc_info = self.documents[doc_id]
            similarity = similarities[idx]
            
            # Calculate query term relevance
            query_terms = query.lower().split()
            doc_text = self.doc_texts[doc_id].lower()
            query_terms_found = [term for term in query_terms if term in doc_text]
            
            results.append({
                'doc_id': doc_id,
                'name': doc_info['name'],
                'token_count': doc_info['token_count'],
                'similarity_score': similarity,
                'query_terms_found': query_terms_found,
                'terms_found_count': len(query_terms_found),
                'rank': rank,
                'relevance_score': self.calculate_relevance_score(similarity, len(query_terms_found), len(query_terms))
            })
        
        # Sort by relevance score (higher is better)
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        
        # Re-rank based on final relevance score
        for i, result in enumerate(results, 1):
            result['rank'] = i
        
        return results
    
    def calculate_relevance_score(self, similarity: float, terms_found: int, total_terms: int) -> float:
        """
        Calculate comprehensive relevance score
        
        Args:
            similarity: Cosine similarity score
            terms_found: Number of query terms found in document
            total_terms: Total number of query terms
            
        Returns:
            Combined relevance score
        """
        if total_terms == 0:
            return similarity
        
        # Term coverage score (0-1)
        term_coverage = terms_found / total_terms
        
        # Weighted combination: 70% similarity + 30% term coverage
        relevance = (0.7 * similarity) + (0.3 * term_coverage)
        
        return relevance
    
    def find_similar_to_document(self, doc_name: str, top_k: int = 10) -> List[Dict]:
        """
        Find documents similar to a specific document
        
        Args:
            doc_name: Name of the document
            top_k: Number of top results to return
            
        Returns:
            List of similar documents
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå TF-IDF index not loaded")
            return []
        
        # Find the document
        target_doc_id = None
        for doc_id, info in self.documents.items():
            if info['name'] == doc_name:
                target_doc_id = doc_id
                break
        
        if target_doc_id is None:
            print(f"‚ùå Document not found: {doc_name}")
            return []
        
        # Get document index
        if target_doc_id not in self.doc_id_to_index:
            print(f"‚ùå Document not in TF-IDF matrix: {doc_name}")
            return []
        
        doc_idx = self.doc_id_to_index[target_doc_id]
        
        # Calculate cosine similarity with all other documents
        doc_vector = self.tfidf_matrix[doc_idx:doc_idx+1]
        similarities = cosine_similarity(doc_vector, self.tfidf_matrix).flatten()
        
        # Set self-similarity to 0 to avoid returning the same document
        similarities[doc_idx] = 0
        
        # Get top K similar documents
        top_indices = similarities.argsort()[::-1][:top_k]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):
            if idx == doc_idx:
                continue
                
            doc_id = self.index_to_doc_id[idx]
            doc_info = self.documents[doc_id]
            similarity = similarities[idx]
            
            # Calculate document overlap
            target_tokens = set(self.doc_tokens[target_doc_id])
            other_tokens = set(self.doc_tokens[doc_id])
            overlap_ratio = len(target_tokens.intersection(other_tokens)) / len(target_tokens.union(other_tokens)) if target_tokens.union(other_tokens) else 0
            
            results.append({
                'doc_id': doc_id,
                'name': doc_info['name'],
                'token_count': doc_info['token_count'],
                'similarity_score': similarity,
                'overlap_ratio': overlap_ratio,
                'rank': rank
            })
        
        # Sort by similarity score (already sorted, but ensure)
        results.sort(key=lambda x: x['similarity_score'], reverse=True)
        
        return results
    
    def get_top_terms_for_document(self, doc_name: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """
        Get top TF-IDF terms for a specific document
        
        Args:
            doc_name: Name of the document
            top_n: Number of top terms to return
            
        Returns:
            List of (term, tfidf_score) tuples
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå TF-IDF index not loaded")
            return []
        
        # Find the document
        target_doc_id = None
        for doc_id, info in self.documents.items():
            if info['name'] == doc_name:
                target_doc_id = doc_id
                break
        
        if target_doc_id is None:
            print(f"‚ùå Document not found: {doc_name}")
            return []
        
        # Get document index
        if target_doc_id not in self.doc_id_to_index:
            print(f"‚ùå Document not in TF-IDF matrix: {doc_name}")
            return []
        
        doc_idx = self.doc_id_to_index[target_doc_id]
        
        # Get document vector
        doc_vector = self.tfidf_matrix[doc_idx]
        
        # Convert to dense if sparse
        if sp.issparse(doc_vector):
            doc_vector = doc_vector.toarray().flatten()
        
        # Get top term indices
        top_indices = doc_vector.argsort()[::-1][:top_n]
        
        # Get term names and scores
        top_terms = []
        for idx in top_indices:
            if doc_vector[idx] > 0:
                term = self.feature_names[idx]
                score = doc_vector[idx]
                top_terms.append((term, score))
        
        return top_terms
    
    def search_by_keywords(self, keywords: List[str], top_k: int = 10) -> List[Dict]:
        """
        Search for documents containing specific keywords using TF-IDF weights
        
        Args:
            keywords: List of keywords to search for
            top_k: Number of top results to return
            
        Returns:
            List of documents with relevance scores
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå TF-IDF index not loaded")
            return []
        
        # Create a query that emphasizes the keywords
        query = " ".join(keywords)
        
        # Transform query to TF-IDF vector
        query_vector = self.vectorizer.transform([query])
        
        # Calculate cosine similarity
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        # Get top K documents
        top_indices = similarities.argsort()[::-1][:top_k]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):
            doc_id = self.index_to_doc_id[idx]
            doc_info = self.documents[doc_id]
            similarity = similarities[idx]
            
            # Check if document contains any of the keywords
            keywords_found = []
            doc_text = self.doc_texts[doc_id].lower()
            for keyword in keywords:
                if keyword.lower() in doc_text:
                    keywords_found.append(keyword)
            
            # Calculate keyword density
            keyword_density = 0
            if doc_info['token_count'] > 0:
                total_occurrences = sum(doc_text.count(keyword.lower()) for keyword in keywords)
                keyword_density = total_occurrences / doc_info['token_count']
            
            results.append({
                'doc_id': doc_id,
                'name': doc_info['name'],
                'token_count': doc_info['token_count'],
                'similarity_score': similarity,
                'keywords_found': keywords_found,
                'keywords_count': len(keywords_found),
                'keyword_density': keyword_density,
                'rank': rank,
                'relevance_score': self.calculate_keyword_relevance(similarity, len(keywords_found), len(keywords), keyword_density)
            })
        
        # Sort by relevance score
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        
        # Re-rank
        for i, result in enumerate(results, 1):
            result['rank'] = i
        
        return results
    
    def calculate_keyword_relevance(self, similarity: float, found_count: int, total_keywords: int, density: float) -> float:
        """
        Calculate relevance score for keyword search
        
        Args:
            similarity: Cosine similarity
            found_count: Number of keywords found
            total_keywords: Total number of keywords
            density: Keyword density in document
            
        Returns:
            Combined relevance score
        """
        if total_keywords == 0:
            return similarity
        
        # Keyword coverage score
        coverage = found_count / total_keywords
        
        # Weighted combination
        relevance = (0.5 * similarity) + (0.3 * coverage) + (0.2 * min(density * 10, 1.0))
        
        return relevance
    
    def show_document_preview(self, doc_name: str, preview_lines: int = 10):
        """Show preview of a document"""
        # Try to find the document in cleaned_docs folder
        cleaned_docs_folder = os.path.join(self.corpus_folder, "cleaned_docs")
        
        if not os.path.exists(cleaned_docs_folder):
            # Try alternative locations
            possible_locations = [
                os.path.join(self.corpus_folder, "..", "cleaned_docs"),
                os.path.join(os.path.dirname(self.corpus_folder), "cleaned_docs"),
                r"C:\\Users\\Armaghan Rafique\\Desktop\\AI Project\\cleaned_corpus\\cleaned_docs",
            ]
            
            for location in possible_locations:
                if os.path.exists(location):
                    cleaned_docs_folder = location
                    break
        
        doc_path = os.path.join(cleaned_docs_folder, doc_name)
        
        if not os.path.exists(doc_path):
            print(f"‚ùå Document not found: {doc_path}")
            return
        
        try:
            with open(doc_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            print(f"\n" + "=" * 80)
            print(f"üìÑ DOCUMENT PREVIEW: {doc_name}")
            print("=" * 80)
            
            # Extract and show text content
            lines = content.split('\n')
            
            # Find where actual text starts (skip metadata)
            text_start = 0
            for i, line in enumerate(lines):
                if 'TEXT CONTENT:' in line or 'TEXT:' in line:
                    text_start = i + 1
                    break
            
            print(f"\nüìù First {preview_lines} lines of content:")
            print("-" * 80)
            
            for i, line in enumerate(lines[text_start:text_start + preview_lines]):
                if line.strip():
                    clean_line = re.sub(r'\s+', ' ', line.strip())
                    if len(clean_line) > 120:
                        print(f"{i+1:3d}. {clean_line[:117]}...")
                    else:
                        print(f"{i+1:3d}. {clean_line}")
            
            if len(lines) > text_start + preview_lines:
                print(f"\n... and {len(lines) - (text_start + preview_lines)} more lines")
            
            print("=" * 80)
            
        except Exception as e:
            print(f"‚ùå Error reading document: {e}")
    
    def show_statistics(self):
        """Show TF-IDF system statistics"""
        print(f"\nüìä TF-IDF SYSTEM STATISTICS")
        print("=" * 80)
        print(f"Documents: {self.stats['total_documents']:,}")
        print(f"Vocabulary: {self.stats['vocabulary_size']:,}")
        print(f"Total terms: {self.stats['total_terms']:,}")
        
        if self.tfidf_matrix is not None:
            print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
            print(f"Matrix density: {self.tfidf_matrix.nnz / (self.tfidf_matrix.shape[0] * self.tfidf_matrix.shape[1]) * 100:.2f}%")
        
        # Show most common terms
        if self.vectorizer is not None and hasattr(self.vectorizer, 'idf_'):
            print(f"\nüìà Most important terms (highest IDF scores):")
            
            # Get terms with highest IDF scores
            feature_names = self.feature_names
            idf_scores = self.vectorizer.idf_
            
            # Sort by IDF (descending)
            top_indices = idf_scores.argsort()[::-1][:20]
            
            for i, idx in enumerate(top_indices[:10], 1):
                term = feature_names[idx]
                score = idf_scores[idx]
                print(f"  {i:2d}. {term:<20} IDF: {score:.3f}")
        
        print("=" * 80)
    
    def interactive_search(self):
        """
        Run the console command loop until the user types ``quit``.

        Recognised commands: ``search``, ``similar``, ``keywords``, ``terms``,
        ``preview``, ``stats``, ``rebuild`` and ``quit``. Commands are matched
        case-insensitively; any other non-empty input is run as a search
        query with top_k=10.
        """
        rule = "=" * 80

        # Help banner, printed once before the loop starts
        banner = (
            "\n" + rule,
            "üîç TF-IDF VECTOR SPACE SEARCH SYSTEM",
            rule,
            "\nüìã Available Commands:",
            "  ‚Ä¢ search <query>           - Search for documents similar to query",
            "  ‚Ä¢ similar <doc_name>       - Find documents similar to a specific document",
            "  ‚Ä¢ keywords <word1 word2>   - Search by keywords",
            "  ‚Ä¢ terms <doc_name>         - Show top terms for a document",
            "  ‚Ä¢ preview <doc_name>       - Preview a document",
            "  ‚Ä¢ stats                    - Show system statistics",
            "  ‚Ä¢ rebuild                  - Rebuild TF-IDF index",
            "  ‚Ä¢ quit                     - Exit",
            "\nüìù Example searches:",
            "  ‚Ä¢ search murder evidence trial",
            "  ‚Ä¢ similar 2025LHC7277.txt",
            "  ‚Ä¢ keywords supreme court appeal",
            "  ‚Ä¢ terms 2025LHC7389.txt",
            rule,
        )
        for banner_line in banner:
            print(banner_line)

        while True:
            user_input = input("\nüéØ Enter command: ").strip()
            if not user_input:
                continue

            # Commands are matched on the lower-cased input; arguments are
            # sliced from the original text so their case is preserved
            command = user_input.lower()

            if command == 'quit':
                print("üëã Goodbye!")
                break

            if command == 'stats':
                self.show_statistics()
                continue

            if command == 'rebuild':
                confirm = input("‚ö†Ô∏è  Rebuild TF-IDF index? This may take time. (y/n): ").strip().lower()
                if confirm == 'y':
                    max_features = input("Enter max features (default 10000): ").strip()
                    # Anything that is not a plain number falls back to the default
                    max_features = int(max_features) if max_features.isdigit() else 10000
                    self.build_tfidf_index(max_features=max_features)
                continue

            if command.startswith('search '):
                query = user_input[7:].strip()
                if not query:
                    print("‚ùå Please enter a search query")
                    continue
                print(f"\nüîç Searching for: '{query}'")
                results = self.search_similar_documents(query, top_k=15)
                self.display_results(results, "Search Results")
                continue

            if command.startswith('similar '):
                doc_name = user_input[8:].strip()
                if not doc_name:
                    print("‚ùå Please enter a document name")
                    continue
                print(f"\nüîç Finding documents similar to: '{doc_name}'")
                results = self.find_similar_to_document(doc_name, top_k=15)
                self.display_results(results, f"Similar to {doc_name}")
                continue

            if command.startswith('keywords '):
                keywords = user_input[9:].strip().split()
                if not keywords:
                    print("‚ùå Please enter keywords")
                    continue
                print(f"\nüîç Searching by keywords: {keywords}")
                results = self.search_by_keywords(keywords, top_k=15)
                self.display_results(results, f"Keywords: {', '.join(keywords)}")
                continue

            if command.startswith('terms '):
                doc_name = user_input[6:].strip()
                if not doc_name:
                    print("‚ùå Please enter a document name")
                    continue

                print(f"\nüî§ Top terms for: '{doc_name}'")
                top_terms = self.get_top_terms_for_document(doc_name, top_n=15)

                if not top_terms:
                    print("‚ùå No terms found or document not in index")
                    continue

                print("\n" + rule)
                print(f"üìä TOP TERMS FOR: {doc_name}")
                print(rule)
                for i, (term, score) in enumerate(top_terms, 1):
                    print(f"{i:2d}. {term:<25} TF-IDF: {score:.4f}")
                print(rule)
                continue

            if command.startswith('preview '):
                doc_name = user_input[8:].strip()
                if doc_name:
                    self.show_document_preview(doc_name)
                else:
                    print("‚ùå Please enter a document name")
                continue

            # Nothing matched: treat the whole input as a search query
            print(f"\nüîç Searching for: '{user_input}'")
            results = self.search_similar_documents(user_input, top_k=10)
            self.display_results(results, "Search Results")
    
    def display_results(self, results: List[Dict], title: str):
        """Display search results"""
        if not results:
            print(f"\n‚ùå No results found")
            return
        
        print(f"\n‚úÖ {title}")
        print(f"üìä Found {len(results)} document(s)")
        print("=" * 80)
        
        for i, result in enumerate(results[:15], 1):  # Show only top 15
            print(f"\n{i:2d}. üìÑ {result['name']}")
            print(f"    üìè Length: {result['token_count']:,} tokens")
            print(f"    ü•á Rank: #{result['rank']}")
            print(f"    ‚≠ê Similarity: {result['similarity_score']:.4f}")
            
            if 'relevance_score' in result:
                print(f"    üéØ Relevance: {result['relevance_score']:.3f}")
            
            if 'query_terms_found' in result and result['query_terms_found']:
                print(f"    üîç Query terms found: {', '.join(result['query_terms_found'][:5])}")
                if len(result['query_terms_found']) > 5:
                    print(f"       ... and {len(result['query_terms_found']) - 5} more")
            
            if 'keywords_found' in result and result['keywords_found']:
                print(f"    üîë Keywords found: {', '.join(result['keywords_found'][:3])}")
                if len(result['keywords_found']) > 3:
                    print(f"       ... and {len(result['keywords_found']) - 3} more")
                if 'keyword_density' in result:
                    print(f"    üìà Keyword density: {result['keyword_density']:.4f}")
        
        if len(results) > 15:
            print(f"\n... and {len(results) - 15} more documents")
        
        print("\n" + "=" * 80)
        
        # Ask for document preview
        if results:
            choice = input("\nüìñ Preview a document? (enter number or 'n'): ").strip()
            if choice.lower() != 'n' and choice.isdigit():
                idx = int(choice) - 1
                if 0 <= idx < len(results):
                    self.show_document_preview(results[idx]['name'])


def main():
    """
    Command-line entry point for the TF-IDF vector space search system

    Picks the corpus folder (hard-coded default first, then a short list of
    fallback locations, finally a path typed by the user), creates a
    SupremeCourtTFIDFSystem for it, initialises it through load_index(),
    prints the summary counters and hands over to the interactive search
    loop. Returns early when no usable folder or index is available.
    """
    banner = "=" * 80
    print(banner)
    print("üîç SUPREME COURT - TF-IDF VECTOR SPACE SEARCH SYSTEM")
    print(banner)

    # Default location of the cleaned corpus
    corpus_folder = r"C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus"

    if not os.path.exists(corpus_folder):
        print(f"‚ùå Corpus folder not found: {corpus_folder}")

        # Fallback locations, tried in this order; the first existing one wins
        fallback_folders = (
            r"C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_court_judgements_txt",
            r"C:\Users\Armaghan Rafique\Desktop\AI Project",
            os.path.join(os.path.expanduser("~"), "Desktop", "AI Project", "cleaned_corpus"),
        )
        existing = next((path for path in fallback_folders if os.path.exists(path)), None)

        if existing is not None:
            corpus_folder = existing
            print(f"‚úÖ Using folder: {corpus_folder}")
        else:
            # Nothing found automatically: ask the user
            corpus_folder = input("üìÅ Enter corpus folder path: ").strip()
            if not os.path.exists(corpus_folder):
                print(f"‚ùå Folder does not exist: {corpus_folder}")
                return

    print(f"\nüìÅ Using corpus folder: {corpus_folder}")

    tfidf_system = SupremeCourtTFIDFSystem(corpus_folder)

    print("\nüìÇ Initializing TF-IDF system...")
    initialised = tfidf_system.load_index()
    if not initialised:
        print("‚ùå Failed to initialize TF-IDF system")
        return

    # Summary of what is now held in memory
    summary = tfidf_system.stats
    print("\nüìä TF-IDF SYSTEM READY")
    print(f"   Documents: {summary['total_documents']:,}")
    print(f"   Vocabulary: {summary['vocabulary_size']:,}")
    print(f"   Total terms: {summary['total_terms']:,}")

    tfidf_system.interactive_search()


if __name__ == "__main__":
    main()

🥇 SUPREME COURT - TF-IDF DOCUMENT RANKING SYSTEM

📁 Using corpus folder: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus

📂 Initializing TF-IDF ranking system...
❌ Index not found at: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\tfidf_ranking\tfidf_ranking.pkl
Building new index...
🔨 Calculating TF-IDF Scores for Ranking...
❌ No documents loaded. Loading documents first...
📂 Loading documents...
✅ Loaded 1460 documents
📊 Total terms: 1,048,901
📊 Vectorizer parameters: {'max_features': 10000, 'min_df': 2, 'max_df': 0.95, 'stop_words': 'english', 'ngram_range': (1, 2), 'sublinear_tf': False, 'norm': None, 'use_idf': True, 'smooth_idf': True}

💾 Index saved to: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\tfidf_ranking\tfidf_ranking.pkl
📊 Statistics saved to: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\tfidf_ranking\tfidf_ranking_stats.json

✅ TF-IDF Scores calculated successfully!
   Documents: 1


🎯 Enter command:  quit


👋 Goodbye!


In [None]:
import os
import json
import pickle
import numpy as np
import re
from collections import defaultdict, Counter
from typing import List, Dict, Set, Tuple
import math
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

class SupremeCourtTFIDFRanking:
    """
    TF-IDF Ranking System for Supreme Court Documents
    Properly ranks documents based on TF-IDF relevance scores
    """
    
    def __init__(self, corpus_folder: str):
        """
        Initialize TF-IDF Ranking System
        
        Args:
            corpus_folder: Path to the cleaned_corpus folder
        """
        self.corpus_folder = corpus_folder
        self.index_folder = os.path.join(corpus_folder, "tfidf_ranking")
        
        # Create index folder if it doesn't exist
        if not os.path.exists(self.index_folder):
            os.makedirs(self.index_folder)
        
        # Data structures
        self.documents = {}  # doc_id -> document info
        self.doc_texts = {}  # doc_id -> full text
        self.doc_tokens = {}  # doc_id -> list of tokens
        
        # TF-IDF components
        self.tfidf_matrix = None  # Document-term TF-IDF matrix
        self.query_tfidf_matrix = None  # Query-term TF-IDF matrix (for ranking)
        self.feature_names = []  # Vocabulary
        self.vectorizer = None   # Scikit-learn vectorizer
        
        # IDF values for ranking
        self.idf_values = {}
        
        # Statistics
        self.stats = {
            'total_documents': 0,
            'vocabulary_size': 0,
            'total_terms': 0
        }
        
        # Document mapping
        self.doc_id_to_index = {}
        self.index_to_doc_id = {}
        self.doc_names = []
    
    def load_documents(self):
        """Load documents from the corpus"""
        print("üìÇ Loading documents...")
        
        # Try to find document tokens file
        doc_tokens_file = os.path.join(self.corpus_folder, "document_tokens.json")
        
        if not os.path.exists(doc_tokens_file):
            print(f"‚ùå Document tokens file not found: {doc_tokens_file}")
            
            # Try alternative locations
            alt_locations = [
                os.path.join(self.corpus_folder, "..", "document_tokens.json"),
                os.path.join(self.corpus_folder, "statistics", "document_tokens.json"),
            ]
            
            for location in alt_locations:
                if os.path.exists(location):
                    doc_tokens_file = location
                    print(f"‚úÖ Found at: {doc_tokens_file}")
                    break
        
        if not os.path.exists(doc_tokens_file):
            print("‚ùå Could not find document tokens file")
            return False
        
        # Load document tokens
        try:
            with open(doc_tokens_file, 'r', encoding='utf-8') as f:
                doc_data = json.load(f)
            
            doc_id = 0
            for doc_name, doc_info in doc_data.items():
                doc_id += 1
                doc_key = f"doc_{doc_id:05d}"
                
                tokens = doc_info.get('tokens', [])
                token_count = doc_info.get('token_count', 0)
                
                if tokens and token_count > 0:
                    self.documents[doc_key] = {
                        'name': doc_name,
                        'token_count': token_count,
                        'tokens': tokens
                    }
                    
                    # Store tokens as text for vectorization
                    self.doc_texts[doc_key] = " ".join(tokens)
                    self.doc_tokens[doc_key] = tokens
            
            self.stats['total_documents'] = len(self.documents)
            self.stats['total_terms'] = sum(doc['token_count'] for doc in self.documents.values())
            
            print(f"‚úÖ Loaded {self.stats['total_documents']} documents")
            print(f"üìä Total terms: {self.stats['total_terms']:,}")
            
            return True
            
        except Exception as e:
            print(f"‚ùå Error loading documents: {e}")
            return False
    
    def calculate_tf_idf_scores(self, max_features: int = 10000, **kwargs):
        """
        Calculate TF-IDF scores and build ranking matrix
        
        Args:
            max_features: Maximum number of features to keep
            **kwargs: Additional parameters for TfidfVectorizer
        """
        print("üî® Calculating TF-IDF Scores for Ranking...")
        
        if not self.doc_texts:
            print("‚ùå No documents loaded. Loading documents first...")
            if not self.load_documents():
                return False
        
        # Prepare document texts
        doc_ids = sorted(self.doc_texts.keys())
        doc_names = [self.documents[doc_id]['name'] for doc_id in doc_ids]
        doc_texts = [self.doc_texts[doc_id] for doc_id in doc_ids]
        
        # Configure TF-IDF vectorizer
        vectorizer_kwargs = {
            'max_features': max_features,
            'min_df': 2,  # Ignore terms that appear in less than 2 documents
            'max_df': 0.95,  # Ignore terms that appear in more than 95% of documents
            'stop_words': 'english',
            'ngram_range': (1, 2),  # Use unigrams and bigrams
            'sublinear_tf': False,  # Use raw TF for ranking
            'norm': None,  # No normalization for ranking
            'use_idf': True,
            'smooth_idf': True,
            **kwargs
        }
        
        print(f"üìä Vectorizer parameters: {vectorizer_kwargs}")
        
        # Create and fit vectorizer
        self.vectorizer = SklearnTfidfVectorizer(**vectorizer_kwargs)
        self.tfidf_matrix = self.vectorizer.fit_transform(doc_texts)
        
        # Get feature names and IDF values
        self.feature_names = self.vectorizer.get_feature_names_out()
        self.idf_values = dict(zip(self.feature_names, self.vectorizer.idf_))
        
        # Store document mapping
        self.doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        self.index_to_doc_id = {idx: doc_id for idx, doc_id in enumerate(doc_ids)}
        self.doc_names = doc_names
        
        # Update statistics
        self.stats['vocabulary_size'] = len(self.feature_names)
        
        # Save index
        self.save_index()
        
        print(f"\n‚úÖ TF-IDF Scores calculated successfully!")
        print(f"   Documents: {self.stats['total_documents']:,}")
        print(f"   Vocabulary: {self.stats['vocabulary_size']:,}")
        print(f"   TF-IDF matrix shape: {self.tfidf_matrix.shape}")
        
        # Show TF-IDF statistics
        self.show_tfidf_statistics()
        
        return True
    
    def show_tfidf_statistics(self):
        """Show TF-IDF statistics"""
        if self.tfidf_matrix is None:
            return
        
        # Convert to dense for analysis (sample only)
        sample_matrix = self.tfidf_matrix[:10].toarray()
        
        print(f"\nüìà TF-IDF STATISTICS:")
        print("-" * 50)
        print(f"Average TF-IDF score per document: {sample_matrix.mean():.4f}")
        print(f"Maximum TF-IDF score: {sample_matrix.max():.4f}")
        print(f"Minimum TF-IDF score: {sample_matrix.min():.4f}")
        
        # Show terms with highest IDF (most discriminating)
        sorted_idf = sorted(self.idf_values.items(), key=lambda x: x[1], reverse=True)[:10]
        print(f"\nüìä Top 10 terms by IDF (most discriminating):")
        for i, (term, idf) in enumerate(sorted_idf, 1):
            print(f"  {i:2d}. {term:<20} IDF: {idf:.3f}")
        
        print("-" * 50)
    
    def save_index(self):
        """Save TF-IDF index to disk"""
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå No index to save")
            return
        
        # Save vectorizer and matrix
        index_data = {
            'vectorizer': self.vectorizer,
            'tfidf_matrix': self.tfidf_matrix,
            'documents': self.documents,
            'doc_texts': self.doc_texts,
            'doc_tokens': self.doc_tokens,
            'feature_names': self.feature_names,
            'idf_values': self.idf_values,
            'stats': self.stats,
            'doc_id_to_index': self.doc_id_to_index,
            'index_to_doc_id': self.index_to_doc_id,
            'doc_names': self.doc_names
        }
        
        index_file = os.path.join(self.index_folder, "tfidf_ranking.pkl")
        with open(index_file, 'wb') as f:
            pickle.dump(index_data, f)
        
        # Also save a human-readable version
        readable_file = os.path.join(self.index_folder, "tfidf_ranking_stats.json")
        readable_data = {
            'stats': self.stats,
            'matrix_shape': self.tfidf_matrix.shape,
            'top_terms_by_idf': sorted(self.idf_values.items(), key=lambda x: x[1], reverse=True)[:50],
            'document_count': len(self.documents)
        }
        
        with open(readable_file, 'w', encoding='utf-8') as f:
            json.dump(readable_data, f, indent=2, ensure_ascii=False)
        
        print(f"\nüíæ Index saved to: {index_file}")
        print(f"üìä Statistics saved to: {readable_file}")
    
    def load_index(self):
        """Load TF-IDF index from disk"""
        index_file = os.path.join(self.index_folder, "tfidf_ranking.pkl")
        
        if not os.path.exists(index_file):
            print(f"‚ùå Index not found at: {index_file}")
            print("Building new index...")
            return self.calculate_tf_idf_scores()
        
        try:
            print(f"üìÇ Loading TF-IDF ranking index from: {index_file}")
            with open(index_file, 'rb') as f:
                index_data = pickle.load(f)
            
            self.vectorizer = index_data['vectorizer']
            self.tfidf_matrix = index_data['tfidf_matrix']
            self.documents = index_data['documents']
            self.doc_texts = index_data['doc_texts']
            self.doc_tokens = index_data['doc_tokens']
            self.feature_names = index_data['feature_names']
            self.idf_values = index_data['idf_values']
            self.stats = index_data['stats']
            self.doc_id_to_index = index_data['doc_id_to_index']
            self.index_to_doc_id = index_data['index_to_doc_id']
            self.doc_names = index_data['doc_names']
            
            print(f"‚úÖ TF-IDF Ranking Index loaded successfully!")
            print(f"   Documents: {self.stats['total_documents']:,}")
            print(f"   Vocabulary: {self.stats['vocabulary_size']:,}")
            print(f"   Matrix shape: {self.tfidf_matrix.shape}")
            
            return True
            
        except Exception as e:
            print(f"‚ùå Error loading index: {e}")
            import traceback
            traceback.print_exc()
            return self.calculate_tf_idf_scores()
    
    def rank_documents_by_query(self, query: str, top_k: int = 20) -> List[Dict]:
        """
        Rank documents by TF-IDF relevance to query
        
        Args:
            query: Search query string
            top_k: Number of top results to return
            
        Returns:
            List of ranked documents with TF-IDF scores
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            print("‚ùå TF-IDF index not loaded")
            return []
        
        print(f"\nüîç Ranking documents for query: '{query}'")
        
        # Transform query to TF-IDF vector
        query_vector = self.vectorizer.transform([query])
        
        # Calculate dot product (TF-IDF similarity) between query and documents
        # This gives us the sum of TF-IDF scores for query terms in each document
        relevance_scores = (query_vector * self.tfidf_matrix.T).toarray().flatten()
        
        # Get top K documents by relevance score
        top_indices = relevance_scores.argsort()[::-1][:top_k]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):
            if relevance_scores[idx] <= 0:
                continue  # Skip documents with zero relevance
                
            doc_id = self.index_to_doc_id[idx]
            doc_info = self.documents[doc_id]
            score = relevance_scores[idx]
            
            # Get query terms present in document with their TF-IDF scores
            query_terms = query.lower().split()
            term_scores = self.get_query_term_scores(doc_id, query_terms)
            
            # Calculate document length normalization factor
            doc_length = doc_info['token_count']
            norm_factor = math.log(1 + doc_length) if doc_length > 0 else 1
            
            # Normalized score (adjust for document length)
            normalized_score = score / norm_factor if norm_factor > 0 else score
            
            results.append({
                'doc_id': doc_id,
                'name': doc_info['name'],
                'token_count': doc_info['token_count'],
                'tfidf_score': score,
                'normalized_score': normalized_score,
                'rank': rank,
                'query_terms': query_terms,
                'term_scores': term_scores,
                'terms_found': len([t for t in term_scores if t['tfidf'] > 0]),
                'total_terms': len(query_terms)
            })
        
        # Sort by normalized score (higher is better)
        results.sort(key=lambda x: x['normalized_score'], reverse=True)
        
        # Re-rank
        for i, result in enumerate(results, 1):
            result['rank'] = i
        
        return results
    
    def get_query_term_scores(self, doc_id: str, query_terms: List[str]) -> List[Dict]:
        """
        Get TF-IDF scores for query terms in a specific document
        
        Args:
            doc_id: Document ID
            query_terms: List of query terms
            
        Returns:
            List of term score dictionaries
        """
        if doc_id not in self.doc_id_to_index:
            return []
        
        doc_idx = self.doc_id_to_index[doc_id]
        doc_vector = self.tfidf_matrix[doc_idx]
        
        # Convert to dense for easier access
        if sp.issparse(doc_vector):
            doc_vector = doc_vector.toarray().flatten()
        
        term_scores = []
        for term in query_terms:
            # Find term index in vocabulary
            term_idx = None
            for i, feature in enumerate(self.feature_names):
                if term in feature.lower():
                    term_idx = i
                    break
            
            if term_idx is not None and term_idx < len(doc_vector):
                tfidf_score = doc_vector[term_idx]
                idf_score = self.idf_values.get(self.feature_names[term_idx], 0)
                
                # Get raw term frequency from document
                doc_tokens = self.doc_tokens[doc_id]
                term_freq = doc_tokens.count(term.lower())
                
                term_scores.append({
                    'term': term,
                    'tf': term_freq,
                    'idf': idf_score,
                    'tfidf': tfidf_score,
                    'in_vocabulary': True
                })
            else:
                term_scores.append({
                    'term': term,
                    'tf': 0,
                    'idf': 0,
                    'tfidf': 0,
                    'in_vocabulary': False
                })
        
        return term_scores
    
    def rank_documents_by_keywords(self, keywords: List[str], top_k: int = 20) -> List[Dict]:
        """
        Rank documents by keyword relevance using TF-IDF
        
        Args:
            keywords: List of keywords
            top_k: Number of top results to return
            
        Returns:
            List of ranked documents
        """
        query = " ".join(keywords)
        return self.rank_documents_by_query(query, top_k)
    
    def get_top_tfidf_terms_for_document(self, doc_name: str, top_n: int = 15) -> List[Dict]:
        """
        Get top TF-IDF terms for a specific document
        
        Args:
            doc_name: Name of the document
            top_n: Number of top terms to return
            
        Returns:
            List of term dictionaries with TF-IDF scores
        """
        if self.tfidf_matrix is None:
            print("‚ùå TF-IDF matrix not loaded")
            return []
        
        # Find the document
        target_doc_id = None
        for doc_id, info in self.documents.items():
            if info['name'] == doc_name:
                target_doc_id = doc_id
                break
        
        if target_doc_id is None:
            print(f"‚ùå Document not found: {doc_name}")
            return []
        
        # Get document index
        if target_doc_id not in self.doc_id_to_index:
            print(f"‚ùå Document not in TF-IDF matrix: {doc_name}")
            return []
        
        doc_idx = self.doc_id_to_index[target_doc_id]
        
        # Get document vector
        doc_vector = self.tfidf_matrix[doc_idx]
        
        # Convert to dense if sparse
        if sp.issparse(doc_vector):
            doc_vector = doc_vector.toarray().flatten()
        
        # Get top term indices
        top_indices = doc_vector.argsort()[::-1][:top_n]
        
        # Get term details
        top_terms = []
        for idx in top_indices:
            score = doc_vector[idx]
            if score > 0:
                term = self.feature_names[idx]
                idf = self.idf_values.get(term, 0)
                
                # Calculate term frequency in document
                doc_tokens = self.doc_tokens[target_doc_id]
                term_lower = term.lower()
                tf = sum(1 for token in doc_tokens if token == term_lower or term_lower in token)
                
                top_terms.append({
                    'term': term,
                    'tf': tf,
                    'idf': idf,
                    'tfidf': score,
                    'rank': len(top_terms) + 1
                })
        
        return top_terms
    
    def show_document_tfidf_analysis(self, doc_name: str):
        """Show comprehensive TF-IDF analysis for a document"""
        print(f"\nüìä TF-IDF ANALYSIS FOR: {doc_name}")
        print("=" * 80)
        
        # Get document info
        doc_info = None
        doc_id = None
        for d_id, info in self.documents.items():
            if info['name'] == doc_name:
                doc_info = info
                doc_id = d_id
                break
        
        if not doc_info:
            print(f"‚ùå Document not found: {doc_name}")
            return
        
        print(f"üìÑ Document: {doc_name}")
        print(f"üìè Tokens: {doc_info['token_count']:,}")
        
        # Get top terms
        top_terms = self.get_top_tfidf_terms_for_document(doc_name, top_n=20)
        
        if top_terms:
            print(f"\nüèÜ TOP 20 TERMS BY TF-IDF SCORE:")
            print("-" * 80)
            print(f"{'Rank':<6} {'Term':<25} {'TF':<8} {'IDF':<8} {'TF-IDF':<10}")
            print("-" * 80)
            
            for term_info in top_terms:
                print(f"{term_info['rank']:<6} {term_info['term'][:24]:<25} "
                      f"{term_info['tf']:<8} {term_info['idf']:<8.3f} {term_info['tfidf']:<10.4f}")
        
        print("=" * 80)
    
    def show_ranking_statistics(self):
        """Show ranking system statistics"""
        print(f"\nüìä TF-IDF RANKING SYSTEM STATISTICS")
        print("=" * 80)
        print(f"Documents: {self.stats['total_documents']:,}")
        print(f"Vocabulary: {self.stats['vocabulary_size']:,}")
        print(f"Total terms: {self.stats['total_terms']:,}")
        
        if self.tfidf_matrix is not None:
            print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
            
            # Calculate average document score
            if self.tfidf_matrix.shape[0] > 0:
                avg_score = self.tfidf_matrix.sum() / (self.tfidf_matrix.shape[0] * self.tfidf_matrix.shape[1])
                print(f"Average TF-IDF score per term: {avg_score:.6f}")
        
        # Show most discriminating terms (highest IDF)
        sorted_idf = sorted(self.idf_values.items(), key=lambda x: x[1], reverse=True)[:15]
        print(f"\nüîù TOP 15 MOST DISCRIMINATING TERMS (Highest IDF):")
        for i, (term, idf) in enumerate(sorted_idf, 1):
            print(f"  {i:2d}. {term:<25} IDF: {idf:.3f}")
        
        # Show most common terms (lowest IDF)
        sorted_idf_low = sorted(self.idf_values.items(), key=lambda x: x[1])[:15]
        print(f"\nüìâ TOP 15 MOST COMMON TERMS (Lowest IDF):")
        for i, (term, idf) in enumerate(sorted_idf_low, 1):
            print(f"  {i:2d}. {term:<25} IDF: {idf:.3f}")
        
        print("=" * 80)
    
    def interactive_ranking(self):
        """Interactive TF-IDF ranking interface"""
        print("\n" + "=" * 80)
        print("ü•á TF-IDF DOCUMENT RANKING SYSTEM")
        print("=" * 80)
        print("\nüìã Available Commands:")
        print("  ‚Ä¢ rank <query>           - Rank documents by query relevance")
        print("  ‚Ä¢ keywords <word1 word2> - Rank documents by keywords")
        print("  ‚Ä¢ terms <doc_name>       - Show top TF-IDF terms for document")
        print("  ‚Ä¢ analyze <doc_name>     - Show TF-IDF analysis for document")
        print("  ‚Ä¢ stats                  - Show ranking statistics")
        print("  ‚Ä¢ rebuild                - Recalculate TF-IDF scores")
        print("  ‚Ä¢ quit                   - Exit")
        print("\nüìù Example commands:")
        print("  ‚Ä¢ rank murder evidence")
        print("  ‚Ä¢ keywords supreme court appeal")
        print("  ‚Ä¢ terms 2025LHC7277.txt")
        print("  ‚Ä¢ analyze 2025LHC7389.txt")
        print("=" * 80)
        
        while True:
            user_input = input("\nüéØ Enter command: ").strip()
            
            if not user_input:
                continue
            
            if user_input.lower() == 'quit':
                print("üëã Goodbye!")
                break
            
            elif user_input.lower() == 'stats':
                self.show_ranking_statistics()
            
            elif user_input.lower() == 'rebuild':
                confirm = input("‚ö†Ô∏è  Recalculate TF-IDF scores? This may take time. (y/n): ").strip().lower()
                if confirm == 'y':
                    max_features = input("Enter max features (default 10000): ").strip()
                    max_features = int(max_features) if max_features.isdigit() else 10000
                    self.calculate_tf_idf_scores(max_features=max_features)
            
            elif user_input.lower().startswith('rank '):
                query = user_input[5:].strip()
                if query:
                    print(f"\nüîç Ranking documents for query: '{query}'")
                    results = self.rank_documents_by_query(query, top_k=20)
                    self.display_ranking_results(results, f"Ranking for: '{query}'")
                else:
                    print("‚ùå Please enter a query")
            
            elif user_input.lower().startswith('keywords '):
                keywords = user_input[9:].strip().split()
                if keywords:
                    print(f"\nüîç Ranking documents for keywords: {keywords}")
                    results = self.rank_documents_by_keywords(keywords, top_k=20)
                    self.display_ranking_results(results, f"Keywords: {', '.join(keywords)}")
                else:
                    print("‚ùå Please enter keywords")
            
            elif user_input.lower().startswith('terms '):
                doc_name = user_input[6:].strip()
                if doc_name:
                    print(f"\nüî§ Top TF-IDF terms for: '{doc_name}'")
                    top_terms = self.get_top_tfidf_terms_for_document(doc_name, top_n=15)
                    
                    if top_terms:
                        print("\n" + "=" * 80)
                        print(f"üèÜ TOP TF-IDF TERMS FOR: {doc_name}")
                        print("=" * 80)
                        
                        for term_info in top_terms:
                            print(f"{term_info['rank']:2d}. {term_info['term']:<25} "
                                  f"TF-IDF: {term_info['tfidf']:.4f} "
                                  f"(TF: {term_info['tf']}, IDF: {term_info['idf']:.3f})")
                        
                        print("=" * 80)
                    else:
                        print("‚ùå No terms found or document not in index")
                else:
                    print("‚ùå Please enter a document name")
            
            elif user_input.lower().startswith('analyze '):
                doc_name = user_input[8:].strip()
                if doc_name:
                    self.show_document_tfidf_analysis(doc_name)
                else:
                    print("‚ùå Please enter a document name")
            
            else:
                # Try as a ranking query
                print(f"\nüîç Ranking documents for query: '{user_input}'")
                results = self.rank_documents_by_query(user_input, top_k=15)
                self.display_ranking_results(results, f"Ranking for: '{user_input}'")
    
    def display_ranking_results(self, results: List[Dict], title: str):
        """Display ranking results"""
        if not results:
            print(f"\n‚ùå No relevant documents found")
            return
        
        print(f"\n‚úÖ {title}")
        print(f"üìä Ranked {len(results)} document(s) by TF-IDF relevance")
        print("=" * 80)
        
        for i, result in enumerate(results[:15], 1):  # Show top 15
            print(f"\n{i:2d}. üìÑ {result['name']}")
            print(f"    ü•á Rank: #{result['rank']}")
            print(f"    üìè Length: {result['token_count']:,} tokens")
            print(f"    ‚≠ê TF-IDF Score: {result['tfidf_score']:.4f}")
            print(f"    üìä Normalized Score: {result['normalized_score']:.4f}")
            
            if result['term_scores']:
                # Show top contributing terms
                top_terms = sorted([t for t in result['term_scores'] if t['tfidf'] > 0], 
                                 key=lambda x: x['tfidf'], reverse=True)[:3]
                
                if top_terms:
                    print(f"    üîù Top contributing terms:")
                    for term_info in top_terms:
                        print(f"       ‚Ä¢ {term_info['term']}: TF-IDF={term_info['tfidf']:.4f} "
                              f"(TF={term_info['tf']}, IDF={term_info['idf']:.3f})")
            
            print(f"    üîç Terms matched: {result['terms_found']}/{result['total_terms']}")
        
        if len(results) > 15:
            print(f"\n... and {len(results) - 15} more ranked documents")
        
        print("\n" + "=" * 80)
        
        # Ask for analysis
        if results:
            choice = input("\nüìä Analyze a document? (enter rank number or 'n'): ").strip()
            if choice.lower() != 'n' and choice.isdigit():
                idx = int(choice) - 1
                if 0 <= idx < len(results):
                    self.show_document_tfidf_analysis(results[idx]['name'])


def main():
    """
    Script entry point.

    Prints the banner, resolves the corpus folder (default location, then a
    list of fallback locations, then a path typed by the user), builds a
    SupremeCourtTFIDFRanking for it, loads its index, prints the summary
    statistics and finally hands over to the interactive ranking prompt.
    Returns early (None) when no usable folder is found or the index fails
    to load.
    """
    rule = "=" * 80
    print(rule)
    print("ü•á SUPREME COURT - TF-IDF DOCUMENT RANKING SYSTEM")
    print(rule)

    # Default corpus location
    corpus_folder = r"C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus"

    if not os.path.exists(corpus_folder):
        print(f"‚ùå Corpus folder not found: {corpus_folder}")

        # Fallback locations, probed in order; the first existing one wins
        fallback_candidates = (
            r"C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_court_judgements_txt",
            r"C:\Users\Armaghan Rafique\Desktop\AI Project",
            os.path.join(os.path.expanduser("~"), "Desktop", "AI Project", "cleaned_corpus"),
        )
        first_existing = next(
            (candidate for candidate in fallback_candidates if os.path.exists(candidate)),
            None,
        )

        if first_existing is not None:
            corpus_folder = first_existing
            print(f"‚úÖ Using folder: {corpus_folder}")
        else:
            # Last resort: ask the user and give up if that path is missing too
            corpus_folder = input("üìÅ Enter corpus folder path: ").strip()
            if not os.path.exists(corpus_folder):
                print(f"‚ùå Folder does not exist: {corpus_folder}")
                return

    print(f"\nüìÅ Using corpus folder: {corpus_folder}")

    ranking_system = SupremeCourtTFIDFRanking(corpus_folder)

    print("\nüìÇ Initializing TF-IDF ranking system...")
    index_ready = ranking_system.load_index()
    if not index_ready:
        print("‚ùå Failed to initialize TF-IDF ranking system")
        return

    # Summary of the loaded index
    print(f"\nüìä TF-IDF RANKING SYSTEM READY")
    summary_rows = (
        ("Documents", 'total_documents'),
        ("Vocabulary", 'vocabulary_size'),
        ("Total terms", 'total_terms'),
    )
    for label, stat_key in summary_rows:
        print(f"   {label}: {ranking_system.stats[stat_key]:,}")

    # Hand over to the interactive prompt loop
    ranking_system.interactive_ranking()


# Start the interactive session only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

🥇 SUPREME COURT - TF-IDF DOCUMENT RANKING SYSTEM

📁 Using corpus folder: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus

📂 Initializing TF-IDF ranking system...
📂 Loading TF-IDF ranking index from: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\tfidf_ranking\tfidf_ranking.pkl
✅ TF-IDF Ranking Index loaded successfully!
   Documents: 1,460
   Vocabulary: 10,000
   Matrix shape: (1460, 10000)

📊 TF-IDF RANKING SYSTEM READY
   Documents: 1,460
   Vocabulary: 10,000
   Total terms: 1,048,901

🥇 TF-IDF DOCUMENT RANKING SYSTEM

📋 Available Commands:
  • rank <query>           - Rank documents by query relevance
  • keywords <word1 word2> - Rank documents by keywords
  • terms <doc_name>       - Show top TF-IDF terms for document
  • analyze <doc_name>     - Show TF-IDF analysis for document
  • stats                  - Show ranking statistics
  • rebuild                - Recalculate TF-IDF scores
  • quit                   - Exit


🎯 Enter command:  police



🔍 Ranking documents for query: 'police'

🔍 Ranking documents for query: 'police'

✅ Ranking for: 'police'
📊 Ranked 15 document(s) by TF-IDF relevance

 1. 📄 SMC_No_102017_Shoulder_Out_20of_20Turn_20Promotion.txt
    🥇 Rank: #1
    📏 Length: 19,904 tokens
    ⭐ TF-IDF Score: 148.8222
    📊 Normalized Score: 15.0345
    🔍 Terms matched: 0/1

 2. 📄 SHOULDEROUT_OF_TURN_PROMOTION_IN_GB_POLICE_Shoulder_Out_20of_20Turn_20Promotion.txt
    🥇 Rank: #2
    📏 Length: 19,904 tokens
    ⭐ TF-IDF Score: 148.8222
    📊 Normalized Score: 15.0345
    🔍 Terms matched: 0/1

 3. 📄 Civil_Appeal_No_802016_in_CPLA_No1172016_Prov._20Govt_20__20others_20vs_20Rehmat_20Jan_20DSP_20__20others.txt
    🥇 Rank: #3
    📏 Length: 871 tokens
    ⭐ TF-IDF Score: 91.5829
    📊 Normalized Score: 13.5262
    🔍 Terms matched: 0/1

 4. 📄 Provincial_Government_through_Chief_Secretary_Gilg_Prov._20Govt_20__20others_20vs_20Rehmat_20Jan_20DSP_20__20others.txt
    


📊 Analyze a document? (enter rank number or 'n'):  n

🎯 Enter command:  skardu



🔍 Ranking documents for query: 'skardu'

🔍 Ranking documents for query: 'skardu'

✅ Ranking for: 'skardu'
📊 Ranked 15 document(s) by TF-IDF relevance

 1. 📄 Clean_Drinking_water_Clean_20Drinking_20water.txt
    🥇 Rank: #1
    📏 Length: 1,105 tokens
    ⭐ TF-IDF Score: 202.3664
    📊 Normalized Score: 28.8744
    🔍 Terms matched: 0/1

 2. 📄 SMC_No032009_Clean_20Drinking_20water.txt
    🥇 Rank: #2
    📏 Length: 1,105 tokens
    ⭐ TF-IDF Score: 202.3664
    📊 Normalized Score: 28.8744
    🔍 Terms matched: 0/1

 3. 📄 The_Deputy_Commissioner_Skardu_Versus_Akhond_Muham_the_Deputy_Commissioner_Skardu___others_versus_Akhond_Muhammad___others.txt
    🥇 Rank: #3
    📏 Length: 1,270 tokens
    ⭐ TF-IDF Score: 141.6565
    📊 Normalized Score: 19.8189
    🔍 Terms matched: 0/1

 4. 📄 Civil_Appeal_No_152015_in_CPLA_No_492015_the_Deputy_Commissioner_Skardu___others_versus_Akhond_Muhammad___others.txt
    🥇 Rank: #4
    📏 Length: 1


📊 Analyze a document? (enter rank number or 'n'):  n#

🎯 Enter command:  respondant



🔍 Ranking documents for query: 'respondant'

🔍 Ranking documents for query: 'respondant'

❌ No relevant documents found
