In [5]:
import os
import re
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
from collections import Counter
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
def download_nltk_resources():
    """Make sure the NLTK data packages used by this module are installed.

    Every package is looked up under the ``nltk_data`` category it is
    installed into and is downloaded only when that lookup fails.  The
    original code searched ``corpora/`` for everything except ``punkt``, so
    ``averaged_perceptron_tagger`` (installed under ``taggers/``) was never
    found and was downloaded again on every run.

    The outcome of each download is reported honestly: ``nltk.download``
    returns a boolean, and a failed download is no longer announced as a
    success.
    """
    # Package name -> category directory inside nltk_data.
    # NOTE(review): recent NLTK releases may additionally need 'punkt_tab'
    # for sent_tokenize/word_tokenize - confirm against the installed version.
    resource_categories = {
        'punkt': 'tokenizers',
        'stopwords': 'corpora',
        'wordnet': 'corpora',
        'averaged_perceptron_tagger': 'taggers',
    }

    for resource, category in resource_categories.items():
        try:
            nltk.data.find(f'{category}/{resource}')
        except LookupError:
            print(f"üì• Downloading {resource}...")
            if nltk.download(resource, quiet=True):
                print(f"‚úÖ {resource} downloaded")
            else:
                print(f"WARNING: could not download {resource}")
        else:
            print(f"‚úÖ {resource} already available")

class SupremeCourtCorpusPreprocessor:
    def __init__(self, txt_folder_path):
        """
        Initialize the preprocessor for Supreme Court judgements.

        Creates the output folder tree, makes sure the NLTK resources are
        present, and prepares the stop-word sets, lemmatizer/stemmer and the
        empty accumulators that ``create_corpus`` fills in.

        Args:
            txt_folder_path: Path to folder containing text files. The
                cleaned corpus is written to a sibling folder named
                ``cleaned_corpus``.
        """
        self.txt_folder = txt_folder_path
        # Normalise before taking the parent: os.path.dirname("a/b/") is
        # "a/b", which would otherwise put the corpus INSIDE the input folder
        # whenever the path is given with a trailing separator.
        parent_folder = os.path.dirname(os.path.normpath(self.txt_folder))
        self.corpus_folder = os.path.join(parent_folder, "cleaned_corpus")

        # Setup paths
        self.setup_folders()

        # Initialize NLP tools
        download_nltk_resources()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

        # Custom legal stop words for court judgements.
        # 'honorable' is listed next to 'honourable' because
        # clean_text_phase3 rewrites the latter into the former before
        # tokens are filtered against this set.
        self.legal_stop_words = {
            'court', 'judgement', 'judgment', 'case', 'appellant', 'respondent',
            'petitioner', 'honourable', 'honorable', 'justice', 'bench',
            'supreme', 'high', 'section', 'act', 'article', 'clause',
            'hereinafter', 'aforesaid', 'whereas',
            'plaintiff', 'defendant', 'witness', 'evidence', 'exhibit',
            'dated', 'year', 'month', 'day', 'vs', 'versus', 'v',
            'no.', 'number', 'nos', 'para', 'paragraph'
        }

        # Combine all stop words
        self.all_stop_words = self.stop_words.union(self.legal_stop_words)

        # Corpus-level statistics, filled in by create_corpus()
        self.stats = {
            'total_files': 0,
            'total_words_raw': 0,
            'total_words_cleaned': 0,
            'total_sentences': 0,
            'vocabulary_size': 0,
            'most_common_words': [],
            'file_stats': [],
            'processing_time': 0
        }

        # Store processed data
        self.documents = []
        self.tokens_by_doc = []
        self.vocabulary = set()
        self.word_frequencies = Counter()
    
    def setup_folders(self):
        """Create necessary folders for processed data"""
        folders_to_create = [
            self.corpus_folder,
            os.path.join(self.corpus_folder, "cleaned_docs"),
            os.path.join(self.corpus_folder, "tokens"),
            os.path.join(self.corpus_folder, "statistics"),
            os.path.join(self.corpus_folder, "visualizations")
        ]
        
        for folder in folders_to_create:
            if not os.path.exists(folder):
                os.makedirs(folder)
                print(f"üìÅ Created: {folder}")
    
    def get_all_text_files(self):
        """Get all text files from the folder - FIXED VERSION"""
        text_files = []
        
        print(f"üîç Scanning folder: {self.txt_folder}")
        
        try:
            # List all files in directory
            all_files = os.listdir(self.txt_folder)
            
            for filename in all_files:
                filepath = os.path.join(self.txt_folder, filename)
                
                # Check if it's a file (not directory)
                if os.path.isfile(filepath):
                    # Check if it's a text file (case insensitive)
                    if filename.lower().endswith('.txt'):
                        # Check if it's NOT an error file
                        if not filename.startswith('ERROR_'):
                            text_files.append(filepath)
            
            # Sort files alphabetically
            text_files.sort()
            
            self.stats['total_files'] = len(text_files)
            
            # Debug output
            print(f"   Found {len(text_files)} valid text files")
            if text_files:
                print(f"   First 5 files: {[os.path.basename(f) for f in text_files[:5]]}")
                if len(text_files) > 5:
                    print(f"   ... and {len(text_files) - 5} more")
            
            return text_files
            
        except Exception as e:
            print(f"‚ùå Error scanning folder: {e}")
            return []
    
    def remove_metadata_header(self, text):
        """Drop the PDF-conversion report that precedes the judgement text.

        The header layouts are tried in order; for the first one that matches,
        everything up to and including the match is discarded.  Text without a
        recognised header is returned unchanged apart from the final strip.
        """
        header_patterns = (
            r'=+\s*\nPDF TO TEXT CONVERSION REPORT\s*\n=+\s*\n.*?\nTEXT CONTENT:\s*\n=+\s*\n',
            r'=+\s*\nCONVERSION REPORT\s*\n=+\s*\n.*?\nTEXT:\s*\n=+\s*\n',
            r'SOURCE PDF:.*?\nTEXT CONTENT:.*?\n=+\s*\n',
        )
        search_flags = re.DOTALL | re.IGNORECASE

        # Lazily evaluate the searches so later patterns are only tried when
        # the earlier ones fail.
        attempts = (re.search(pattern, text, search_flags) for pattern in header_patterns)
        first_hit = next((hit for hit in attempts if hit), None)

        body = text if first_hit is None else text[first_hit.end():]
        return body.strip()
    
    def clean_text_phase1(self, text):
        """Phase 1: Basic text cleaning.

        Lower-cases the text, removes URLs and e-mail addresses, replaces
        characters outside the allowed set with a space, removes standalone
        numbers and finally collapses all whitespace runs to single spaces.

        Args:
            text: Raw document text; a falsy value yields ``""``.

        Returns:
            The cleaned, stripped text.
        """
        if not text:
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove special characters but keep basic punctuation and important
        # symbols.  The en dash and em dash are written as \u escapes: the
        # literal characters had been corrupted into mojibake, which made the
        # class keep stray characters and strip the real dashes.
        text = re.sub(r'[^\w\s.,!?;:()\-\u2013\u2014\'\"`]', ' ', text)

        # Remove standalone numbers (but keep numbers attached to words)
        text = re.sub(r'\b\d+\b', '', text)

        # Collapse whitespace last so the removals above leave no double spaces
        text = re.sub(r'\s+', ' ', text)

        return text.strip()
    
    def clean_text_phase2(self, text, *, max_name_words=3):
        """Phase 2: Legal document specific cleaning.

        Removes case citations, section/article references, dates, judge
        names, court/case numbers, boilerplate legal phrases and page or
        paragraph references.  Whitespace left behind by the removals is not
        collapsed here; only the ends of the result are stripped.

        Args:
            text: Text to clean.
            max_name_words: Upper bound on how many words following a title
                (``justice``, ``mr``, ...) are removed as part of a name.
                The previous pattern had no bound and deleted every word up
                to the next punctuation mark, i.e. the rest of the clause.
                Values below 1 are treated as 1.

        Returns:
            The cleaned, stripped text.
        """
        # Remove case citations (e.g., [2024] SC 123, PLD 2023 SC 456)
        text = re.sub(r'\[\d{4}\]\s*\w+\s*\d+\s*\w*', '', text)
        text = re.sub(r'\b(plj|pld|mlc|scmr|scr|clr|mlj)\s*\d{4}\s*\w+\s*\d+', '', text, flags=re.IGNORECASE)

        # Remove section references (e.g., S. 302, Section 34, Art. 25)
        # NOTE(review): the bare 's'/'r' alternatives followed by \w* also
        # match ordinary words starting with those letters when a number
        # follows - left unchanged here, worth tightening separately.
        text = re.sub(r'\b(?:section|s\.?|art\.?|article|cl\.?|clause|rule|r\.?)\s*\w*\s*\d+[a-z]*', '', text, flags=re.IGNORECASE)

        # Remove date patterns
        text = re.sub(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', '', text)
        text = re.sub(r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b', '', text, flags=re.IGNORECASE)

        # Remove judge names: a title plus a bounded number of following words
        extra_name_words = max(int(max_name_words), 1) - 1
        judge_pattern = (
            r'\b(?:honourable|hon\.?|mr\.?|mrs\.?|ms\.?|justice|judge|j\.?)'
            r'\s+\w+(?:\s+\w+){0,' + str(extra_name_words) + r'}\b'
        )
        text = re.sub(judge_pattern, '', text, flags=re.IGNORECASE)

        # Remove court room numbers and case numbers
        text = re.sub(r'\b(?:court|cr\.?|case|no\.?|number)\s*(?:no\.?\s*)?\w*\d+\b', '', text, flags=re.IGNORECASE)

        # Remove common legal phrases that don't add semantic value.
        # Word boundaries keep the phrases from matching inside longer words
        # ("surpassed by", "petition form").  The abbreviations put the dot
        # AFTER the boundary: a trailing \b following '.' only matches when a
        # word character comes next, so "v. " was never removed and "vs."
        # left its dot behind.
        legal_phrases = [
            r'\bin the matter of\b',
            r'\bin re:',
            r'\bversus\b',
            r'\bvs\b\.?',
            r'\bv\.(?!\w)',
            r'\bpetition for\b',
            r'\bappeal against\b',
            r'\breference made\b',
            r'\border dated\b',
            r'\bjudgment dated\b',
            r'\bpassed by\b',
            r'\bheard by\b',
            r'\bpresided over by\b',
            r'\bcoram:',
            r'\bbefore:',
        ]

        for phrase in legal_phrases:
            text = re.sub(phrase, '', text, flags=re.IGNORECASE)

        # Remove page numbers and references
        text = re.sub(r'\bpage\s*\d+\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\bpara\s*\d+\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\bparagraph\s*\d+\b', '', text, flags=re.IGNORECASE)

        return text.strip()
    
    def clean_text_phase3(self, text):
        """Phase 3: Advanced cleaning and normalization.

        Collapses whitespace, drops one-character tokens other than
        ``a``/``i``/``u`` and maps plural or variant legal terms to a single
        canonical spelling.

        The normalisation matches whole words with a regular expression
        rather than comparing whitespace-separated tokens, so a term is also
        normalised when punctuation is attached to it ("appellants,",
        "respondents.").  Matching is case-sensitive, as the table is
        lower-case and phase 1 lower-cases the text.

        Args:
            text: Text produced by the earlier cleaning phases.

        Returns:
            The normalised, stripped text.
        """
        # Remove extra whitespace (again, after phase 2)
        text = re.sub(r'\s+', ' ', text)

        # Remove short words (less than 2 characters) except important ones
        kept_words = [
            word for word in text.split()
            if len(word) >= 2 or word in ('a', 'i', 'u')
        ]
        text = ' '.join(kept_words)

        # Variant -> canonical form (identity mappings are not needed)
        replacements = {
            'appellants': 'appellant',
            'respondents': 'respondent',
            'petitioners': 'petitioner',
            'plaintiffs': 'plaintiff',
            'defendants': 'defendant',
            'judgement': 'judgment',
            'judgements': 'judgment',
            'honourable': 'honorable',
            'honble': 'honorable',
            "hon'ble": 'honorable',
        }

        # Longest alternatives first so a plural is never shadowed by its stem
        alternatives = '|'.join(
            re.escape(term) for term in sorted(replacements, key=len, reverse=True)
        )
        text = re.sub(
            r'\b(?:' + alternatives + r')\b',
            lambda match: replacements[match.group(0)],
            text,
        )

        return text.strip()
    
    def tokenize_text(self, text):
        """Split text into sentences and into a flat list of word tokens.

        Each sentence has characters outside the allowed set replaced by a
        space before it is word-tokenized; the sentences themselves are
        returned exactly as the sentence tokenizer produced them.

        Returns:
            A ``(tokens, sentences)`` tuple.
        """
        sentence_list = sent_tokenize(text)
        disallowed_chars = r'[^\w\s.,!?;:()\-]'

        word_list = [
            word
            for sentence in sentence_list
            for word in word_tokenize(re.sub(disallowed_chars, ' ', sentence))
        ]

        return word_list, sentence_list
    
    def normalize_tokens(self, tokens):
        """Normalize tokens (lemmatization) with legal context"""
        normalized = []
        
        for token in tokens:
            # Skip if token is punctuation
            if token in string.punctuation:
                continue
            
            # Skip stopwords
            if token.lower() in self.all_stop_words:
                continue
            
            # Lemmatize (better for legal documents than stemming)
            lemma = self.lemmatizer.lemmatize(token.lower())
            
            # Only keep tokens with at least 2 characters
            if len(lemma) >= 2:
                normalized.append(lemma)
        
        return normalized
    
    def process_single_document(self, file_path):
        """Process a single text document"""
        file_name = os.path.basename(file_path)
        
        try:
            # Read the file with error handling
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except UnicodeDecodeError:
                # Try different encoding if UTF-8 fails
                with open(file_path, 'r', encoding='latin-1') as f:
                    text = f.read()
            
            # Check if file has content
            if not text or len(text.strip()) < 10:
                print(f"   ‚ö†Ô∏è File is empty or too short: {file_name}")
                return None
            
            # Remove metadata header
            text = self.remove_metadata_header(text)
            
            # Phase 1: Basic cleaning
            text_phase1 = self.clean_text_phase1(text)
            
            # Phase 2: Legal-specific cleaning
            text_phase2 = self.clean_text_phase2(text_phase1)
            
            # Phase 3: Advanced cleaning
            text_phase3 = self.clean_text_phase3(text_phase2)
            
            # Tokenization
            raw_tokens, sentences = self.tokenize_text(text_phase3)
            
            # Check if we have tokens
            if not raw_tokens:
                print(f"   ‚ö†Ô∏è No tokens extracted: {file_name}")
                return None
            
            # Normalize tokens
            clean_tokens = self.normalize_tokens(raw_tokens)
            
            # Calculate statistics
            file_stats = {
                'file_name': file_name,
                'raw_text_length': len(text),
                'cleaned_text_length': len(text_phase3),
                'sentence_count': len(sentences),
                'raw_token_count': len(raw_tokens),
                'clean_token_count': len(clean_tokens),
                'unique_tokens': len(set(clean_tokens)),
                'avg_sentence_length': len(raw_tokens) / len(sentences) if sentences else 0
            }
            
            return {
                'file_name': file_name,
                'original_text': text[:500] + "..." if len(text) > 500 else text,  # Store only preview
                'cleaned_text': text_phase3,
                'raw_tokens': raw_tokens,
                'clean_tokens': clean_tokens,
                'sentences': sentences,
                'stats': file_stats
            }
            
        except Exception as e:
            print(f"‚ùå Error processing {file_name}: {str(e)[:80]}")
            return None
    
    def create_corpus(self):
        """Create a cleaned corpus from all text files.

        Processes every file returned by ``get_all_text_files``, accumulates
        the results on the instance (documents, tokens, vocabulary, word
        frequencies, statistics) and writes one cleaned document and one
        token file per successfully processed input.  Afterwards the
        aggregate data, the statistics reports and the visualizations are
        written and a summary is printed.

        Returns:
            ``True`` when at least one file was processed successfully,
            ``False`` when no input files were found or none could be
            processed.
        """
        print("=" * 80)
        print("‚öñÔ∏è  SUPREME COURT JUDGEMENTS CORPUS PREPROCESSING")
        print("=" * 80)

        # Used to compute the total processing time at the end
        start_time = datetime.now()

        # Get all text files
        text_files = self.get_all_text_files()

        if not text_files:
            print(f"\n‚ùå No text files found in: {self.txt_folder}")

            # Show what's in the folder
            self.debug_folder_contents()
            return False

        print(f"\nüìÅ Text Folder: {self.txt_folder}")
        print(f"üìÅ Corpus Folder: {self.corpus_folder}")
        print(f"üìö Found {len(text_files)} text file(s)")
        print("=" * 80)

        # Process each file
        successful_files = 0
        failed_files = []

        for idx, file_path in enumerate(text_files, 1):
            print(f"\n[{idx}/{len(text_files)}] Processing: {os.path.basename(file_path)}")

            # None signals that the file was skipped or failed
            result = self.process_single_document(file_path)

            if result:
                # Store processed data
                self.documents.append(result)
                self.tokens_by_doc.append(result['clean_tokens'])
                self.vocabulary.update(result['clean_tokens'])
                self.word_frequencies.update(result['clean_tokens'])
                self.stats['file_stats'].append(result['stats'])

                # Update statistics
                self.stats['total_words_raw'] += result['stats']['raw_token_count']
                self.stats['total_words_cleaned'] += result['stats']['clean_token_count']
                self.stats['total_sentences'] += result['stats']['sentence_count']

                # Save cleaned document (same file name as the input)
                cleaned_doc_path = os.path.join(
                    self.corpus_folder, 
                    "cleaned_docs", 
                    result['file_name']
                )
                with open(cleaned_doc_path, 'w', encoding='utf-8') as f:
                    f.write(result['cleaned_text'])

                # Save tokens, one per line, as <input stem>_tokens.txt
                tokens_path = os.path.join(
                    self.corpus_folder,
                    "tokens",
                    f"{os.path.splitext(result['file_name'])[0]}_tokens.txt"
                )
                with open(tokens_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(result['clean_tokens']))

                print(f"   ‚úÖ Cleaned: {result['stats']['clean_token_count']:,} tokens")
                successful_files += 1
            else:
                failed_files.append(os.path.basename(file_path))
                print(f"   ‚ùå Failed to process")

        # Check if we processed any files
        if successful_files == 0:
            print(f"\n‚ùå No files were successfully processed!")
            return False

        # Finalize statistics
        self.stats['vocabulary_size'] = len(self.vocabulary)
        self.stats['most_common_words'] = self.word_frequencies.most_common(50)
        self.stats['processing_time'] = (datetime.now() - start_time).total_seconds()

        # Save processed data
        self.save_processed_data()
        self.generate_statistics_report()

        # Create visualizations (optional - can be commented out if causing issues)
        # A failure here is reported but does not abort the run.
        try:
            self.create_visualizations()
        except Exception as e:
            print(f"‚ö†Ô∏è Could not create visualizations: {e}")

        # Print summary
        self.print_summary()

        # Show failed files if any (at most the first ten names)
        if failed_files:
            print(f"\n‚ö†Ô∏è  Failed to process {len(failed_files)} files:")
            for i, failed in enumerate(failed_files[:10], 1):
                print(f"   {i}. {failed}")
            if len(failed_files) > 10:
                print(f"   ... and {len(failed_files) - 10} more")

        return True
    
    def debug_folder_contents(self):
        """Debug: Show what's in the folder"""
        print(f"\nüìÇ Contents of {self.txt_folder}:")
        try:
            items = os.listdir(self.txt_folder)
            print(f"   Total items: {len(items)}")
            
            # Count by type
            files = []
            dirs = []
            
            for item in items[:20]:  # Show first 20
                item_path = os.path.join(self.txt_folder, item)
                if os.path.isfile(item_path):
                    files.append(item)
                elif os.path.isdir(item_path):
                    dirs.append(item)
            
            print(f"   Files: {len(files)}")
            print(f"   Directories: {len(dirs)}")
            
            # Show text files
            txt_files = [f for f in files if f.lower().endswith('.txt')]
            print(f"\n   Text files found: {len(txt_files)}")
            for txt in txt_files[:10]:
                print(f"     ‚Ä¢ {txt}")
            
        except Exception as e:
            print(f"   Error reading folder: {e}")
    
    def save_processed_data(self):
        """Save all processed data to files"""
        print("\nüíæ Saving processed data...")
        
        # Save vocabulary
        vocab_path = os.path.join(self.corpus_folder, "vocabulary.txt")
        with open(vocab_path, 'w', encoding='utf-8') as f:
            sorted_vocab = sorted(self.vocabulary)
            f.write('\n'.join(sorted_vocab))
        print(f"   ‚úÖ Vocabulary: {len(self.vocabulary):,} words saved")
        
        # Save word frequencies
        freq_path = os.path.join(self.corpus_folder, "word_frequencies.json")
        with open(freq_path, 'w', encoding='utf-8') as f:
            json.dump(dict(self.word_frequencies.most_common()), f, indent=2)
        
        # Save document-token mapping
        doc_tokens_path = os.path.join(self.corpus_folder, "document_tokens.json")
        doc_tokens_data = {}
        for doc in self.documents:
            doc_tokens_data[doc['file_name']] = {
                'tokens': doc['clean_tokens'][:500],  # First 500 tokens only
                'token_count': len(doc['clean_tokens'])
            }
        with open(doc_tokens_path, 'w', encoding='utf-8') as f:
            json.dump(doc_tokens_data, f, indent=2)
        
        # Save complete corpus (all cleaned text in one file)
        corpus_path = os.path.join(self.corpus_folder, "complete_corpus.txt")
        with open(corpus_path, 'w', encoding='utf-8') as f:
            for doc in self.documents:
                f.write(f"\n\n{'='*80}\n")
                f.write(f"DOCUMENT: {doc['file_name']}\n")
                f.write(f"TOKENS: {len(doc['clean_tokens']):,}\n")
                f.write(f"{'='*80}\n\n")
                f.write(doc['cleaned_text'])
        
        # Save statistics
        stats_path = os.path.join(self.corpus_folder, "corpus_statistics.json")
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2)
        
        print("   ‚úÖ All processed data saved")
    
    def generate_statistics_report(self):
        """Generate comprehensive statistics report"""
        print("\nüìä Generating statistics report...")
        
        # Create statistics directory
        stats_dir = os.path.join(self.corpus_folder, "statistics")
        
        # 1. Basic statistics
        basic_stats = {
            "corpus_statistics": {
                "total_documents": len(self.documents),
                "total_sentences": self.stats['total_sentences'],
                "total_words_raw": self.stats['total_words_raw'],
                "total_words_cleaned": self.stats['total_words_cleaned'],
                "vocabulary_size": self.stats['vocabulary_size'],
                "average_words_per_document": self.stats['total_words_cleaned'] / len(self.documents) if self.documents else 0,
                "average_sentences_per_document": self.stats['total_sentences'] / len(self.documents) if self.documents else 0,
                "processing_time_seconds": self.stats['processing_time']
            }
        }
        
        with open(os.path.join(stats_dir, "basic_statistics.json"), 'w', encoding='utf-8') as f:
            json.dump(basic_stats, f, indent=2)
        
        # 2. Document-level statistics
        doc_stats_df = pd.DataFrame(self.stats['file_stats'])
        doc_stats_df.to_csv(os.path.join(stats_dir, "document_statistics.csv"), index=False)
        
        # 3. Word frequency statistics
        word_stats = []
        for word, freq in self.word_frequencies.most_common(1000):
            word_stats.append({
                'word': word,
                'frequency': freq,
                'percentage': (freq / self.stats['total_words_cleaned']) * 100
            })
        
        word_stats_df = pd.DataFrame(word_stats)
        word_stats_df.to_csv(os.path.join(stats_dir, "word_frequencies.csv"), index=False)
        
        print("   ‚úÖ Statistics reports generated")
    
    def create_visualizations(self):
        """Create visualizations of the corpus.

        Writes two PNG files into the ``visualizations`` sub-folder: a word
        cloud built from all clean tokens (``wordcloud.png``) and a
        horizontal bar chart of the 20 most frequent words
        (``top_words.png``).  Nothing is drawn when there are no clean
        tokens.  Any exception is caught and reported on stdout.
        """
        print("\nüé® Creating visualizations...")

        viz_dir = os.path.join(self.corpus_folder, "visualizations")

        try:
            # Combine all tokens for word cloud (one space-separated string)
            all_text = ' '.join([' '.join(doc['clean_tokens']) for doc in self.documents])

            if all_text.strip():
                # 1. Word Cloud
                wordcloud = WordCloud(
                    width=1200,
                    height=800,
                    background_color='white',
                    max_words=200,
                    contour_width=3,
                    contour_color='steelblue'
                ).generate(all_text)

                # The pyplot calls below act on the implicit current figure,
                # so their order matters; each figure is closed after saving.
                plt.figure(figsize=(15, 10))
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis('off')
                plt.title('Supreme Court Judgements Word Cloud', fontsize=20, pad=20)
                plt.savefig(os.path.join(viz_dir, 'wordcloud.png'), dpi=300, bbox_inches='tight')
                plt.close()

                # 2. Top 20 words bar chart
                top_words = self.word_frequencies.most_common(20)
                if top_words:
                    # Split the (word, count) pairs into two parallel tuples
                    words, freqs = zip(*top_words)

                    plt.figure(figsize=(15, 8))
                    plt.barh(range(len(words)), freqs, color='steelblue')
                    plt.yticks(range(len(words)), words)
                    plt.xlabel('Frequency')
                    plt.title('Top 20 Most Frequent Words')
                    # Put the most frequent word at the top of the chart
                    plt.gca().invert_yaxis()
                    plt.tight_layout()
                    plt.savefig(os.path.join(viz_dir, 'top_words.png'), dpi=300)
                    plt.close()

                print("   ‚úÖ Visualizations created")
            else:
                print("   ‚ö†Ô∏è No text for visualizations")

        except Exception as e:
            print(f"   ‚ö†Ô∏è Could not create visualizations: {e}")
    
    def print_summary(self):
        """Print preprocessing summary"""
        print("\n" + "=" * 80)
        print("üìä PREPROCESSING SUMMARY")
        print("=" * 80)
        
        print(f"\nüìÅ CORPUS LOCATION:")
        print(f"   {os.path.abspath(self.corpus_folder)}")
        
        print(f"\nüìä BASIC STATISTICS:")
        print(f"   Documents Processed: {len(self.documents):,}")
        print(f"   Total Sentences: {self.stats['total_sentences']:,}")
        print(f"   Total Words (Raw): {self.stats['total_words_raw']:,}")
        print(f"   Total Words (Cleaned): {self.stats['total_words_cleaned']:,}")
        print(f"   Vocabulary Size: {self.stats['vocabulary_size']:,}")
        print(f"   Processing Time: {self.stats['processing_time']:.2f} seconds")
        
        print(f"\nüìà AVERAGE PER DOCUMENT:")
        avg_words = self.stats['total_words_cleaned'] / len(self.documents) if self.documents else 0
        avg_sents = self.stats['total_sentences'] / len(self.documents) if self.documents else 0
        print(f"   Words: {avg_words:,.0f}")
        print(f"   Sentences: {avg_sents:,.0f}")
        
        print(f"\nüî§ TOP 10 MOST FREQUENT WORDS:")
        for i, (word, freq) in enumerate(self.stats['most_common_words'][:10], 1):
            percentage = (freq / self.stats['total_words_cleaned']) * 100
            print(f"   {i:2d}. {word:20s} {freq:8,d} ({percentage:.2f}%)")
        
        print(f"\nüìÇ CREATED FOLDERS:")
        folders = [
            "cleaned_docs/ - Individual cleaned documents",
            "tokens/ - Tokenized versions",
            "statistics/ - Statistical reports",
            "visualizations/ - Charts and graphs"
        ]
        for folder in folders:
            print(f"   ‚Ä¢ {folder}")
        
        print(f"\nüìÑ CREATED FILES:")
        files = [
            "complete_corpus.txt - All documents combined",
            "vocabulary.txt - Complete vocabulary list",
            "word_frequencies.json - Word frequency data",
            "document_tokens.json - Document-token mapping",
            "corpus_statistics.json - Complete statistics"
        ]
        for file in files:
            print(f"   ‚Ä¢ {file}")
    
    def get_corpus_info(self):
        """Return corpus information for further processing"""
        return {
            'documents': self.documents,
            'vocabulary': list(self.vocabulary),
            'word_frequencies': self.word_frequencies,
            'tokens_by_doc': self.tokens_by_doc,
            'stats': self.stats
        }

def main():
    """Run the full preprocessing pipeline from the command line / notebook.

    Resolves the input folder (hard-coded path first, then a few fallback
    locations, then an interactive prompt), checks that it contains ``.txt``
    files, runs ``SupremeCourtCorpusPreprocessor.create_corpus`` and prints
    suggested next steps.

    Returns:
        The dict from ``get_corpus_info()`` on success; ``None`` when the
        folder cannot be resolved, holds no text files, cannot be read, or
        preprocessing fails.
    """
    print("=" * 80)
    print("‚öñÔ∏è  SUPREME COURT JUDGEMENTS - CORPUS PREPROCESSING")
    print("=" * 80)

    # Set text folder path
    txt_folder = r"C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_court_judgements_txt"

    # Verify the folder exists
    if not os.path.exists(txt_folder):
        print(f"\n‚ùå Error: Folder not found: {txt_folder}")

        # Suggest alternatives
        current_dir = os.getcwd()
        print(f"\nCurrent directory: {current_dir}")
        print("Looking for text files in:")

        # Check common locations: working directory, its parent, and the
        # current user's Desktop/AI Project folder
        possible_locations = [
            os.path.join(current_dir, "supreme_court_judgements_txt"),
            os.path.join(os.path.dirname(current_dir), "supreme_court_judgements_txt"),
            os.path.join(os.path.expanduser('~'), "Desktop", "AI Project", "supreme_court_judgements_txt")
        ]

        # The first existing location wins
        found = False
        for loc in possible_locations:
            if os.path.exists(loc):
                print(f"‚úÖ Found: {loc}")
                txt_folder = loc
                found = True
                break

        # Last resort: ask the user; an empty answer aborts
        if not found:
            alt_path = input("\nEnter correct folder path: ").strip()
            if alt_path:
                txt_folder = alt_path
            else:
                return

    # First verify the folder
    print(f"\nüîç Verifying folder: {txt_folder}")

    # Re-checked because the path may have come from the prompt above
    if not os.path.exists(txt_folder):
        print(f"‚ùå Folder does not exist!")
        return

    # Count files
    try:
        files = [f for f in os.listdir(txt_folder) if os.path.isfile(os.path.join(txt_folder, f))]
        txt_files = [f for f in files if f.lower().endswith('.txt')]

        print(f"üìÇ Total files in folder: {len(files)}")
        print(f"üìù Text files found: {len(txt_files)}")

        if len(txt_files) == 0:
            print(f"\n‚ùå No text files found!")
            print(f"\nFirst 10 files in folder:")
            for f in files[:10]:
                print(f"  ‚Ä¢ {f}")
            return

    except Exception as e:
        print(f"‚ùå Error reading folder: {e}")
        return

    # Create and run preprocessor
    print(f"\nüöÄ Starting corpus preprocessing...")

    preprocessor = SupremeCourtCorpusPreprocessor(txt_folder)

    # Start preprocessing
    success = preprocessor.create_corpus()

    if success:
        print("\n" + "=" * 80)
        print("üéâ CORPUS PREPROCESSING COMPLETED!")
        print("=" * 80)

        # Get corpus info for further processing
        corpus_info = preprocessor.get_corpus_info()

        print(f"\nüìÇ Your corpus is ready at:")
        print(f"   {os.path.abspath(preprocessor.corpus_folder)}")

        print(f"\nüìù Next steps you can take:")
        print(f"   1. Train word embeddings (Word2Vec, GloVe)")
        print(f"   2. Create TF-IDF matrix")
        print(f"   3. Train topic models (LDA, NMF)")
        print(f"   4. Build a search engine")
        print(f"   5. Train a text classifier")

        return corpus_info
    else:
        print("\n‚ùå Preprocessing failed")
        return None

# Simple test function
def test_preprocessing():
    """Smoke-test the pipeline on the first text file of the input folder.

    Lists up to five ``.txt`` files from the hard-coded folder, processes the
    first of them with ``process_single_document`` and prints the outcome.
    Returns ``None`` in every case.
    """
    txt_folder = r"C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_court_judgements_txt"

    if not os.path.exists(txt_folder):
        print(f"‚ùå Folder not found: {txt_folder}")
        return

    # Collect the text files, then keep only the first five
    sample_names = []
    for entry in os.listdir(txt_folder):
        is_regular_file = os.path.isfile(os.path.join(txt_folder, entry))
        if is_regular_file and entry.lower().endswith('.txt'):
            sample_names.append(entry)

    if not sample_names:
        print("‚ùå No text files found")
        return

    sample_names = sample_names[:5]

    print(f"Testing preprocessing on {len(sample_names)} files:")
    for name in sample_names:
        print(f"  ‚Ä¢ {name}")

    # Create preprocessor
    preprocessor = SupremeCourtCorpusPreprocessor(txt_folder)

    # Only the first sample is actually processed
    first_name = sample_names[0]
    print(f"\nüìÑ Testing file: {first_name}")

    outcome = preprocessor.process_single_document(os.path.join(txt_folder, first_name))

    if not outcome:
        print(f"‚ùå Failed to process")
        return

    print(f"‚úÖ Successfully processed")
    print(f"   Original length: {outcome['stats']['raw_text_length']:,} chars")
    print(f"   Cleaned tokens: {len(outcome['clean_tokens']):,}")
    print(f"   Sample tokens: {', '.join(outcome['clean_tokens'][:20])}...")

if __name__ == "__main__":
    # Script entry point. The result of main() is kept in a module-level name
    # so it stays available afterwards (e.g. in later notebook cells).
    # You can run either:
    # 1. Full preprocessing
    corpus_result = main()

    # 2. Or test on a small sample first
    # test_preprocessing()

⚖️  SUPREME COURT JUDGEMENTS - CORPUS PREPROCESSING

🔍 Verifying folder: C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_court_judgements_txt
📂 Total files in folder: 1475
📝 Text files found: 1474

🚀 Starting corpus preprocessing...
📁 Created: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus
📁 Created: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\cleaned_docs
📁 Created: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\tokens
📁 Created: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\statistics
📁 Created: C:\Users\Armaghan Rafique\Desktop\AI Project\cleaned_corpus\visualizations
✅ punkt already available
✅ stopwords already available
📥 Downloading wordnet...
✅ wordnet downloaded
📥 Downloading averaged_perceptron_tagger...
✅ averaged_perceptron_tagger downloaded
⚖️  SUPREME COURT JUDGEMENTS CORPUS PREPROCESSING
🔍 Scanning folder: C:\Users\Armaghan Rafique\Desktop\AI Project\supreme_c