In [7]:
print("Setting up enhanced language detection...")

# Import all available libraries
import sys

# Try importing multiple libraries
try:
    import pycld2 as cld2
    cld2_available = True
    print("✓ CLD2 loaded")
except:
    cld2_available = False
    print("✗ CLD2 not available")

try:
    from langdetect import detect, detect_langs
    from langdetect.lang_detect_exception import LangDetectException
    langdetect_available = True
    print("✓ langdetect loaded")
except:
    langdetect_available = False
    print("✗ langdetect not available")

try:
    import langid
    langid_available = True
    print("✓ langid loaded")
except:
    langid_available = False
    print("✗ langid not available")

print(f"\nActive detectors: {sum([cld2_available, langdetect_available, langid_available])}/3")
print("="*80 + "\n")    


Setting up enhanced language detection...
✓ CLD2 loaded
✓ langdetect loaded
✓ langid loaded

Active detectors: 3/3



In [8]:
def detect_language_enhanced(text, min_confidence=70):
    
    results = []
    
    # Method 1: CLD2 (Best for Unicode scripts like Sinhala, Tamil)
    if cld2_available:
        try:
            isReliable, textBytesFound, details = cld2.detect(text)
            if details and len(details) > 0:
                results.append({
                    'library': 'cld2',
                    'language': details[0][0],
                    'code': details[0][1],
                    'confidence': details[0][2],
                    'reliable': isReliable,
                    'weight': 3  # Higher weight for CLD2
                })
        except:
            pass
    
    # Method 2: langdetect (Good for European languages like Spanish)
    if langdetect_available:
        try:
            lang_code = detect(text)
            langs = detect_langs(text)
            confidence = langs[0].prob * 100 if langs else 0
            
            results.append({
                'library': 'langdetect',
                'language': get_language_name(lang_code),
                'code': lang_code,
                'confidence': confidence,
                'reliable': confidence > 90,
                'weight': 2
            })
        except LangDetectException:
            pass
        except:
            pass
    
    # Method 3: langid (Fast and reliable)
    if langid_available:
        try:
            lang_code, confidence_raw = langid.classify(text)
            confidence = confidence_raw * 100 if confidence_raw > 0 else confidence_raw
            
            results.append({
                'library': 'langid',
                'language': get_language_name(lang_code),
                'code': lang_code,
                'confidence': abs(confidence),
                'reliable': abs(confidence) > 0,
                'weight': 2
            })
        except:
            pass
    
    # If no results, return Unknown
    if not results:
        return {
            'language': 'Unknown',
            'code': 'unknown',
            'confidence': 0,
            'reliable': False,
            'method': 'none'
        }
    
    # Voting system: Prioritize results with higher confidence and weight
    # Sort by: reliability, confidence, and weight
    results.sort(key=lambda x: (x['reliable'], x['confidence'] * x['weight']), reverse=True)
    
    best_result = results[0]
    
    # Check if we have consensus among libraries
    if len(results) > 1:
        # If multiple libraries agree on the language, increase confidence
        languages_detected = [r['language'].upper() for r in results]
        most_common = max(set(languages_detected), key=languages_detected.count)
        
        if languages_detected.count(most_common) > 1:
            # Multiple libraries agree - boost confidence
            for r in results:
                if r['language'].upper() == most_common:
                    best_result = r
                    best_result['confidence'] = min(99, best_result['confidence'] * 1.1)
                    best_result['method'] = 'consensus'
                    break
    
    best_result['method'] = best_result.get('method', best_result['library'])
    return best_result

def get_language_name(code):
    """Convert language code to full name"""
    language_names = {
        'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
        'ca': 'Catalan', 'cs': 'Czech', 'cy': 'Welsh', 'da': 'Danish',
        'de': 'German', 'el': 'Greek', 'en': 'English', 'es': 'Spanish',
        'et': 'Estonian', 'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French',
        'gu': 'Gujarati', 'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian',
        'hu': 'Hungarian', 'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese',
        'kn': 'Kannada', 'ko': 'Korean', 'lt': 'Lithuanian', 'lv': 'Latvian',
        'mk': 'Macedonian', 'ml': 'Malayalam', 'mr': 'Marathi', 'ne': 'Nepali',
        'nl': 'Dutch', 'no': 'Norwegian', 'pa': 'Punjabi', 'pl': 'Polish',
        'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'sk': 'Slovak',
        'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian', 'sv': 'Swedish',
        'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu', 'th': 'Thai',
        'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian', 'ur': 'Urdu',
        'vi': 'Vietnamese', 'zh': 'Chinese', 'zh-cn': 'Chinese', 'zh-tw': 'Chinese',
        'si': 'Sinhala', 'SINHALESE': 'Sinhala', 'TAMIL': 'Tamil', 
        'SPANISH': 'Spanish', 'ENGLISH': 'English', 'FRENCH': 'French'
    }
    
    # Handle uppercase names from CLD2
    if code.upper() in language_names.values():
        return code.title()
    
    return language_names.get(code.lower(), code.upper())

print("✓ Enhanced detection system ready!")

✓ Enhanced detection system ready!


In [9]:
class EnhancedReviewAnalyzer:
    """Improved Review Analyzer with better language detection"""
    
    def __init__(self):
        self.language_sentiment = {}
        self.detection_details = []
    
    def analyze_reviews(self, reviews):
        """Analyze reviews with enhanced language detection"""
        all_results = []
        
        for review in reviews:
            # Use enhanced detection
            detection = detect_language_enhanced(review['text'])
            language = detection['language']
            
            # Initialize language category if needed
            if language not in self.language_sentiment:
                self.language_sentiment[language] = {
                    'reviews': [],
                    'avg_rating': 0,
                    'count': 0,
                    'total_confidence': 0
                }
            
            # Add review to language category
            self.language_sentiment[language]['reviews'].append(review)
            self.language_sentiment[language]['count'] += 1
            self.language_sentiment[language]['total_confidence'] += detection['confidence']
            
            # Store detailed result
            result = {
                'text': review['text'][:50] + '...' if len(review['text']) > 50 else review['text'],
                'full_text': review['text'],
                'language': language,
                'rating': review.get('rating', 0),
                'confidence': detection['confidence'],
                'detection_method': detection.get('method', 'unknown'),
                'reliable': detection.get('reliable', False)
            }
            
            all_results.append(result)
            self.detection_details.append(result)
        
        # Calculate average ratings and confidence
        for lang in self.language_sentiment:
            reviews = self.language_sentiment[lang]['reviews']
            count = self.language_sentiment[lang]['count']
            if reviews and count > 0:
                avg_rating = sum(r.get('rating', 0) for r in reviews) / len(reviews)
                self.language_sentiment[lang]['avg_rating'] = avg_rating
                
                avg_confidence = self.language_sentiment[lang]['total_confidence'] / count
                self.language_sentiment[lang]['avg_confidence'] = avg_confidence
        
        return self.language_sentiment, all_results
    
    def print_detailed_report(self):
        """Print comprehensive analysis report"""
        total_reviews = sum(data['count'] for data in self.language_sentiment.values())
        
        print("\n" + "="*80)
        print("ENHANCED PRODUCT REVIEW ANALYSIS")
        print("="*80)
        print(f"Total reviews analyzed: {total_reviews}")
        print(f"Languages detected: {len(self.language_sentiment)}")
        print("="*80 + "\n")
        
        # Sort languages by number of reviews
        sorted_langs = sorted(
            self.language_sentiment.items(), 
            key=lambda x: x[1]['count'], 
            reverse=True
        )
        
        for lang, data in sorted_langs:
            percentage = (data['count'] / total_reviews) * 100
            
            print(f"{lang.upper()}")
            print(f"  📊 Reviews: {data['count']} ({percentage:.1f}% of total)")
            print(f"  ⭐ Average rating: {data['avg_rating']:.2f}/5.0")
            print(f"  🎯 Detection confidence: {data['avg_confidence']:.1f}%")
            print(f"  📝 Sample reviews:")
            
            # Show 2 sample reviews
            for i, review in enumerate(data['reviews'][:2], 1):
                text_preview = review['text'][:60] + '...' if len(review['text']) > 60 else review['text']
                print(f"     {i}. \"{text_preview}\" - Rating: {review.get('rating', 'N/A')}/5")
            
            print()
        
        return sorted_langs
    
    def get_low_confidence_reviews(self, threshold=80):
        """Identify reviews with low detection confidence"""
        low_conf = [
            detail for detail in self.detection_details 
            if detail['confidence'] < threshold
        ]
        
        if low_conf:
            print(f"\n⚠️  Reviews with confidence below {threshold}%:")
            print("-"*80)
            for detail in low_conf:
                print(f"Text: {detail['text']}")
                print(f"  Detected as: {detail['language']} ({detail['confidence']:.1f}%)")
                print(f"  Method: {detail['detection_method']}")
                print()
        else:
            print(f"\n✓ All reviews detected with confidence above {threshold}%")
        
        return low_conf
    
    def compare_detection_methods(self, text):
        """Show how each library detects a specific text"""
        print(f"\nDetection Method Comparison")
        print(f"Text: \"{text}\"")
        print("-"*80)
        
        # CLD2
        if cld2_available:
            try:
                isReliable, textBytesFound, details = cld2.detect(text)
                if details:
                    print(f"CLD2: {details[0][0]} (confidence: {details[0][2]}%, reliable: {isReliable})")
            except Exception as e:
                print(f"CLD2: Error - {str(e)}")
        
        # langdetect
        if langdetect_available:
            try:
                lang = detect(text)
                langs = detect_langs(text)
                print(f"langdetect: {get_language_name(lang)} (confidence: {langs[0].prob*100:.1f}%)")
            except Exception as e:
                print(f"langdetect: Error - {str(e)}")
        
        # langid
        if langid_available:
            try:
                lang, conf = langid.classify(text)
                print(f"langid: {get_language_name(lang)} (confidence: {abs(conf*100):.1f}%)")
            except Exception as e:
                print(f"langid: Error - {str(e)}")
        
        # Final enhanced result
        final = detect_language_enhanced(text)
        print(f"\n→ FINAL RESULT: {final['language']} (confidence: {final['confidence']:.1f}%)")
        print(f"  Method used: {final.get('method', 'unknown')}")
        print("-"*80)

print("✓ Enhanced Review Analyzer ready!")

✓ Enhanced Review Analyzer ready!


In [10]:
import pandas as pd
df = pd.read_csv('C:\\Users\\SANKALPA\\Downloads\\train.csv')

In [11]:
import time
import pycld2 as cld2
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp
from functools import lru_cache
import pandas as pd

In [12]:
def split_into_batches(data, batch_size):
    """Split data into batches for parallel processing"""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [13]:
def detect_language_fast(text):
    """
    Fast detection using only CLD2 (fastest and most accurate)
    Uses caching to avoid re-processing identical reviews
    """
    if not text or len(text.strip()) < 3:
        return {
            'language': 'Unknown',
            'code': 'unknown',
            'confidence': 0,
            'reliable': False
        }
    
    try:
        # CLD2 is the fastest and works well for most languages
        isReliable, textBytesFound, details = cld2.detect(text)
        if details and len(details) > 0:
            return {
                'language': details[0][0],
                'code': details[0][1],
                'confidence': details[0][2],
                'reliable': isReliable
            }
    except:
        pass
    
    # Fallback for very short or problematic text
    return {
        'language': 'Unknown',
        'code': 'unknown',
        'confidence': 0,
        'reliable': False
    }

In [14]:
def process_review_batch(reviews_batch):
    """Process a batch of reviews (for parallel processing)"""
    results = []
    for review in reviews_batch:
        text = review.get('text', '')
        rating = review.get('rating', 0)
        
        detection = detect_language_fast(text)
        
        results.append({
            'text': text[:50] + '...' if len(text) > 50 else text,
            'full_text': text,
            'language': detection['language'],
            'code': detection['code'],
            'rating': rating,
            'confidence': detection['confidence'],
            'reliable': detection['reliable']
        })
    
    return results

In [15]:
class FastReviewAnalyzer:
    """Optimized analyzer for large datasets"""
    
    def __init__(self, use_parallel=True, n_workers=None):
        self.language_sentiment = {}
        self.detection_details = []
        self.use_parallel = use_parallel
        self.n_workers = n_workers or max(1, mp.cpu_count() - 1)
        self.processing_time = 0
    
    def analyze_reviews(self, reviews, show_progress=True):
        """
        Fast analysis with optional parallel processing
        
        Args:
            reviews: List of review dicts with 'text' and optional 'rating'
            show_progress: Show progress updates
        """
        start_time = time.time()
        total = len(reviews)
        
        print(f"Processing {total:,} reviews...")
        if self.use_parallel and total > 100:
            print(f"Using parallel processing with {self.n_workers} workers")
        print()
        
        # Process reviews
        if self.use_parallel and total > 100:
            # Parallel processing for large datasets
            all_results = self._process_parallel(reviews, show_progress)
        else:
            # Sequential processing for small datasets
            all_results = self._process_sequential(reviews, show_progress)
        
        # Aggregate results
        self._aggregate_results(all_results)
        
        self.processing_time = time.time() - start_time
        
        print(f"\n✓ Processed {total:,} reviews in {self.processing_time:.2f} seconds")
        print(f"  Speed: {total/self.processing_time:.0f} reviews/second")
        
        return self.language_sentiment, all_results
    
    def _process_sequential(self, reviews, show_progress):
        """Sequential processing (for small datasets)"""
        results = []
        total = len(reviews)
        
        for i, review in enumerate(reviews):
            text = review.get('text', '')
            rating = review.get('rating', 0)
            
            detection = detect_language_fast(text)
            
            result = {
                'text': text[:50] + '...' if len(text) > 50 else text,
                'full_text': text,
                'language': detection['language'],
                'code': detection['code'],
                'rating': rating,
                'confidence': detection['confidence'],
                'reliable': detection['reliable']
            }
            
            results.append(result)
            
            # Progress update
            if show_progress and (i + 1) % 1000 == 0:
                progress = (i + 1) / total * 100
                print(f"Progress: {i+1:,}/{total:,} ({progress:.1f}%)")
        
        return results
    
    def _process_parallel(self, reviews, show_progress):
        """Parallel processing (for large datasets)"""
        # Determine optimal batch size
        batch_size = max(100, len(reviews) // (self.n_workers * 4))
        batches = list(split_into_batches(reviews, batch_size))
        
        all_results = []
        processed = 0
        total = len(reviews)
        
        # Use ThreadPoolExecutor (faster for I/O bound tasks)
        with ThreadPoolExecutor(max_workers=self.n_workers) as executor:
            futures = [executor.submit(process_review_batch, batch) for batch in batches]
            
            for future in futures:
                batch_results = future.result()
                all_results.extend(batch_results)
                
                processed += len(batch_results)
                if show_progress and processed % 5000 < batch_size:
                    progress = processed / total * 100
                    print(f"Progress: {processed:,}/{total:,} ({progress:.1f}%)")
        
        return all_results
    
    def _aggregate_results(self, all_results):
        """Aggregate results into language categories"""
        self.detection_details = all_results
        
        for result in all_results:
            language = result['language']
            
            if language not in self.language_sentiment:
                self.language_sentiment[language] = {
                    'reviews': [],
                    'count': 0,
                    'total_rating': 0,
                    'avg_rating': 0,
                    'total_confidence': 0,
                    'avg_confidence': 0
                }
            
            self.language_sentiment[language]['reviews'].append(result)
            self.language_sentiment[language]['count'] += 1
            self.language_sentiment[language]['total_rating'] += result['rating']
            self.language_sentiment[language]['total_confidence'] += result['confidence']
        
        # Calculate averages
        for lang in self.language_sentiment:
            data = self.language_sentiment[lang]
            count = data['count']
            if count > 0:
                data['avg_rating'] = data['total_rating'] / count
                data['avg_confidence'] = data['total_confidence'] / count
    
    def print_summary(self, top_n=10):
        """Print quick summary"""
        total = sum(data['count'] for data in self.language_sentiment.values())
        
        print("\n" + "="*80)
        print("REVIEW ANALYSIS SUMMARY")
        print("="*80)
        print(f"Total reviews: {total:,}")
        print(f"Languages detected: {len(self.language_sentiment)}")
        print(f"Processing time: {self.processing_time:.2f} seconds")
        print(f"Speed: {total/self.processing_time:.0f} reviews/second")
        print("="*80 + "\n")
        
        # Sort by count
        sorted_langs = sorted(
            self.language_sentiment.items(),
            key=lambda x: x[1]['count'],
            reverse=True
        )
        
        print(f"Top {min(top_n, len(sorted_langs))} Languages:")
        print("-"*80)
        
        for i, (lang, data) in enumerate(sorted_langs[:top_n], 1):
            percentage = (data['count'] / total) * 100
            print(f"{i}. {lang}")
            print(f"   Reviews: {data['count']:,} ({percentage:.1f}%)")
            print(f"   Avg Rating: {data['avg_rating']:.2f}/5.0")
            print(f"   Avg Confidence: {data['avg_confidence']:.1f}%")
            print()
    
    def get_low_confidence_reviews(self, threshold=80, max_display=20):
        """Identify low confidence reviews (optimized)"""
        low_conf = [
            detail for detail in self.detection_details
            if detail['confidence'] < threshold
        ]
        
        print(f"\n⚠️  Reviews with confidence below {threshold}%: {len(low_conf):,}")
        
        if low_conf and len(low_conf) <= max_display:
            print("-"*80)
            for i, detail in enumerate(low_conf[:max_display], 1):
                print(f"{i}. {detail['text']}")
                print(f"   Detected: {detail['language']} ({detail['confidence']:.1f}%)")
                print()
        elif len(low_conf) > max_display:
            print(f"Showing first {max_display}:")
            print("-"*80)
            for i, detail in enumerate(low_conf[:max_display], 1):
                print(f"{i}. {detail['text']}")
                print(f"   Detected: {detail['language']} ({detail['confidence']:.1f}%)")
                print()
            print(f"... and {len(low_conf) - max_display:,} more")
        else:
            print(f"✓ All reviews detected with confidence above {threshold}%")
        
        return low_conf
    
    def export_to_dataframe(self):
        """Convert results to pandas DataFrame"""
        data = []
        for detail in self.detection_details:
            data.append({
                'review_text': detail['full_text'],
                'language': detail['language'],
                'language_code': detail['code'],
                'rating': detail['rating'],
                'confidence': detail['confidence'],
                'reliable': detail['reliable']
            })
        
        return pd.DataFrame(data)

print("✓ Fast Review Analyzer ready!")

✓ Fast Review Analyzer ready!


In [16]:
analyzer = FastReviewAnalyzer(use_parallel=True)
product_reviews = [{'text': text} for text in df['review_body']]
sentiment, results = analyzer.analyze_reviews(product_reviews)
analyzer.print_summary()
analyzer.get_low_confidence_reviews(threshold=80)

Processing 1,200,000 reviews...
Using parallel processing with 3 workers

Progress: 100,000/1,200,000 (8.3%)
Progress: 200,000/1,200,000 (16.7%)
Progress: 300,000/1,200,000 (25.0%)
Progress: 400,000/1,200,000 (33.3%)
Progress: 500,000/1,200,000 (41.7%)
Progress: 600,000/1,200,000 (50.0%)
Progress: 700,000/1,200,000 (58.3%)
Progress: 800,000/1,200,000 (66.7%)
Progress: 900,000/1,200,000 (75.0%)
Progress: 1,000,000/1,200,000 (83.3%)
Progress: 1,100,000/1,200,000 (91.7%)
Progress: 1,200,000/1,200,000 (100.0%)

✓ Processed 1,200,000 reviews in 11.93 seconds
  Speed: 100624 reviews/second

REVIEW ANALYSIS SUMMARY
Total reviews: 1,200,000
Languages detected: 52
Processing time: 11.93 seconds
Speed: 100624 reviews/second

Top 10 Languages:
--------------------------------------------------------------------------------
1. ENGLISH
   Reviews: 201,861 (16.8%)
   Avg Rating: 0.00/5.0
   Avg Confidence: 98.3%

2. Japanese
   Reviews: 200,034 (16.7%)
   Avg Rating: 0.00/5.0
   Avg Confidence: 98.1

[{'text': 'nach 2 monaten kaputt',
  'full_text': 'nach 2 monaten kaputt',
  'language': 'Unknown',
  'code': 'un',
  'rating': 0,
  'confidence': 0,
  'reliable': False},
 {'text': 'Funktioniert nicht mit Osmo pocket',
  'full_text': 'Funktioniert nicht mit Osmo pocket',
  'language': 'Unknown',
  'code': 'un',
  'rating': 0,
  'confidence': 0,
  'reliable': False},
 {'text': 'Windows 10 nach Clean Installation kein Lizenz! Su...',
  'full_text': 'Windows 10 nach Clean Installation kein Lizenz! Support für Treiber SEHR SCHLECHT!',
  'language': 'Unknown',
  'code': 'un',
  'rating': 0,
  'confidence': 0,
  'reliable': False},
 {'text': 'The beans smelled and tasted sour and spoiled, eve...',
  'full_text': 'The beans smelled and tasted sour and spoiled, even though the "use-by" date was in the future. Do not waste your money on them! Die Bohnen haben sauer und vergammelt geschmeckt und gerochen, obwohl der MHD in Ordnung war. Ich kann sie nicht empfehlen!',
  'language': 'ENGLISH',
  

In [20]:
print("\n" + "="*80)
print("DETECTION STATISTICS & INSIGHTS")
print("="*80 + "\n")

all_detections = analyzer.detection_details + analyzer.detection_details

# Count by detection method
method_counts = {}
for detail in all_detections:
    method = detail.get('detection_method', 'unknown')
    method_counts[method] = method_counts.get(method, 0) + 1

print("Detection Methods Used:")
for method, count in sorted(method_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / len(all_detections)) * 100
    print(f"  {method}: {count} times ({percentage:.1f}%)")

# Average confidence by language
print("\nAverage Confidence by Language:")
lang_confidence = {}
for detail in all_detections:
    lang = detail['language']
    if lang not in lang_confidence:
        lang_confidence[lang] = []
    lang_confidence[lang].append(detail['confidence'])

for lang, confidences in sorted(lang_confidence.items()):
    avg_conf = sum(confidences) / len(confidences)
    print(f"  {lang}: {avg_conf:.1f}% (from {len(confidences)} review(s))")

print("\n" + "="*80)
print("="*80)


DETECTION STATISTICS & INSIGHTS

Detection Methods Used:
  unknown: 2400000 times (100.0%)

Average Confidence by Language:
  CATALAN: 96.3% (from 128 review(s))
  CORSICAN: 96.5% (from 4 review(s))
  CZECH: 96.0% (from 2 review(s))
  Chinese: 97.7% (from 389484 review(s))
  ChineseT: 91.5% (from 1418 review(s))
  DANISH: 95.9% (from 186 review(s))
  DUTCH: 96.3% (from 30 review(s))
  ENGLISH: 98.3% (from 403722 review(s))
  ESPERANTO: 96.0% (from 2 review(s))
  ESTONIAN: 95.0% (from 2 review(s))
  FINNISH: 96.0% (from 2 review(s))
  FRENCH: 98.4% (from 391052 review(s))
  FRISIAN: 97.0% (from 2 review(s))
  GALICIAN: 97.2% (from 314 review(s))
  GERMAN: 98.6% (from 398404 review(s))
  GREEK: 16.3% (from 6 review(s))
  GUARANI: 96.0% (from 6 review(s))
  HAITIAN_CREOLE: 95.5% (from 4 review(s))
  INDONESIAN: 96.4% (from 418 review(s))
  INTERLINGUA: 96.2% (from 20 review(s))
  INTERLINGUE: 96.3% (from 6 review(s))
  ITALIAN: 96.8% (from 68 review(s))
  Japanese: 98.1% (from 400068 rev

In [24]:
def export_analysis_to_csv(analyzer, filename='review_analysis.csv', allow_overwrite=False):
    """
    Export detailed analysis to CSV with enhanced features and error handling
    
    Args:
        analyzer: FastReviewAnalyzer instance
        filename: Target filename (supports .csv or .xlsx)
        allow_overwrite: If False, will not overwrite existing files
    """
    import os
    from datetime import datetime
    
    # Validate analyzer
    if not hasattr(analyzer, 'detection_details') or not analyzer.detection_details:
        raise ValueError("No analysis results to export!")
    
    # Validate and adjust filename
    if not filename.lower().endswith(('.csv', '.xlsx')):
        filename += '.csv'
    
    # Check if file exists
    if os.path.exists(filename) and not allow_overwrite:
        base, ext = os.path.splitext(filename)
        filename = f"{base}_{datetime.now().strftime('%Y%m%d_%H%M%S')}{ext}"
        print(f"File already exists. Saving as: {filename}")
    
    try:
        total_rows = len(analyzer.detection_details)
        print(f"\nPreparing to export {total_rows:,} reviews...")
        
        # Prepare data with validation
        export_data = []
        for detail in analyzer.detection_details:
            if not isinstance(detail, dict):
                continue
                
            export_data.append({
                'Review Text': detail.get('full_text', ''),
                'Language': detail.get('language', 'Unknown'),
                'Language Code': detail.get('code', 'unknown'),
                'Rating': float(detail.get('rating', 0)),
                'Confidence %': round(float(detail.get('confidence', 0)), 1),
                'Detection Method': detail.get('detection_method', 'unknown'),
                'Reliable': 'Yes' if detail.get('reliable', False) else 'No',
                'Text Length': len(str(detail.get('full_text', ''))),
                'Export Date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            })
        
        # Export based on file format
        if filename.lower().endswith('.xlsx'):
            import pandas as pd
            df = pd.DataFrame(export_data)
            df.to_excel(filename, index=False)
        else:
            import csv
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=export_data[0].keys())
                writer.writeheader()
                
                for i, row in enumerate(export_data, 1):
                    writer.writerow(row)
                    if i % 5000 == 0:
                        print(f"Progress: {i:,}/{total_rows:,} ({i/total_rows*100:.1f}%)")
        
        print(f"\n✓ Successfully exported {total_rows:,} reviews to {filename}")
        print(f"  File size: {os.path.getsize(filename):,} bytes")
        
    except Exception as e:
        print(f"\n❌ Error during export: {str(e)}")
        raise

# Example usage
  # Export as Excel
# OR
export_analysis_to_csv(analyzer, 'review_analysis.csv')   # Export as CSV

File already exists. Saving as: review_analysis_20251005_140015.csv

Preparing to export 1,200,000 reviews...
Progress: 5,000/1,200,000 (0.4%)
Progress: 10,000/1,200,000 (0.8%)
Progress: 15,000/1,200,000 (1.2%)
Progress: 20,000/1,200,000 (1.7%)
Progress: 5,000/1,200,000 (0.4%)
Progress: 10,000/1,200,000 (0.8%)
Progress: 15,000/1,200,000 (1.2%)
Progress: 20,000/1,200,000 (1.7%)
Progress: 25,000/1,200,000 (2.1%)
Progress: 30,000/1,200,000 (2.5%)
Progress: 35,000/1,200,000 (2.9%)
Progress: 25,000/1,200,000 (2.1%)
Progress: 30,000/1,200,000 (2.5%)
Progress: 35,000/1,200,000 (2.9%)
Progress: 40,000/1,200,000 (3.3%)
Progress: 45,000/1,200,000 (3.8%)
Progress: 50,000/1,200,000 (4.2%)
Progress: 55,000/1,200,000 (4.6%)
Progress: 40,000/1,200,000 (3.3%)
Progress: 45,000/1,200,000 (3.8%)
Progress: 50,000/1,200,000 (4.2%)
Progress: 55,000/1,200,000 (4.6%)
Progress: 60,000/1,200,000 (5.0%)
Progress: 65,000/1,200,000 (5.4%)
Progress: 70,000/1,200,000 (5.8%)
Progress: 75,000/1,200,000 (6.2%)
Progress