In [1]:
import spacy
import bz2
import re
from collections import Counter, defaultdict
import pandas as pd

# Install required packages in Colab
!pip install spacy pandas
!python -m spacy download en_core_web_sm

# Import required libraries
import spacy
import bz2
import re
from collections import Counter, defaultdict
import pandas as pd
from google.colab import files
import io

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def upload_and_load_data(max_samples=1000):
    """
    Upload and load Amazon review data from bz2 compressed file in Colab
    Expected format: __label__1 or __label__2 followed by review text
    """
    print("Please upload your .bz2 files (train.ft.txt.bz2 and test.ft.txt.bz2)")
    uploaded = files.upload()

    datasets = {}

    for filename in uploaded.keys():
        print(f"\nProcessing {filename}...")
        reviews = []
        labels = []

        try:
            # Read the uploaded file
            file_content = uploaded[filename]

            # Decompress and read
            with bz2.open(io.BytesIO(file_content), 'rt', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i >= max_samples:  # Limit for demo purposes
                        break

                    line = line.strip()
                    if line.startswith('__label__'):
                        # Extract label and review text
                        parts = line.split(' ', 1)
                        if len(parts) == 2:
                            label = parts[0].replace('__label__', '')
                            review_text = parts[1]

                            labels.append(int(label))
                            reviews.append(review_text)

            datasets[filename] = (reviews, labels)
            print(f"Loaded {len(reviews)} reviews from {filename}")

        except Exception as e:
            print(f"Error loading {filename}: {e}")
            datasets[filename] = ([], [])

    return datasets

def load_amazon_data_colab(file_content, max_samples=1000):
    """
    Load Amazon review data from uploaded file content
    Expected format: __label__1 or __label__2 followed by review text
    """
    reviews = []
    labels = []

    try:
        # Decompress and read
        with bz2.open(io.BytesIO(file_content), 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_samples:  # Limit for demo purposes
                    break

                line = line.strip()
                if line.startswith('__label__'):
                    # Extract label and review text
                    parts = line.split(' ', 1)
                    if len(parts) == 2:
                        label = parts[0].replace('__label__', '')
                        review_text = parts[1]

                        labels.append(int(label))
                        reviews.append(review_text)

    except Exception as e:
        print(f"Error loading data: {e}")
        return [], []

    return reviews, labels

class RuleBasedSentimentAnalyzer:
    """Simple rule-based sentiment analyzer"""

    def __init__(self):
        # Positive and negative word lists
        self.positive_words = {
            'excellent', 'amazing', 'great', 'good', 'fantastic', 'wonderful',
            'perfect', 'love', 'awesome', 'brilliant', 'outstanding', 'superb',
            'impressed', 'satisfied', 'recommend', 'best', 'quality', 'happy',
            'pleased', 'smooth', 'fast', 'easy', 'comfortable', 'durable'
        }

        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'disappointed',
            'poor', 'cheap', 'broken', 'defective', 'useless', 'waste', 'annoying',
            'frustrating', 'slow', 'difficult', 'uncomfortable', 'flimsy', 'terrible',
            'pathetic', 'garbage', 'trash', 'regret', 'problem', 'issue', 'fail'
        }

        # Negation words that can flip sentiment
        self.negation_words = {'not', 'no', 'never', 'none', 'nothing', 'nowhere',
                              'neither', 'nobody', 'hardly', "don't", "doesn't",
                              "didn't", "won't", "wouldn't", "can't", "couldn't"}

    def analyze_sentiment(self, text):
        """Analyze sentiment of text using rule-based approach"""
        text_lower = text.lower()
        words = text_lower.split()

        positive_score = 0
        negative_score = 0

        # Check for negation context
        negated = False
        for i, word in enumerate(words):
            # Reset negation after 3 words
            if i > 0 and i % 3 == 0:
                negated = False

            if word in self.negation_words:
                negated = True
                continue

            # Clean word (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)

            if clean_word in self.positive_words:
                if negated:
                    negative_score += 1
                else:
                    positive_score += 1
            elif clean_word in self.negative_words:
                if negated:
                    positive_score += 1
                else:
                    negative_score += 1

        # Determine overall sentiment
        if positive_score > negative_score:
            return 'positive', positive_score - negative_score
        elif negative_score > positive_score:
            return 'negative', negative_score - positive_score
        else:
            return 'neutral', 0

def extract_entities_and_sentiment(reviews, labels=None):
    """Extract named entities and analyze sentiment from reviews"""
    sentiment_analyzer = RuleBasedSentimentAnalyzer()

    results = []
    entity_counter = Counter()
    brand_counter = Counter()
    product_counter = Counter()

    for i, review in enumerate(reviews):
        # Process with spaCy
        doc = nlp(review)

        # Extract entities
        entities = []
        brands = []
        products = []

        for ent in doc.ents:
            entity_info = {
                'text': ent.text,
                'label': ent.label_,
                'description': spacy.explain(ent.label_)
            }
            entities.append(entity_info)
            entity_counter[f"{ent.text} ({ent.label_})"] += 1

            # Categorize potential brands and products
            if ent.label_ in ['ORG', 'PRODUCT']:
                if ent.label_ == 'ORG':
                    brands.append(ent.text)
                    brand_counter[ent.text] += 1
                elif ent.label_ == 'PRODUCT':
                    products.append(ent.text)
                    product_counter[ent.text] += 1

        # Analyze sentiment
        sentiment, score = sentiment_analyzer.analyze_sentiment(review)

        # Store results
        result = {
            'review_id': i,
            'review_text': review[:200] + '...' if len(review) > 200 else review,
            'actual_label': labels[i] if labels else None,
            'entities': entities,
            'brands': brands,
            'products': products,
            'sentiment': sentiment,
            'sentiment_score': score
        }
        results.append(result)

    return results, entity_counter, brand_counter, product_counter

def display_results(results, entity_counter, brand_counter, product_counter, num_samples=5):
    """Display analysis results"""
    print("="*80)
    print("AMAZON REVIEWS ANALYSIS RESULTS")
    print("="*80)

    # Sample results
    print(f"\nSample Analysis Results (showing first {num_samples} reviews):")
    print("-" * 60)

    for i, result in enumerate(results[:num_samples]):
        print(f"\nReview {result['review_id'] + 1}:")
        print(f"Text: {result['review_text']}")
        print(f"Predicted Sentiment: {result['sentiment']} (score: {result['sentiment_score']})")
        if result['actual_label']:
            actual_sentiment = 'positive' if result['actual_label'] == 2 else 'negative'
            print(f"Actual Label: {actual_sentiment}")

        if result['entities']:
            print("Named Entities:")
            for ent in result['entities']:
                print(f"  - {ent['text']} ({ent['label']}: {ent['description']})")

        if result['brands']:
            print(f"Brands detected: {', '.join(result['brands'])}")

        if result['products']:
            print(f"Products detected: {', '.join(result['products'])}")

        print("-" * 40)

    # Summary statistics
    print(f"\nSUMMARY STATISTICS (Total reviews analyzed: {len(results)})")
    print("=" * 50)

    # Sentiment distribution
    sentiment_dist = Counter([r['sentiment'] for r in results])
    print("\nSentiment Distribution:")
    for sentiment, count in sentiment_dist.items():
        percentage = (count / len(results)) * 100
        print(f"  {sentiment.capitalize()}: {count} ({percentage:.1f}%)")

    # Top entities
    print(f"\nTop 10 Named Entities:")
    for entity, count in entity_counter.most_common(10):
        print(f"  {entity}: {count}")

    # Top brands
    if brand_counter:
        print(f"\nTop Brands Detected:")
        for brand, count in brand_counter.most_common(5):
            print(f"  {brand}: {count}")

    # Top products
    if product_counter:
        print(f"\nTop Products Detected:")
        for product, count in product_counter.most_common(5):
            print(f"  {product}: {count}")

def main_colab():
    """Main function to run the analysis in Google Colab"""
    print("🚀 Amazon Reviews Analysis with spaCy NER and Rule-based Sentiment")
    print("="*70)
    print("📁 Upload your .bz2 files when prompted")

    # Upload and load data
    datasets = upload_and_load_data(max_samples=500)  # Reduced for Colab performance

    all_results = {}

    for filename, (reviews, labels) in datasets.items():
        if not reviews:
            print(f"⚠️  No data loaded from {filename}")
            continue

        print(f"\n🔍 Analyzing {filename}...")
        print(f"📊 Processing {len(reviews)} reviews")

        # Extract entities and sentiment
        results, entities, brands, products = extract_entities_and_sentiment(reviews, labels)

        # Store results
        all_results[filename] = {
            'results': results,
            'entities': entities,
            'brands': brands,
            'products': products
        }

        # Display results
        print(f"\n📈 ANALYSIS RESULTS for {filename}")
        print("="*50)
        display_results(results, entities, brands, products)

        # Create and display DataFrame
        print(f"\n📋 Creating summary DataFrame for {filename}...")
        df = create_summary_dataframe(results)
        print(df.head())

        # Export option
        export_filename = filename.replace('.bz2', '_analysis_results.csv')
        export_results_to_csv(results, export_filename)

        print(f"\n💾 Results saved as {export_filename}")
        print("📥 You can download it from the Files panel in Colab")

    # Comparison if multiple datasets
    if len(all_results) > 1:
        print(f"\n🔍 DATASET COMPARISON")
        print("="*50)
        compare_datasets(all_results)

    return all_results

def create_summary_dataframe(results):
    """Create a pandas DataFrame summary of results"""
    data = []
    for result in results:
        row = {
            'review_id': result['review_id'],
            'sentiment': result['sentiment'],
            'sentiment_score': result['sentiment_score'],
            'actual_label': result['actual_label'],
            'num_entities': len(result['entities']),
            'num_brands': len(result['brands']),
            'num_products': len(result['products']),
            'review_length': len(result['review_text'])
        }
        data.append(row)

    return pd.DataFrame(data)

def compare_datasets(all_results):
    """Compare results across different datasets"""
    for filename, data in all_results.items():
        results = data['results']

        # Calculate metrics
        total_reviews = len(results)
        sentiment_dist = Counter([r['sentiment'] for r in results])
        avg_entities = sum(len(r['entities']) for r in results) / total_reviews

        print(f"\n📊 {filename}:")
        print(f"   Total reviews: {total_reviews}")
        print(f"   Positive: {sentiment_dist.get('positive', 0)} ({sentiment_dist.get('positive', 0)/total_reviews*100:.1f}%)")
        print(f"   Negative: {sentiment_dist.get('negative', 0)} ({sentiment_dist.get('negative', 0)/total_reviews*100:.1f}%)")
        print(f"   Neutral: {sentiment_dist.get('neutral', 0)} ({sentiment_dist.get('neutral', 0)/total_reviews*100:.1f}%)")
        print(f"   Avg entities per review: {avg_entities:.1f}")

# Quick start function for Colab
def quick_start():
    """Quick start function with smaller sample size for testing"""
    print("🚀 QUICK START - Amazon Reviews Analysis")
    print("="*50)
    print("This will process a smaller sample (100 reviews per file) for quick testing")

    datasets = upload_and_load_data(max_samples=100)

    for filename, (reviews, labels) in datasets.items():
        if reviews:
            print(f"\n🔍 Quick analysis of {filename} ({len(reviews)} reviews):")
            results, entities, brands, products = extract_entities_and_sentiment(reviews, labels)

            # Show just summary stats
            sentiment_dist = Counter([r['sentiment'] for r in results])
            print(f"   Sentiment: {dict(sentiment_dist)}")
            print(f"   Top entities: {dict(entities.most_common(3))}")
            print(f"   Brands found: {list(brands.keys())[:5]}")

    return datasets

if __name__ == "__main__":
    # For Google Colab - choose your preferred method:

    # Option 1: Full analysis (recommended)
    results = main_colab()

    # Option 2: Quick start for testing (uncomment to use instead)
    # results = quick_start()

    print("\n✅ Analysis complete! Check the results above and download CSV files from the Files panel.")

# Additional utility functions for further analysis

def export_results_to_csv(results, filename):
    """Export results to CSV for further analysis"""
    data = []
    for result in results:
        row = {
            'review_id': result['review_id'],
            'review_text': result['review_text'],
            'sentiment': result['sentiment'],
            'sentiment_score': result['sentiment_score'],
            'actual_label': result['actual_label'],
            'num_entities': len(result['entities']),
            'entities': '; '.join([f"{e['text']}({e['label']})" for e in result['entities']]),
            'brands': '; '.join(result['brands']),
            'products': '; '.join(result['products'])
        }
        data.append(row)

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Results exported to {filename}")

# Example usage for CSV export:
# export_results_to_csv(train_results, 'train_analysis_results.csv')
# export_results_to_csv(test_results, 'test_analysis_results.csv')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
🚀 Amazon Reviews Analysis with spaCy NER and Rule-based Sentiment
📁 Upload your .bz2 files when prompted
Please upload your .bz2 files (train.ft.txt.bz2 and test.ft.txt.bz2)


Saving test.ft.txt.bz2 to test.ft.txt.bz2
Saving train.ft.txt.bz2 to train.ft.txt.bz2

Processing test.ft.txt.bz2...
Loaded 500 reviews from test.ft.txt.bz2

Processing train.ft.txt.bz2...
Loaded 500 reviews from train.ft.txt.bz2

🔍 Analyzing test.ft.txt.bz2...
📊 Processing 500 reviews

📈 ANALYSIS RESULTS for test.ft.txt.bz2
AMAZON REVIEWS ANALYSIS RESULTS

Sample Analysis Results (showing first 5 reviews):
------------------------------------------------------------

Review 1:
Text: Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evapor...
Predicted Sentiment: positive (score: 3)
Actual Label: positive
Named Entities:
  - Pat (PERSON: People, including fictional)
  - one (CARDINAL: Numerals that do not fall under another type)
  - GREAT (ORG: Companies, agencies, institutions, etc.)
  - YEARS (DATE: Absolute or relative dates or periods)
  -

NameError: name 'export_results_to_csv' is not defined

In [2]:
import spacy
import bz2
import re
from collections import Counter, defaultdict
import pandas as pd
from google.colab import files
import io # Import the io module

# Install required packages in Colab
!pip install spacy pandas
!python -m spacy download en_core_web_sm

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def upload_and_load_data(max_samples=1000):
    """
    Upload and load Amazon review data from bz2 compressed file in Colab
    Expected format: __label__1 or __label__2 followed by review text
    """
    print("Please upload your .bz2 files (train.ft.txt.bz2 and test.ft.txt.bz2)")
    uploaded = files.upload()

    datasets = {}

    for filename in uploaded.keys():
        print(f"\nProcessing {filename}...")
        reviews = []
        labels = []

        try:
            # Read the uploaded file
            file_content = uploaded[filename]

            # Decompress and read
            with bz2.open(io.BytesIO(file_content), 'rt', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i >= max_samples:  # Limit for demo purposes
                        break

                    line = line.strip()
                    if line.startswith('__label__'):
                        # Extract label and review text
                        parts = line.split(' ', 1)
                        if len(parts) == 2:
                            label = parts[0].replace('__label__', '')
                            review_text = parts[1]

                            labels.append(int(label))
                            reviews.append(review_text)

            datasets[filename] = (reviews, labels)
            print(f"Loaded {len(reviews)} reviews from {filename}")

        except Exception as e:
            print(f"Error loading {filename}: {e}")
            datasets[filename] = ([], [])

    return datasets

def load_amazon_data_colab(file_content, max_samples=1000):
    """
    Load Amazon review data from uploaded file content
    Expected format: __label__1 or __label__2 followed by review text
    """
    reviews = []
    labels = []

    try:
        # Decompress and read
        with bz2.open(io.BytesIO(file_content), 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_samples:  # Limit for demo purposes
                    break

                line = line.strip()
                if line.startswith('__label__'):
                    # Extract label and review text
                    parts = line.split(' ', 1)
                    if len(parts) == 2:
                        label = parts[0].replace('__label__', '')
                        review_text = parts[1]

                        labels.append(int(label))
                        reviews.append(review_text)

    except Exception as e:
        print(f"Error loading data: {e}")
        return [], []

    return reviews, labels

class RuleBasedSentimentAnalyzer:
    """Simple rule-based sentiment analyzer"""

    def __init__(self):
        # Positive and negative word lists
        self.positive_words = {
            'excellent', 'amazing', 'great', 'good', 'fantastic', 'wonderful',
            'perfect', 'love', 'awesome', 'brilliant', 'outstanding', 'superb',
            'impressed', 'satisfied', 'recommend', 'best', 'quality', 'happy',
            'pleased', 'smooth', 'fast', 'easy', 'comfortable', 'durable'
        }

        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'disappointed',
            'poor', 'cheap', 'broken', 'defective', 'useless', 'waste', 'annoying',
            'frustrating', 'slow', 'difficult', 'uncomfortable', 'flimsy', 'terrible',
            'pathetic', 'garbage', 'trash', 'regret', 'problem', 'issue', 'fail'
        }

        # Negation words that can flip sentiment
        self.negation_words = {'not', 'no', 'never', 'none', 'nothing', 'nowhere',
                              'neither', 'nobody', 'hardly', "don't", "doesn't",
                              "didn't", "won't", "wouldn't", "can't", "couldn't"}

    def analyze_sentiment(self, text):
        """Analyze sentiment of text using rule-based approach"""
        text_lower = text.lower()
        words = text_lower.split()

        positive_score = 0
        negative_score = 0

        # Check for negation context
        negated = False
        for i, word in enumerate(words):
            # Reset negation after 3 words
            if i > 0 and i % 3 == 0:
                negated = False

            if word in self.negation_words:
                negated = True
                continue

            # Clean word (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)

            if clean_word in self.positive_words:
                if negated:
                    negative_score += 1
                else:
                    positive_score += 1
            elif clean_word in self.negative_words:
                if negated:
                    positive_score += 1
                else:
                    negative_score += 1

        # Determine overall sentiment
        if positive_score > negative_score:
            return 'positive', positive_score - negative_score
        elif negative_score > positive_score:
            return 'negative', negative_score - positive_score
        else:
            return 'neutral', 0

def extract_entities_and_sentiment(reviews, labels=None):
    """Extract named entities and analyze sentiment from reviews"""
    sentiment_analyzer = RuleBasedSentimentAnalyzer()

    results = []
    entity_counter = Counter()
    brand_counter = Counter()
    product_counter = Counter()

    for i, review in enumerate(reviews):
        # Process with spaCy
        doc = nlp(review)

        # Extract entities
        entities = []
        brands = []
        products = []

        for ent in doc.ents:
            entity_info = {
                'text': ent.text,
                'label': ent.label_,
                'description': spacy.explain(ent.label_)
            }
            entities.append(entity_info)
            entity_counter[f"{ent.text} ({ent.label_})"] += 1

            # Categorize potential brands and products
            if ent.label_ in ['ORG', 'PRODUCT']:
                if ent.label_ == 'ORG':
                    brands.append(ent.text)
                    brand_counter[ent.text] += 1
                elif ent.label_ == 'PRODUCT':
                    products.append(ent.text)
                    product_counter[ent.text] += 1

        # Analyze sentiment
        sentiment, score = sentiment_analyzer.analyze_sentiment(review)

        # Store results
        result = {
            'review_id': i,
            'review_text': review[:200] + '...' if len(review) > 200 else review,
            'actual_label': labels[i] if labels else None,
            'entities': entities,
            'brands': brands,
            'products': products,
            'sentiment': sentiment,
            'sentiment_score': score
        }
        results.append(result)

    return results, entity_counter, brand_counter, product_counter

def display_results(results, entity_counter, brand_counter, product_counter, num_samples=5):
    """Display analysis results"""
    print("="*80)
    print("AMAZON REVIEWS ANALYSIS RESULTS")
    print("="*80)

    # Sample results
    print(f"\nSample Analysis Results (showing first {num_samples} reviews):")
    print("-" * 60)

    for i, result in enumerate(results[:num_samples]):
        print(f"\nReview {result['review_id'] + 1}:")
        print(f"Text: {result['review_text']}")
        print(f"Predicted Sentiment: {result['sentiment']} (score: {result['sentiment_score']})")
        if result['actual_label']:
            actual_sentiment = 'positive' if result['actual_label'] == 2 else 'negative'
            print(f"Actual Label: {actual_sentiment}")

        if result['entities']:
            print("Named Entities:")
            for ent in result['entities']:
                print(f"  - {ent['text']} ({ent['label']}: {ent['description']})")

        if result['brands']:
            print(f"Brands detected: {', '.join(result['brands'])}")

        if result['products']:
            print(f"Products detected: {', '.join(result['products'])}")

        print("-" * 40)

    # Summary statistics
    print(f"\nSUMMARY STATISTICS (Total reviews analyzed: {len(results)})")
    print("=" * 50)

    # Sentiment distribution
    sentiment_dist = Counter([r['sentiment'] for r in results])
    print("\nSentiment Distribution:")
    for sentiment, count in sentiment_dist.items():
        percentage = (count / len(results)) * 100
        print(f"  {sentiment.capitalize()}: {count} ({percentage:.1f}%)")

    # Top entities
    print(f"\nTop 10 Named Entities:")
    for entity, count in entity_counter.most_common(10):
        print(f"  {entity}: {count}")

    # Top brands
    if brand_counter:
        print(f"\nTop Brands Detected:")
        for brand, count in brand_counter.most_common(5):
            print(f"  {brand}: {count}")

    # Top products
    if product_counter:
        print(f"\nTop Products Detected:")
        for product, count in product_counter.most_common(5):
            print(f"  {product}: {count}")

def create_summary_dataframe(results):
    """Create a pandas DataFrame summary of results"""
    data = []
    for result in results:
        row = {
            'review_id': result['review_id'],
            'sentiment': result['sentiment'],
            'sentiment_score': result['sentiment_score'],
            'actual_label': result['actual_label'],
            'num_entities': len(result['entities']),
            'num_brands': len(result['brands']),
            'num_products': len(result['products']),
            'review_length': len(result['review_text'])
        }
        data.append(row)

    return pd.DataFrame(data)

def compare_datasets(all_results):
    """Compare results across different datasets"""
    for filename, data in all_results.items():
        results = data['results']

        # Calculate metrics
        total_reviews = len(results)
        sentiment_dist = Counter([r['sentiment'] for r in results])
        avg_entities = sum(len(r['entities']) for r in results) / total_reviews

        print(f"\n📊 {filename}:")
        print(f"   Total reviews: {total_reviews}")
        print(f"   Positive: {sentiment_dist.get('positive', 0)} ({sentiment_dist.get('positive', 0)/total_reviews*100:.1f}%)")
        print(f"   Negative: {sentiment_dist.get('negative', 0)} ({sentiment_dist.get('negative', 0)/total_reviews*100:.1f}%)")
        print(f"   Neutral: {sentiment_dist.get('neutral', 0)} ({sentiment_dist.get('neutral', 0)/total_reviews*100:.1f}%)")
        print(f"   Avg entities per review: {avg_entities:.1f}")

# Additional utility functions for further analysis
# Moved this function definition before it is called in main_colab
def export_results_to_csv(results, filename):
    """Export results to CSV for further analysis"""
    data = []
    for result in results:
        row = {
            'review_id': result['review_id'],
            'review_text': result['review_text'],
            'sentiment': result['sentiment'],
            'sentiment_score': result['sentiment_score'],
            'actual_label': result['actual_label'],
            'num_entities': len(result['entities']),
            'entities': '; '.join([f"{e['text']}({e['label']})" for e in result['entities']]),
            'brands': '; '.join(result['brands']),
            'products': '; '.join(result['products'])
        }
        data.append(row)

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Results exported to {filename}")

def main_colab():
    """Main function to run the analysis in Google Colab"""
    print("🚀 Amazon Reviews Analysis with spaCy NER and Rule-based Sentiment")
    print("="*70)
    print("📁 Upload your .bz2 files when prompted")

    # Upload and load data
    datasets = upload_and_load_data(max_samples=500)  # Reduced for Colab performance

    all_results = {}

    for filename, (reviews, labels) in datasets.items():
        if not reviews:
            print(f"⚠️  No data loaded from {filename}")
            continue

        print(f"\n🔍 Analyzing {filename}...")
        print(f"📊 Processing {len(reviews)} reviews")

        # Extract entities and sentiment
        results, entities, brands, products = extract_entities_and_sentiment(reviews, labels)

        # Store results
        all_results[filename] = {
            'results': results,
            'entities': entities,
            'brands': brands,
            'products': products
        }

        # Display results
        print(f"\n📈 ANALYSIS RESULTS for {filename}")
        print("="*50)
        display_results(results, entities, brands, products)

        # Create and display DataFrame
        print(f"\n📋 Creating summary DataFrame for {filename}...")
        df = create_summary_dataframe(results)
        print(df.head())

        # Export option
        export_filename = filename.replace('.bz2', '_analysis_results.csv')
        export_results_to_csv(results, export_filename) # This call will now succeed

        print(f"\n💾 Results saved as {export_filename}")
        print("📥 You can download it from the Files panel in Colab")

    # Comparison if multiple datasets
    if len(all_results) > 1:
        print(f"\n🔍 DATASET COMPARISON")
        print("="*50)
        compare_datasets(all_results)

    return all_results


# Quick start function for Colab
def quick_start():
    """Quick start function with smaller sample size for testing"""
    print("🚀 QUICK START - Amazon Reviews Analysis")
    print("="*50)
    print("This will process a smaller sample (100 reviews per file) for quick testing")

    datasets = upload_and_load_data(max_samples=100)

    for filename, (reviews, labels) in datasets.items():
        if reviews:
            print(f"\n🔍 Quick analysis of {filename} ({len(reviews)} reviews):")
            results, entities, brands, products = extract_entities_and_sentiment(reviews, labels)

            # Show just summary stats
            sentiment_dist = Counter([r['sentiment'] for r in results])
            print(f"   Sentiment: {dict(sentiment_dist)}")
            print(f"   Top entities: {dict(entities.most_common(3))}")
            print(f"   Brands found: {list(brands.keys())[:5]}")

    return datasets


if __name__ == "__main__":
    # For Google Colab - choose your preferred method:

    # Option 1: Full analysis (recommended)
    results = main_colab()

    # Option 2: Quick start for testing (uncomment to use instead)
    # results = quick_start()

    print("\n✅ Analysis complete! Check the results above and download CSV files from the Files panel.")

# Example usage for CSV export:
# export_results_to_csv(train_results, 'train_analysis_results.csv')
# export_results_to_csv(test_results, 'test_analysis_results.csv')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
🚀 Amazon Reviews Analysis with spaCy NER and Rule-based Sentiment
📁 Upload your .bz2 files when prompted
Please upload your .bz2 files (train.ft.txt.bz2 and test.ft.txt.bz2)


Saving test.ft.txt.bz2 to test.ft.txt (1).bz2
Saving train.ft.txt.bz2 to train.ft.txt (1).bz2

Processing test.ft.txt (1).bz2...
Loaded 500 reviews from test.ft.txt (1).bz2

Processing train.ft.txt (1).bz2...
Loaded 500 reviews from train.ft.txt (1).bz2

🔍 Analyzing test.ft.txt (1).bz2...
📊 Processing 500 reviews

📈 ANALYSIS RESULTS for test.ft.txt (1).bz2
AMAZON REVIEWS ANALYSIS RESULTS

Sample Analysis Results (showing first 5 reviews):
------------------------------------------------------------

Review 1:
Text: Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evapor...
Predicted Sentiment: positive (score: 3)
Actual Label: positive
Named Entities:
  - Pat (PERSON: People, including fictional)
  - one (CARDINAL: Numerals that do not fall under another type)
  - GREAT (ORG: Companies, agencies, institutions, etc.)
  - YEARS (DATE: Absolute o