In [1]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import numpy as np
from datetime import datetime

def load_amazon_reviews_hf(categories):
    """
    Load Amazon reviews for specific categories using Hugging Face datasets.
    
    Args:
        categories: List of categories to include (e.g., "All_Beauty", "Appliances")
    
    Returns:
        DataFrame with filtered reviews
    """
    all_reviews = []
    
    for category in categories:
        try:
            # Format category name for dataset loading
            formatted_category = category.replace(" ", "_")
            dataset_name = f"raw_review_{formatted_category}"
            
            print(f"Loading {category} reviews...")
            dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", dataset_name, split="full", trust_remote_code=True)
            
            # Convert to pandas DataFrame
            category_reviews = pd.DataFrame(dataset)
            
            # Add readable category name
            category_reviews['category'] = category.replace("_", " ")
            
            # Convert timestamp to datetime
            category_reviews['datetime'] = category_reviews['timestamp'].apply(
                lambda x: datetime.fromtimestamp(x/1000)
            )
            
            all_reviews.append(category_reviews)
            print(f"Loaded {len(category_reviews)} reviews from {category}")
            
        except Exception as e:
            print(f"Error loading {category}: {e}")
    
    if all_reviews:
        combined_reviews = pd.concat(all_reviews, ignore_index=True)
        return combined_reviews
    else:
        return pd.DataFrame()

def load_amazon_metadata_hf(categories):
    """
    Load Amazon product metadata for specific categories using Hugging Face datasets.
    
    Args:
        categories: List of categories to include (e.g., "All_Beauty", "Appliances")
    
    Returns:
        DataFrame with product metadata
    """
    all_metadata = []
    
    for category in categories:
        try:
            # Format category name for dataset loading
            formatted_category = category.replace(" ", "_")
            dataset_name = f"raw_meta_{formatted_category}"
            
            print(f"Loading {category} metadata...")
            dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", dataset_name, split="full", trust_remote_code=True)
            
            # Convert to pandas DataFrame
            category_metadata = pd.DataFrame(dataset)
            
            # Add readable category name
            category_metadata['category'] = category.replace("_", " ")
            
            all_metadata.append(category_metadata)
            print(f"Loaded {len(category_metadata)} product metadata from {category}")
            
        except Exception as e:
            print(f"Error loading {category} metadata: {e}")
    
    if all_metadata:
        combined_metadata = pd.concat(all_metadata, ignore_index=True)
        return combined_metadata
    else:
        return pd.DataFrame()

def preprocess_reviews(reviews_df, sample_size=None):
    """
    Preprocess the reviews dataframe.
    
    Args:
        reviews_df: DataFrame with reviews
        sample_size: Number of reviews to sample (optional)
    
    Returns:
        Preprocessed DataFrame
    """
    # Remove reviews with empty text
    reviews_df = reviews_df[reviews_df['text'].notna() & (reviews_df['text'] != "")]
    
    # Create a 'reviewText' column for compatibility with the rest of the code
    reviews_df['reviewText'] = reviews_df['text']
    
    # Sample if needed
    if sample_size and len(reviews_df) > sample_size:
        reviews_df = reviews_df.sample(sample_size, random_state=42)
    
    return reviews_df

def merge_reviews_with_metadata(reviews_df, metadata_df):
    """
    Merge reviews with product metadata.
    
    Args:
        reviews_df: DataFrame with reviews
        metadata_df: DataFrame with product metadata
    
    Returns:
        Merged DataFrame
    """
    # Merge on parent_asin
    merged_df = pd.merge(
        reviews_df,
        metadata_df[['parent_asin', 'main_category', 'title', 'average_rating', 'store']],
        on='parent_asin',
        how='left',
        suffixes=('_review', '_product')
    )
    
    # Rename columns for clarity
    merged_df = merged_df.rename(columns={
        'title_review': 'review_title',
        'title_product': 'product_title'
    })
    
    return merged_df

# Example usage
if __name__ == "__main__":
    categories = [
        "All Beauty", 
        "Health and Personal Care", 
        "Appliances",
        "Video_Games"
    ]
    
    # Load reviews
    reviews_df = load_amazon_reviews_hf(categories)
    
    # Load metadata
    metadata_df = load_amazon_metadata_hf(categories)
    
    # Preprocess reviews
    reviews_df = preprocess_reviews(reviews_df, sample_size=10000)
    
    # Merge reviews with metadata
    merged_df = merge_reviews_with_metadata(reviews_df, metadata_df)
    
    print(f"Total reviews loaded: {len(reviews_df)}")
    print(f"Total products loaded: {len(metadata_df)}")
    print(f"Merged dataset size: {len(merged_df)}")
    print(f"Sample data:\n{merged_df.head()}")
    
    # Save the filtered dataset
    merged_df.to_parquet("filtered_amazon_reviews.parquet")

Loading All Beauty reviews...
Loaded 701528 reviews from All Beauty
Loading Health and Personal Care reviews...
Loaded 494121 reviews from Health and Personal Care
Loading Appliances reviews...
Loaded 2128605 reviews from Appliances
Loading Video_Games reviews...
Loaded 4624615 reviews from Video_Games
Loading All Beauty metadata...
Loaded 112590 product metadata from All Beauty
Loading Health and Personal Care metadata...
Loaded 60293 product metadata from Health and Personal Care
Loading Appliances metadata...
Loaded 94327 product metadata from Appliances
Loading Video_Games metadata...
Loaded 137269 product metadata from Video_Games


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['reviewText'] = reviews_df['text']


Total reviews loaded: 10000
Total products loaded: 404479
Merged dataset size: 10000
Sample data:
   rating                                       review_title  \
0     5.0          A fabulous new installment to the series!   
1     5.0  Waterloo: Tabletop Wargaming in the Age of Nap...   
2     5.0                                Very easy to instal   
3     5.0                                   Easy to install.   
4     1.0                                             sucks!   

                                                text images        asin  \
0  I LOVE this game. Really looking forward to ne...     []  B01N3NNPAB   
1  My son has been a Warhammer enthusiast for the...     []  1907964177   
2                                       Refrigerator     []  B06VW9HKXF   
3                                   Easy to install.     []  B07P3B366X   
4  Cheap plastic. Soft case. My son broke in with...     []  B007PX6MFM   

  parent_asin                       user_id      timestamp  helpfu

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import re
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords', quiet=True)

class AspectDetector:
    """
    Class for detecting aspects in reviews using rule-based and model-based approaches.
    """
    def __init__(self):
        """
        Initialize the AspectDetector with category-specific aspects.
        """
        # Define aspects for each category
        self.category_aspects = {
            "All Beauty": [
                "fragrance", "scent", "smell", "texture", "consistency", "absorption", 
                "packaging", "bottle", "container", "applicator", "effectiveness", 
                "results", "skin", "face", "hair", "nails", "ingredients", "natural", 
                "organic", "chemicals", "allergic", "reaction", "price", "value", 
                "quantity", "size", "brand", "customer_service"
            ],
            "Health and Personal Care": [
                "effectiveness", "results", "side_effects", "ingredients", "natural", 
                "organic", "chemicals", "taste", "flavor", "smell", "scent", "texture", 
                "consistency", "ease_of_use", "convenience", "packaging", "dosage", 
                "instructions", "price", "value", "quantity", "size", "brand", 
                "customer_service", "shipping", "delivery"
            ],
            "Appliances": [
                "performance", "power", "efficiency", "noise", "sound", "volume", 
                "size", "dimensions", "weight", "design", "appearance", "color", 
                "material", "build_quality", "durability", "reliability", "features", 
                "functions", "settings", "controls", "ease_of_use", "installation", 
                "setup", "instructions", "manual", "price", "value", "warranty", 
                "customer_service", "energy_consumption", "maintenance", "cleaning"
            ],
            "Video_Games": [
                "gameplay", "mechanics", "controls", "difficulty", "challenge", 
                "story", "plot", "narrative", "characters", "graphics", "visuals", 
                "animation", "sound", "music", "voice_acting", "performance", 
                "loading_times", "bugs", "glitches", "multiplayer", "online", 
                "community", "replayability", "content", "dlc", "price", "value", 
                "physical_copy", "digital_download", "installation"
            ]
        }
        
        # Initialize tokenizer and model for embedding-based detection
        try:
            self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
            self.model = AutoModel.from_pretrained("distilbert-base-uncased")
        except:
            print("Warning: Could not load transformer model. Using rule-based detection only.")
            self.tokenizer = None
            self.model = None
        
        # Initialize stopwords
        self.stop_words = set(stopwords.words('english'))
    
    def detect_aspects(self, review, category=None):
        """
        Detect aspects in a review using both rule-based and embedding-based approaches.
        
        Args:
            review (str): The review text.
            category (str, optional): The product category.
            
        Returns:
            list: List of detected aspects.
        """
        # Use rule-based detection
        rule_based_aspects = self.extract_aspects_rule_based(review, category)
        
        # Use embedding-based detection if model is available
        if self.tokenizer is not None and self.model is not None:
            embedding_based_aspects = self.extract_aspects_embedding_based(review, category)
            # Combine results
            all_aspects = list(set(rule_based_aspects + embedding_based_aspects))
        else:
            all_aspects = rule_based_aspects
        
        return all_aspects
    
    def extract_aspects_rule_based(self, review, category=None):
        """
        Extract aspects from a review using rule-based approach.
        
        Args:
            review (str): The review text.
            category (str, optional): The product category.
            
        Returns:
            list: List of detected aspects.
        """
        if not isinstance(review, str):
            return []
        
        # Lowercase the review
        review = review.lower()
        
        # Tokenize the review
        tokens = word_tokenize(review)
        
        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        
        # Get aspects for the category
        if category and category in self.category_aspects:
            category_aspects = self.category_aspects[category]
        else:
            # If category is not provided or not found, use all aspects
            category_aspects = []
            for aspects in self.category_aspects.values():
                category_aspects.extend(aspects)
            category_aspects = list(set(category_aspects))
        
        # Check for aspects in the review
        detected_aspects = []
        for aspect in category_aspects:
            # Convert aspect with underscores to space-separated words
            aspect_words = aspect.replace('_', ' ').split()
            
            # Check if all words in the aspect are in the review
            if all(word in review for word in aspect_words):
                detected_aspects.append(aspect)
        
        return detected_aspects
    
    def extract_aspects_embedding_based(self, review, category=None):
        """
        Extract aspects from a review using embedding-based approach.
        
        Args:
            review (str): The review text.
            category (str, optional): The product category.
            
        Returns:
            list: List of detected aspects.
        """
        if not isinstance(review, str) or self.tokenizer is None or self.model is None:
            return []
        
        # Get aspects for the category
        if category and category in self.category_aspects:
            category_aspects = self.category_aspects[category]
        else:
            # If category is not provided or not found, use all aspects
            category_aspects = []
            for aspects in self.category_aspects.values():
                category_aspects.extend(aspects)
            category_aspects = list(set(category_aspects))
        
        # Encode the review
        inputs = self.tokenizer(review, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        # Get the review embedding (average of token embeddings)
        review_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        
        # Encode each aspect
        aspect_embeddings = {}
        for aspect in category_aspects:
            # Convert aspect with underscores to space-separated words
            aspect_text = aspect.replace('_', ' ')
            
            inputs = self.tokenizer(aspect_text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
            
            # Get the aspect embedding
            aspect_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            aspect_embeddings[aspect] = aspect_embedding
        
        # Calculate cosine similarity between review and each aspect
        detected_aspects = []
        for aspect, aspect_embedding in aspect_embeddings.items():
            # Calculate cosine similarity
            similarity = np.dot(review_embedding, aspect_embedding) / (
                np.linalg.norm(review_embedding) * np.linalg.norm(aspect_embedding))
            
            # If similarity is above threshold, consider the aspect as detected
            if similarity > 0.5:  # Threshold can be adjusted
                detected_aspects.append(aspect)
        
        return detected_aspects


class GNNContextModel(nn.Module):
    """
    Graph Neural Network model for context-aware sentiment analysis.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        """
        Initialize the GNN model.
        
        Args:
            input_dim (int): Input dimension.
            hidden_dim (int): Hidden dimension.
            output_dim (int): Output dimension.
            num_layers (int): Number of GNN layers.
        """
        super(GNNContextModel, self).__init__()
        
        # Input projection
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        
        # GNN layers
        self.gnn_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.gnn_layers.append(GNNLayer(hidden_dim))
        
        # Output projection
        self.output_proj = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x, adj_matrix):
        """
        Forward pass.
        
        Args:
            x (torch.Tensor): Node features.
            adj_matrix (torch.Tensor): Adjacency matrix.
            
        Returns:
            torch.Tensor: Output features.
        """
        # Input projection
        h = F.relu(self.input_proj(x))
        
        # GNN layers
        for layer in self.gnn_layers:
            h = layer(h, adj_matrix)
        
        # Output projection
        out = self.output_proj(h)
        
        return out


class GNNLayer(nn.Module):
    """
    Graph Neural Network layer.
    """
    def __init__(self, hidden_dim):
        """
        Initialize the GNN layer.
        
        Args:
            hidden_dim (int): Hidden dimension.
        """
        super(GNNLayer, self).__init__()
        
        # Message passing
        self.message = nn.Linear(hidden_dim, hidden_dim)
        
        # Update
        self.update = nn.Linear(2 * hidden_dim, hidden_dim)
    
    def forward(self, x, adj_matrix):
        """
        Forward pass.
        
        Args:
            x (torch.Tensor): Node features.
            adj_matrix (torch.Tensor): Adjacency matrix.
            
        Returns:
            torch.Tensor: Updated node features.
        """
        # Message passing
        m = self.message(x)
        m = torch.matmul(adj_matrix, m)
        
        # Update
        h = torch.cat([x, m], dim=1)
        h = self.update(h)
        h = F.relu(h)
        
        return h


# Example usage
if __name__ == "__main__":
    # Initialize aspect detector
    aspect_detector = AspectDetector()
    
    # Example reviews
    reviews = [
        "This shampoo has a great fragrance and leaves my hair feeling soft.",
        "The vacuum cleaner is powerful but very noisy.",
        "This game has amazing graphics but the story is weak.",
        "The vitamins are effective but expensive."
    ]
    
    categories = [
        "All Beauty",
        "Appliances",
        "Video_Games",
        "Health and Personal Care"
    ]
    
    # Detect aspects
    for review, category in zip(reviews, categories):
        aspects = aspect_detector.detect_aspects(review, category)
        print(f"\nReview: {review}")
        print(f"Category: {category}")
        print(f"Detected aspects: {aspects}")

[nltk_data] Downloading package punkt to C:\Users\Manvi
[nltk_data]     Bhala\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Review: This shampoo has a great fragrance and leaves my hair feeling soft.
Category: All Beauty
Detected aspects: ['fragrance', 'hair']

Review: The vacuum cleaner is powerful but very noisy.
Category: Appliances
Detected aspects: ['energy_consumption', 'sound', 'maintenance', 'controls', 'setup', 'power', 'warranty', 'weight', 'manual', 'noise', 'settings', 'design', 'cleaning', 'features', 'ease_of_use', 'customer_service', 'instructions', 'build_quality', 'size', 'durability', 'reliability', 'performance', 'efficiency', 'dimensions', 'installation']

Review: This game has amazing graphics but the story is weak.
Category: Video_Games
Detected aspects: ['glitches', 'replayability', 'voice_acting', 'story', 'graphics', 'dlc', 'gameplay', 'multiplayer']

Review: The vitamins are effective but expensive.
Category: Health and Personal Care
Detected aspects: ['effectiveness', 'side_effects', 'ease_of_use', 'customer_service', 'ingredients', 'taste', 'organic', 'scent', 'smell', 'dosage',

In [24]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Manvi
[nltk_data]     Bhala\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [26]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import time
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

class EmotionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze()
        }

class RealTimeSentimentTracker:
    def __init__(self, model_name="bhadresh-savani/distilbert-base-uncased-emotion"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Load EmoBERTa model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        
        # Emotion labels
        self.emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
        
        # Storage for real-time tracking
        self.sentiment_history = {
            'timestamp': [],
            'category': [],
            'product_id': [],
            'emotion': [],
            'score': []
        }
        
        # Category display names
        self.category_display_names = {
            'All Beauty': 'Beauty',
            'Health and Personal Care': 'Health',
            'Appliances': 'Appliances',
            'Video_Games': 'Games'
        }
    
    def analyze_batch(self, texts, categories=None, product_ids=None, timestamps=None):
        """Analyze emotions in a batch of texts"""
        if categories is None:
            categories = ['unknown'] * len(texts)
        
        if product_ids is None:
            product_ids = ['unknown'] * len(texts)
            
        if timestamps is None:
            timestamps = [datetime.now()] * len(texts)
            
        dataset = EmotionDataset(texts, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=16)
        
        results = []
        self.model.eval()
        
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = torch.softmax(outputs.logits, dim=1)
                
                for pred in predictions:
                    emotion_scores = {emotion: score.item() for emotion, score in zip(self.emotions, pred)}
                    results.append(emotion_scores)
        
        # Update sentiment history
        for i, emotion_scores in enumerate(results):
            for emotion, score in emotion_scores.items():
                self.sentiment_history['timestamp'].append(timestamps[i])
                self.sentiment_history['category'].append(categories[i])
                self.sentiment_history['product_id'].append(product_ids[i])
                self.sentiment_history['emotion'].append(emotion)
                self.sentiment_history['score'].append(score)
        
        return results
    
    def process_stream(self, review_stream, batch_size=32, interval_seconds=5, max_duration_seconds=60):
        """Process a stream of reviews in real-time"""
        print("Starting real-time sentiment tracking...")
        
        start_time = time.time()
        end_time = start_time + max_duration_seconds
        
        while time.time() < end_time:
            # Collect reviews from the stream
            batch_texts = []
            batch_categories = []
            batch_product_ids = []
            batch_timestamps = []
            
            for _ in range(batch_size):
                if review_stream.has_new():
                    review = review_stream.get_next()
                    batch_texts.append(review['text'])
                    batch_categories.append(review['category'])
                    batch_product_ids.append(review.get('product_id', 'unknown'))
                    batch_timestamps.append(review.get('timestamp', datetime.now()))
                
                if len(batch_texts) == 0:
                    time.sleep(1)  # Wait for new reviews
                    continue
            
            # Process the batch
            if batch_texts:
                print(f"Processing batch of {len(batch_texts)} reviews...")
                results = self.analyze_batch(
                    batch_texts, 
                    batch_categories, 
                    batch_product_ids, 
                    batch_timestamps
                )
                
                # Print summary
                emotions_summary = {}
                for emotion_scores in results:
                    top_emotion = max(emotion_scores, key=emotion_scores.get)
                    emotions_summary[top_emotion] = emotions_summary.get(top_emotion, 0) + 1
                
                print("Emotion distribution in this batch:")
                for emotion, count in emotions_summary.items():
                    print(f"  {emotion}: {count} ({count/len(results)*100:.1f}%)")
            
            # Wait for the next interval
            time.sleep(interval_seconds)
        
        print(f"Completed {max_duration_seconds} seconds of real-time tracking")
    
    def get_sentiment_trends(self, category=None, product_id=None, time_window=None):
        """Get sentiment trends for analysis"""
        df = pd.DataFrame(self.sentiment_history)
        
        if len(df) == 0:
            return pd.DataFrame()
        
        # Filter by category if specified
        if category:
            df = df[df['category'] == category]
        
        # Filter by product if specified
        if product_id:
            df = df[df['product_id'] == product_id]
        
        # Filter by time window if specified
        if time_window:
            cutoff_time = datetime.now() - time_window
            df = df[df['timestamp'] >= cutoff_time]
        
        # Group by emotion and calculate average score
        trends = df.groupby(['emotion'])['score'].mean().reset_index()
        trends = trends.sort_values('score', ascending=False)
        
        return trends
    
    def visualize_emotion_trends(self, category=None, product_id=None, time_window=None):
        """Visualize emotion trends over time"""
        df = pd.DataFrame(self.sentiment_history)
        
        if len(df) == 0:
            print("No data available for visualization")
            return None
        
        # Filter by category if specified
        if category:
            df = df[df['category'] == category]
            title_suffix = f" for {self.category_display_names.get(category, category)}"
        else:
            title_suffix = " across all categories"
        
        # Filter by product if specified
        if product_id:
            df = df[df['product_id'] == product_id]
            title_suffix = f" for product {product_id}"
        
        # Filter by time window if specified
        if time_window:
            cutoff_time = datetime.now() - time_window
            df = df[df['timestamp'] >= cutoff_time]
        
        # Convert timestamp to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        
        # Group by timestamp (hourly) and emotion
        df['hour'] = df['timestamp'].dt.floor('H')
        emotion_over_time = df.groupby(['hour', 'emotion'])['score'].mean().reset_index()
        
        # Create the plot
        plt.figure(figsize=(12, 6))
        
        for emotion in self.emotions:
            emotion_data = emotion_over_time[emotion_over_time['emotion'] == emotion]
            if not emotion_data.empty:
                plt.plot(emotion_data['hour'], emotion_data['score'], marker='o', label=emotion)
        
        plt.title(f"Emotion Trends{title_suffix}")
        plt.xlabel("Time")
        plt.ylabel("Average Score")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        
        # Save the plot
        filename = "emotion_trends.png"
        plt.savefig(filename)
        plt.close()
        
        return filename
    
    def visualize_emotion_distribution(self, category=None):
        """Visualize the distribution of emotions"""
        df = pd.DataFrame(self.sentiment_history)
        
        if len(df) == 0:
            print("No data available for visualization")
            return None
        
        # Filter by category if specified
        if category:
            df = df[df['category'] == category]
            title_suffix = f" for {self.category_display_names.get(category, category)}" 
            df = df[df['category'] == category]
            title_suffix = f" for {self.category_display_names.get(category, category)}"
        else:
            title_suffix = " across all categories"
        
        # Get the top emotion for each review
        df_top_emotions = df.loc[df.groupby(['timestamp', 'category', 'product_id'])['score'].idxmax()]
        
        # Count the occurrences of each emotion
        emotion_counts = df_top_emotions['emotion'].value_counts().reset_index()
        emotion_counts.columns = ['emotion', 'count']
        
        # Create the plot
        plt.figure(figsize=(10, 6))
        sns.barplot(x='emotion', y='count', data=emotion_counts)
        
        plt.title(f"Emotion Distribution{title_suffix}")
        plt.xlabel("Emotion")
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        # Save the plot
        filename = "emotion_distribution.png"
        plt.savefig(filename)
        plt.close()
        
        return filename

# Example of a simple review stream simulator for testing
class ReviewStreamSimulator:
    def __init__(self, reviews_df, rate=5):
        """
        Simulate a stream of reviews
        
        Args:
            reviews_df: DataFrame with reviews
            rate: Average number of reviews per second
        """
        self.reviews = reviews_df
        self.rate = rate
        self.index = 0
        self.last_time = time.time()
        
    def has_new(self):
        """Check if new reviews are available"""
        if self.index >= len(self.reviews):
            return False
            
        current_time = time.time()
        time_diff = current_time - self.last_time
        expected_reviews = time_diff * self.rate
        
        return expected_reviews >= 1
    
    def get_next(self):
        """Get the next review"""
        if self.index >= len(self.reviews):
            return None
            
        review = {
            'text': self.reviews.iloc[self.index]['reviewText'],
            'category': self.reviews.iloc[self.index]['category'],
            'product_id': self.reviews.iloc[self.index].get('asin', 'unknown'),
            'timestamp': self.reviews.iloc[self.index].get('datetime', datetime.now())
        }
        
        self.index += 1
        self.last_time = time.time()
        
        return review

# Example usage
if __name__ == "__main__":
    # Load sample data
    try:
        reviews_df = pd.read_parquet("filtered_amazon_reviews.parquet")
        print(f"Loaded {len(reviews_df)} reviews for testing")
        
        # Create a review stream simulator
        stream = ReviewStreamSimulator(reviews_df, rate=10)
        
        # Create and start the sentiment tracker
        tracker = RealTimeSentimentTracker()
        
        # Process a batch of reviews
        sample_reviews = reviews_df['reviewText'].tolist()[:100]
        sample_categories = reviews_df['category'].tolist()[:100]
        sample_product_ids = reviews_df['asin'].tolist()[:100]
        
        results = tracker.analyze_batch(sample_reviews, sample_categories, sample_product_ids)
        
        # Print summary
        emotions_summary = {}
        for emotion_scores in results:
            top_emotion = max(emotion_scores, key=emotion_scores.get)
            emotions_summary[top_emotion] = emotions_summary.get(top_emotion, 0) + 1
        
        print("Emotion distribution:")
        for emotion, count in emotions_summary.items():
            print(f"  {emotion}: {count} ({count/len(results)*100:.1f}%)")
        
        # Visualize results
        tracker.visualize_emotion_distribution()
        tracker.visualize_emotion_trends()
        
        # Process stream for a short time
        tracker.process_stream(stream, max_duration_seconds=30)
        
    except FileNotFoundError:
        print("Please run data_loader.py first to create the filtered dataset")

Loaded 10000 reviews for testing
Using device: cpu
Emotion distribution:
  joy: 65 (65.0%)
  anger: 17 (17.0%)
  sadness: 11 (11.0%)
  fear: 3 (3.0%)
  love: 2 (2.0%)
  surprise: 2 (2.0%)


  df['hour'] = df['timestamp'].dt.floor('H')


Starting real-time sentiment tracking...
Processing batch of 1 reviews...
Emotion distribution in this batch:
  joy: 1 (100.0%)
Processing batch of 1 reviews...
Emotion distribution in this batch:
  joy: 1 (100.0%)
Processing batch of 1 reviews...
Emotion distribution in this batch:
  anger: 1 (100.0%)
Processing batch of 1 reviews...
Emotion distribution in this batch:
  joy: 1 (100.0%)
Processing batch of 1 reviews...
Emotion distribution in this batch:
  sadness: 1 (100.0%)
Processing batch of 1 reviews...
Emotion distribution in this batch:
  joy: 1 (100.0%)
Completed 30 seconds of real-time tracking


In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class DomainAdapter:
    """
    The DomainAdapter class serves as an abstract base class for adapting data
    from various domain-specific formats into a standardized format suitable
    for further processing, such as fake review detection.
    """

    def __init__(self):
        """
        Initializes the DomainAdapter. This base class doesn't require any
        specific initialization parameters.
        """
        pass

    def load_data(self, file_path):
        """
        Abstract method to load data from a file. Subclasses must implement
        this method to handle the specifics of their data format.

        Args:
            file_path (str): The path to the data file.

        Returns:
            pandas.DataFrame: A DataFrame containing the loaded data.

        Raises:
            NotImplementedError: If the method is not implemented in a subclass.
        """
        raise NotImplementedError("Subclasses must implement load_data method")

    def transform_data(self, df):
        """
        Abstract method to transform the loaded data into a standardized format.
        Subclasses must implement this method to handle the specifics of their
        data format.

        Args:
            df (pandas.DataFrame): The DataFrame to transform.

        Returns:
            pandas.DataFrame: A DataFrame containing the transformed data.

        Raises:
            NotImplementedError: If the method is not implemented in a subclass.
        """
        raise NotImplementedError("Subclasses must implement transform_data method")

    def standardize_columns(self, df, column_mapping):
        """
        Standardizes column names in the DataFrame based on a provided mapping.

        Args:
            df (pandas.DataFrame): The DataFrame to standardize.
            column_mapping (dict): A dictionary mapping original column names
                                   to standardized column names.

        Returns:
            pandas.DataFrame: A DataFrame with standardized column names.
        """
        return df.rename(columns=column_mapping)

    def handle_missing_values(self, df, strategy='drop'):
        """
        Handles missing values in the DataFrame based on a specified strategy.

        Args:
            df (pandas.DataFrame): The DataFrame to handle missing values in.
            strategy (str): The strategy for handling missing values.
                            'drop' to drop rows with any missing values.
                            'fill' to fill missing values with a default value.

        Returns:
            pandas.DataFrame: A DataFrame with missing values handled.
        """
        if strategy == 'drop':
            df = df.dropna()
        elif strategy == 'fill':
            # Fill missing values with a default value (e.g., empty string)
            df = df.fillna('')
        return df


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.model_selection import train_test_split
import networkx as nx
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import matplotlib.pyplot as plt
import json
from datetime import datetime

class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class DomainAdapter:
    def __init__(self, base_model_name="distilbert-base-uncased"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Load base model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.base_model = AutoModel.from_pretrained(base_model_name).to(self.device)
        
        # Domain-specific classifier head
        self.domain_classifiers = {}
        
        # Category-specific adjustments
        self.category_adjustments = {
            "All Beauty": 1.0,  # No adjustment
            "Health and Personal Care": 1.0,  # No adjustment
            "Appliances": 0.95,  # Slightly reduce suspicion for appliance reviews
            "Video_Games": 0.9   # Reduce suspicion more for video game reviews (more emotional)
        }
    
    def create_domain_classifier(self, domain, num_classes):
        """Create a classifier head for a specific domain"""
        # Get the hidden size of the base model
        hidden_size = self.base_model.config.hidden_size
        
        # Create a simple classifier
        classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size // 2, num_classes)
        ).to(self.device)
        
        self.domain_classifiers[domain] = classifier
        return classifier
    
    def train_domain_model(self, domain, texts, labels, num_classes, epochs=3, batch_size=16):
        """Train a domain-specific model"""
        # Create dataset
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )
        
        train_dataset = ReviewDataset(train_texts, train_labels, self.tokenizer)
        val_dataset = ReviewDataset(val_texts, val_labels, self.tokenizer)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        # Create or get domain classifier
        if domain not in self.domain_classifiers:
            classifier = self.create_domain_classifier(domain, num_classes)
        else:
            classifier = self.domain_classifiers[domain]
        
        # Optimizer
        optimizer = torch.optim.AdamW(
            list(self.base_model.parameters()) + list(classifier.parameters()),
            lr=5e-5
        )
        
        # Training loop
        best_accuracy = 0
        
        for epoch in range(epochs):
            # Training
            self.base_model.train()
            classifier.train()
            train_loss = 0
            
            for batch in train_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                
                optimizer.zero_grad()
                
                # Forward pass
                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                hidden_states = outputs.last_hidden_state[:, 0, :]  # CLS token
                logits = classifier(hidden_states)
                
                # Loss
                loss = F.cross_entropy(logits, labels)
                train_loss += loss.item()
                
                # Backward pass
                loss.backward()
                optimizer.step()
            
            train_loss /= len(train_loader)
            
            # Validation
            self.base_model.eval()
            classifier.eval()
            val_loss = 0
            correct = 0
            total = 0
            
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)
                    
                    # Forward pass
                    outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                    hidden_states = outputs.last_hidden_state[:, 0, :]  # CLS token
                    logits = classifier(hidden_states)
                    
                    # Loss
                    loss = F.cross_entropy(logits, labels)
                    val_loss += loss.item()
                    
                    # Accuracy
                    _, predicted = torch.max(logits, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            
            val_loss /= len(val_loader)
            accuracy = correct / total
            
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                # Save model if needed
        
        print(f"Training completed for domain: {domain}, Best Accuracy: {best_accuracy:.4f}")
    
    def predict(self, domain, texts, batch_size=16):
        """Make predictions using a domain-specific model"""
        if domain not in self.domain_classifiers:
            raise ValueError(f"No model trained for domain: {domain}")
        
        classifier = self.domain_classifiers[domain]
        dataset = ReviewDataset(texts, [0] * len(texts), self.tokenizer)  # Dummy labels
        dataloader = DataLoader(dataset, batch_size=batch_size)
        
        predictions = []
        
        self.base_model.eval()
        classifier.eval()
        
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                
                # Forward pass
                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                hidden_states = outputs.last_hidden_state[:, 0, :]  # CLS token
                logits = classifier(hidden_states)
                
                # Get predictions
                probs = F.softmax(logits, dim=1)
                batch_preds = probs.cpu().numpy()
                predictions.extend(batch_preds)
        
        return np.array(predictions)

class FakeReviewDetector:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load a pre-trained model for review embeddings
        self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.embedding_model = AutoModel.from_pretrained("distilbert-base-uncased").to(self.device)
        
        # GNN for fake review detection
        self.gnn_model = None
        
        # Category-specific adjustments
        self.category_adjustments = {
            "All Beauty": 1.0,  # No adjustment
            "Health and Personal Care": 1.0,  # No adjustment
            "Appliances": 0.95,  # Slightly reduce suspicion for appliance reviews
            "Video_Games": 0.9   # Reduce suspicion more for video game reviews (more emotional)
        }
    
    def extract_product_details(self, details_str):
        """Extract product details from the JSON string"""
        try:
            if details_str and isinstance(details_str, str):
                details = json.loads(details_str)
                return details
            return {}
        except:
            return {}
    
    def build_review_graph(self, reviews_df):
        """Build a heterogeneous graph with reviews, users, and products"""
        G = nx.Graph()
        
        # Add nodes for reviews
        for i, row in reviews_df.iterrows():
            G.add_node(f"review_{i}", type="review", text=row['reviewText'], rating=row['rating'])
        
        # Add nodes for users
        user_ids = set(reviews_df['user_id'])
        for user_id in user_ids:
            G.add_node(f"user_{user_id}", type="user")
        
        # Add nodes for products
        product_ids = set(reviews_df['asin'])
        for product_id in product_ids:
            G.add_node(f"product_{product_id}", type="product")
        
        # Add edges between reviews and users/products
        for i, row in reviews_df.iterrows():
            G.add_edge(f"review_{i}", f"user_{row['user_id']}")
            G.add_edge(f"review_{i}", f"product_{row['asin']}")
        
        # Add edges between users who reviewed the same product
        user_product_dict = {}
        for _, row in reviews_df.iterrows():
            user_id = row['user_id']
            product_id = row['asin']
            
            if user_id not in user_product_dict:
                user_product_dict[user_id] = set()
            user_product_dict[user_id].add(product_id)
        
        for user1 in user_product_dict:
            for user2 in user_product_dict:
                if user1 != user2:
                    common_products = user_product_dict[user1].intersection(user_product_dict[user2])
                    if len(common_products) > 0:
                        G.add_edge(f"user_{user1}", f"user_{user2}", weight=len(common_products))
        
        return G
    
    def get_review_embeddings(self, reviews, batch_size=16):
        """Get embeddings for reviews using the pre-trained model"""
        embeddings = []
        
        self.embedding_model.eval()
        
        for i in range(0, len(reviews), batch_size):
            batch_reviews = reviews[i:i+batch_size]
            
            inputs = self.tokenizer(
                batch_reviews,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(self.device)
            
            with torch.no_grad():
                outputs = self.embedding_model(**inputs)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token
                embeddings.extend(batch_embeddings)
        
        return np.array(embeddings)
    
    def detect_fake_reviews(self, reviews_df):
        """Detect fake reviews using graph-based approach"""
        # Extract necessary data
        reviews = reviews_df['reviewText'].tolist()
        user_ids = reviews_df['user_id'].tolist()
        product_ids = reviews_df['asin'].tolist()
        ratings = reviews_df['rating'].tolist()
        categories = reviews_df['category'].tolist() if 'category' in reviews_df.columns else ['unknown'] * len(reviews)
        
        # Get timestamps if available
        if 'datetime' in reviews_df.columns:
            timestamps = reviews_df['datetime'].tolist()
        elif 'timestamp' in reviews_df.columns:
            timestamps = [datetime.fromtimestamp(ts/1000) if isinstance(ts, (int, float)) else ts 
                         for ts in reviews_df['timestamp'].tolist()]
        else:
            timestamps = [None] * len(reviews)
        
        # Build the graph
        G = self.build_review_graph(reviews_df)
        
        # Get review embeddings
        review_embeddings = self.get_review_embeddings(reviews)
        
        # Calculate review similarity
        from sklearn.metrics.pairwise import cosine_similarity
        similarity_matrix = cosine_similarity(review_embeddings)
        
        # Identify suspicious patterns
        suspicious_scores = np.zeros(len(reviews))
        
        # 1. Check for users with multiple similar reviews
        user_reviews = {}
        for i, user_id in enumerate(user_ids):
            if user_id not in user_reviews:
                user_reviews[user_id] = []
            user_reviews[user_id].append(i)
        
        for user_id, review_indices in user_reviews.items():
            if len(review_indices) > 1:
                # Check similarity between reviews by the same user
                for i in range(len(review_indices)):
                    for j in range(i+1, len(review_indices)):
                        idx1, idx2 = review_indices[i], review_indices[j]
                        if similarity_matrix[idx1, idx2] > 0.8:  # High similarity threshold
                            suspicious_scores[idx1] += 0.5
                            suspicious_scores[idx2] += 0.5
        
        # 2. Check for burst patterns (many reviews in a short time)
        if timestamps[0] is not None:
            # Group reviews by product
            product_reviews = {}
            for i, product_id in enumerate(product_ids):
                if product_id not in product_reviews:
                    product_reviews[product_id] = []
                product_reviews[product_id].append(i)
            
            # Check for burst patterns
            for product_id, review_indices in product_reviews.items():
                if len(review_indices) > 1:
                    # Sort by timestamp
                    sorted_indices = sorted(review_indices, key=lambda i: timestamps[i])
                    
                    # Check for reviews in quick succession
                    for i in range(len(sorted_indices)-1):
                        idx1, idx2 = sorted_indices[i], sorted_indices[i+1]
                        time_diff = (timestamps[idx2] - timestamps[idx1]).total_seconds()
                        
                        # If reviews are less than 1 hour apart
                        if time_diff < 3600:
                            suspicious_scores[idx1] += 0.3
                            suspicious_scores[idx2] += 0.3
        
        # 3. Check for network patterns
        # Users who are connected and have similar review patterns
        for user1 in user_reviews:
            for user2 in user_reviews:
                if user1 != user2 and G.has_edge(f"user_{user1}", f"user_{user2}"):
                    # Check if they have similar review patterns
                    reviews1 = user_reviews[user1]
                    reviews2 = user_reviews[user2]
                    
                    for idx1 in reviews1:
                        for idx2 in reviews2:
                            if similarity_matrix[idx1, idx2] > 0.7:  # Similarity threshold
                                suspicious_scores[idx1] += 0.3
                                suspicious_scores[idx2] += 0.3
        
        # 4. Check for extreme ratings
        for i, rating in enumerate(ratings):
            # Extreme ratings (1 or 5) are slightly more suspicious
            if rating == 1 or rating == 5:
                suspicious_scores[i] += 0.1
        
        # 5. Apply category-specific adjustments
        for i, category in enumerate(categories):
            if category in self.category_adjustments:
                suspicious_scores[i] *= self.category_adjustments[category]
        
        # Normalize scores
        if suspicious_scores.max() > 0:
            suspicious_scores = suspicious_scores / suspicious_scores.max()
        
        # Classify reviews as fake or genuine (threshold = 0.7)
        threshold = 0.7
        fake_labels = (suspicious_scores > threshold).astype(int)
        
        # Add results to the dataframe
        result_df = reviews_df.copy()
        result_df['suspicious_score'] = suspicious_scores
        result_df['is_fake'] = fake_labels
        
        return result_df
    
    def visualize_review_network(self, reviews_df, output_file="review_network.png"):
        """Visualize the review network with fake reviews highlighted"""
        # Ensure we have fake labels
        if 'is_fake' not in reviews_df.columns:
            reviews_df = self.detect_fake_reviews(reviews_df)
        
        # Build the graph
        G = self.build_review_graph(reviews_df)
        
        # Create a plot
        plt.figure(figsize=(12, 10))
        
        # Define node colors based on type and fake status
        node_colors = []
        for node in G.nodes():
            if node.startswith('review_'):
                review_idx = int(node.split('_')[1])
                if review_idx < len(reviews_df) and reviews_df.iloc[review_idx]['is_fake'] == 1:
                    node_colors.append('red')  # Fake reviews
                else:
                    node_colors.append('green')  # Genuine reviews
            elif node.startswith('user_'):
                node_colors.append('blue')  # Users
            else:
                node_colors.append('orange')  # Products
        
        # Draw the graph
        pos = nx.spring_layout(G, seed=42)
        nx.draw_networkx_nodes(G, pos, node_color=node_colors, alpha=0.8)
        nx.draw_networkx_edges(G, pos, alpha=0.2)
        
        # Add labels for important nodes
        labels = {}
        for node in G.nodes():
            if node.startswith('user_') or node.startswith('product_'):
                labels[node] = node
        nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)
        
        plt.title("Review Network (Red: Fake Reviews, Green: Genuine Reviews)")
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(output_file)
        plt.close()
        
        return output_file
    
    def analyze_fake_review_patterns(self, reviews_df):
        """Analyze patterns in fake reviews"""
        # Ensure we have fake labels
        if 'is_fake' not in reviews_df.columns:
            reviews_df = self.detect_fake_reviews(reviews_df)
        
        # Get fake and genuine reviews
        fake_reviews = reviews_df[reviews_df['is_fake'] == 1]
        genuine_reviews = reviews_df[reviews_df['is_fake'] == 0]
        
        # Calculate statistics
        stats = {
            'total_reviews': len(reviews_df),
            'fake_reviews': len(fake_reviews),
            'genuine_reviews': len(genuine_reviews),
            'fake_percentage': len(fake_reviews) / len(reviews_df) * 100 if len(reviews_df) > 0 else 0
        }
        
        # Rating distribution
        fake_rating_dist = fake_reviews['rating'].value_counts().sort_index()
        genuine_rating_dist = genuine_reviews['rating'].value_counts().sort_index()
        
        # Visualize rating distribution
        plt.figure(figsize=(10, 6))
        
        # Create bar positions
        bar_width = 0.35
        index = np.arange(5)  # 5 possible ratings (1-5)
        
        # Get counts for each rating
        fake_counts = [fake_rating_dist.get(i+1, 0) for i in range(5)]
        genuine_counts = [genuine_rating_dist.get(i+1, 0) for i in range(5)]
        
        # Plot bars
        plt.bar(index, fake_counts, bar_width, label='Fake Reviews', color='red', alpha=0.7)
        plt.bar(index + bar_width, genuine_counts, bar_width, label='Genuine Reviews', color='green', alpha=0.7)
        
        plt.xlabel('Rating')
        plt.ylabel('Count')
        plt.title('Rating Distribution: Fake vs. Genuine Reviews')
        plt.xticks(index + bar_width/2, ['1', '2', '3', '4', '5'])
        plt.legend()
        plt.tight_layout()
        
        # Save the plot
        rating_dist_file = "rating_distribution.png"
        plt.savefig(rating_dist_file)
        plt.close()
        
        # Category distribution
        if 'category' in reviews_df.columns:
            fake_category_dist = fake_reviews['category'].value_counts()
            genuine_category_dist = genuine_reviews['category'].value_counts()
            
            # Calculate fake review percentage by category
            category_stats = []
            for category in reviews_df['category'].unique():
                category_reviews = reviews_df[reviews_df['category'] == category]
                category_fake = category_reviews[category_reviews['is_fake'] == 1]
                
                category_stats.append({
                    'category': category,
                    'total': len(category_reviews),
                    'fake': len(category_fake),
                    'fake_percentage': len(category_fake) / len(category_reviews) * 100
                })
            
            # Sort by fake percentage
            category_stats = sorted(category_stats, key=lambda x: x['fake_percentage'], reverse=True)
            
            # Visualize category distribution
            plt.figure(figsize=(12, 6))
            
            categories = [stat['category'] for stat in category_stats]
            percentages = [stat['fake_percentage'] for stat in category_stats]
            
            plt.bar(categories, percentages, color='red', alpha=0.7)
            plt.xlabel('Category')
            plt.ylabel('Fake Review Percentage')
            plt.title('Fake Review Percentage by Category')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            
            # Save the plot
            category_dist_file = "category_distribution.png"
            plt.savefig(category_dist_file)
            plt.close()
        else:
            category_stats = []
            category_dist_file = None
        
        return {
            'stats': stats,
            'rating_distribution_file': rating_dist_file,
            'category_distribution_file': category_dist_file,
            'category_stats': category_stats
        }

# Example usage
if __name__ == "__main__":
    # Load sample data
    try:
        reviews_df = pd.read_parquet("filtered_amazon_reviews.parquet")
        print(f"Loaded {len(reviews_df)} reviews for analysis")
        
        # Create the fake review detector
        detector = FakeReviewDetector()
        
        # Sample a subset for demonstration
        sample_df = reviews_df.sample(min(1000, len(reviews_df)), random_state=42)
        
        # Detect fake reviews
        result_df = detector.detect_fake_reviews(sample_df)
        
        # Count fake reviews
        fake_count = result_df['is_fake'].sum()
        print(f"Detected {fake_count} potential fake reviews out of {len(result_df)} ({fake_count/len(result_df)*100:.1f}%)")
        
        # Visualize the network
        network_file = detector.visualize_review_network(result_df)
        print(f"Network visualization saved as {network_file}")
        
        # Analyze fake review patterns
        analysis = detector.analyze_fake_review_patterns(result_df)
        print(f"Analysis results:")
        print(f"  Total reviews: {analysis['stats']['total_reviews']}")
        print(f"  Fake reviews: {analysis['stats']['fake_reviews']} ({analysis['stats']['fake_percentage']:.1f}%)")
        print(f"  Rating distribution chart saved as {analysis['rating_distribution_file']}")
        
        if analysis['category_distribution_file']:
            print(f"  Category distribution chart saved as {analysis['category_distribution_file']}")
            print("  Fake review percentage by category:")
            for stat in analysis['category_stats']:
                print(f"    {stat['category']}: {stat['fake_percentage']:.1f}%")
        
    except FileNotFoundError:
        print("Please run data_loader.py first to create the filtered dataset")

Loaded 10000 reviews for analysis
Detected 5 potential fake reviews out of 1000 (0.5%)
Network visualization saved as review_network.png
Analysis results:
  Total reviews: 1000
  Fake reviews: 5 (0.5%)
  Rating distribution chart saved as rating_distribution.png
  Category distribution chart saved as category_distribution.png
  Fake review percentage by category:
    Video Games: 0.9%
    Appliances: 0.0%
    Health and Personal Care: 0.0%
    All Beauty: 0.0%


In [28]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class SelfAdaptiveModel:
    def __init__(self, base_model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Load base sentiment model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.model = AutoModelForSequenceClassification

In [29]:
# %% [markdown]
# # Amazon Review Analysis (Single Cell)
# This cell contains all imports, function definitions, data loading,
# and execution logic for interactive analysis in Jupyter.

# %%
# =============================================================================
# Section 1: Imports and Setup
# =============================================================================
import argparse
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
from datetime import datetime
import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification # Only needed if using embedding model later
# import torch.nn.functional as F # Only needed if using embedding model later
import time # Added for stream simulation if needed
from IPython.display import display # Use display for better notebook formatting

# Import custom modules (remain commented out as files are not provided)
# from data_loader import load_amazon_reviews_hf, preprocess_reviews
# from gnn_context_model import AspectDetector
# from domain_adapter import FakeReviewDetector
# from context_aware_sentiment import SelfAdaptiveModel

# Download NLTK resources (quiet=True suppresses output if already downloaded)
print("Downloading NLTK resources (if necessary)...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Download tagger needed for placeholder aspect analysis
# Download tagger needed for placeholder aspect analysis
nltk.download('averaged_perceptron_tagger', quiet=True)
print("NLTK resources checked/downloaded.")

# Ensure results directory exists for saving plots/data
os.makedirs('./results', exist_ok=True)
print("Results directory './results' checked/created.")

# =============================================================================
# Section 2: Function Definitions
# =============================================================================

# --- Text Cleaning ---
def clean_text(text):
    """Clean and preprocess text."""
    if isinstance(text, str):
        # Remove HTML tags
        try:
            text = BeautifulSoup(text, 'html.parser').get_text()
        except Exception:
            pass # Handle potential parsing errors if text isn't valid HTML
        # Remove non-alphabetic characters and handle extra whitespace
        text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
        text = re.sub(r'\s+', ' ', text).strip() # Consolidate whitespace
        # Tokenize
        words = text.split()
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words and len(w) > 1] # Keep words > 1 char
        return ' '.join(words)
    return ""

# --- Sentiment Model Training ---
def train_sentiment_model(df):
    """Train a Naive Bayes model for sentiment analysis."""
    print("\n--- Training Sentiment Analysis Model ---")
    if 'reviewText' not in df.columns or 'rating' not in df.columns:
        print("Error: DataFrame must contain 'reviewText' and 'rating' columns.")
        return None, None

    # Clean text
    print("Cleaning text for sentiment model...")
    df['cleaned_text'] = df['reviewText'].apply(clean_text)

    # Filter out rows with empty cleaned text
    df_filtered = df[df['cleaned_text'].str.len() > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
    if len(df_filtered) == 0:
        print("Error: No valid text data left after cleaning.")
        return None, None
    print(f"Using {len(df_filtered)} reviews for training after cleaning.")

    # Create features
    vectorizer = TfidfVectorizer(max_features=5000)
    print("Vectorizing text (TF-IDF)...")
    X = vectorizer.fit_transform(df_filtered['cleaned_text'])

    # Create target (convert ratings to sentiment classes)
    df_filtered['sentiment_class'] = df_filtered['rating'].apply(lambda x: 'positive' if x >= 4 else ('neutral' if x == 3 else 'negative'))
    y = df_filtered['sentiment_class']

    if len(df_filtered) < 2 or len(y.unique()) < 2:
        print("Error: Not enough data or classes to split and train.")
        return vectorizer, None # Return vectorizer even if model fails

    # Split data
    print("Splitting data (80% train, 20% test)...")
    try:
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) # Added stratify
    except ValueError as e:
        print(f"Error during train/test split (likely too few samples in a class): {e}")
        print("Skipping model training.")
        return vectorizer, None


    # Train model
    print("Training Naive Bayes model...")
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Evaluate model
    print("Evaluating model...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Sentiment model accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Create confusion matrix
    try:
        cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=model.classes_,
                    yticklabels=model.classes_)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Sentiment Confusion Matrix')
        plt.savefig('./results/sentiment_confusion_matrix.png')
        plt.close()
        print("Confusion matrix saved to ./results/sentiment_confusion_matrix.png")
    except Exception as e:
        print(f"Could not generate confusion matrix plot: {e}")

    print("--- Sentiment Model Training Finished ---")
    return vectorizer, model

# --- Fake Review Detection (Placeholder) ---
def detect_fake_reviews(df):
    """Detect potentially fake reviews (Placeholder)."""
    print("\n--- Detecting Fake Reviews (Placeholder Logic) ---")
    if 'rating' not in df.columns:
         print("Warning: 'rating' column missing for fake review analysis. Skipping.")
         df['is_fake'] = False
         df['suspicious_score'] = 0.0
         return df

    # Using placeholder logic: Assume reviews with extreme ratings (1 or 5) and short text might be slightly more suspicious
    min_text_len_for_genuine = 15 # Arbitrary threshold for text length
    print(f"Flagging reviews with rating 1 or 5 AND text length < {min_text_len_for_genuine} chars.")
    # Ensure reviewText is string and handle NaN
    df['reviewText_str'] = df['reviewText'].astype(str).fillna('')
    is_fake_condition = ((df['rating'] == 1) | (df['rating'] == 5)) & (df['reviewText_str'].str.len() < min_text_len_for_genuine)
    is_fake = np.where(is_fake_condition, True, False)

    # Add results to DataFrame
    df['is_fake'] = is_fake
    # Placeholder score - slightly higher for those flagged by the simple rule
    df['suspicious_score'] = np.random.uniform(0.1, 0.3, size=len(df)) # Base random score
    df.loc[df['is_fake'], 'suspicious_score'] = np.random.uniform(0.5, 0.7, size=df['is_fake'].sum()) # Higher score for flagged
    print(f"Flagged {df['is_fake'].sum()} reviews based on placeholder logic.")

    # Create visualizations
    try:
        plt.figure(figsize=(10, 6))
        sns.countplot(x='rating', hue='is_fake', data=df, palette={True: "red", False: "green"})
        plt.title('Rating Distribution: Flagged vs. Not Flagged Reviews (Placeholder)')
        plt.xlabel('Rating')
        plt.ylabel('Count')
        plt.savefig('./results/fake_rating_distribution.png')
        plt.close()
        print("Rating distribution plot saved to ./results/fake_rating_distribution.png")

        # Fake review percentage by category
        if 'category' in df.columns:
            fake_by_category = df.groupby('category')['is_fake'].mean().reset_index()
            fake_by_category = fake_by_category.sort_values('is_fake', ascending=False)

            if not fake_by_category.empty:
                plt.figure(figsize=(10, max(6, len(fake_by_category) * 0.5))) # Adjust height
                sns.barplot(x='is_fake', y='category', data=fake_by_category, palette="viridis")
                plt.title('Flagged Review Percentage by Category (Placeholder)')
                plt.xlabel('Percentage of Flagged Reviews')
                plt.ylabel('Category')
                plt.tight_layout()
                plt.savefig('./results/fake_category_distribution.png')
                plt.close()
                print("Category distribution plot saved to ./results/fake_category_distribution.png")
            else:
                 print("No category data to generate category distribution plot.")
        else:
            print("Skipping category distribution plot: 'category' column missing.")

    except Exception as e:
        print(f"Could not generate fake review plots: {e}")
    print("--- Fake Review Detection Finished ---")
    return df

# --- Aspect Analysis (Placeholder) ---
def analyze_aspects(df):
    """Analyze aspects mentioned in reviews (Placeholder)."""
    print("\n--- Analyzing Review Aspects (Placeholder: Nouns) ---")
    if 'reviewText' not in df.columns:
        print("Warning: 'reviewText' column missing for aspect analysis. Skipping.")
        df['detected_aspects'] = [[] for _ in range(len(df))]
        return df

    # Detect aspects for each review (Placeholder: extract nouns)
    print("Applying placeholder aspect extraction (extracting nouns)...")
    aspects_list = []

    # Batch processing for potentially faster POS tagging
    # Clean text first
    cleaned_texts = df['reviewText'].apply(clean_text)
    # Tokenize and tag
    try:
        tokens_list = [nltk.word_tokenize(text) for text in cleaned_texts]
        tagged_list = nltk.pos_tag_sents(tokens_list) # Use sentence-based tagging if reviews are long
        # Extract nouns (NN, NNS, NNP, NNPS)
        for tagged_sent in tagged_list:
            detected_aspects = [word for word, tag in tagged_sent if tag.startswith('NN')]
            aspects_list.append(list(set(detected_aspects[:10]))) # Keep unique, max 10 per review
    except Exception as e:
        print(f"Error during batch POS tagging: {e}. Falling back to row-by-row (slower).")
        aspects_list = [] # Reset list
        for text in cleaned_texts:
            try:
                tokens = nltk.word_tokenize(text)
                tagged = nltk.pos_tag(tokens)
                detected_aspects = [word for word, tag in tagged if tag.startswith('NN')]
                aspects_list.append(list(set(detected_aspects[:10])))
            except Exception:
                aspects_list.append([]) # Handle potential errors

    df['detected_aspects'] = aspects_list
    print(f"Processed {len(aspects_list)} reviews for aspects.")

    # Count aspect occurrences
    all_aspects = [aspect for sublist in aspects_list for aspect in sublist]

    if not all_aspects:
        print("No aspects found.")
        print("--- Aspect Analysis Finished ---")
        return df

    aspect_counts = pd.Series(all_aspects).value_counts().reset_index()
    aspect_counts.columns = ['aspect', 'count']
    print(f"Found {len(aspect_counts)} unique aspects.")

    # Create visualization
    try:
        plt.figure(figsize=(12, 8))
        top_n = min(20, len(aspect_counts)) # Show top 20
        sns.barplot(x='count', y='aspect', data=aspect_counts.head(top_n), palette="magma")
        plt.title(f'Top {top_n} Most Common Potential Aspects (Placeholder Nouns)')
        plt.xlabel('Count')
        plt.ylabel('Aspect')
        plt.tight_layout()
        plt.savefig('./results/aspect_distribution.png')
        plt.close()
        print(f"Aspect distribution plot saved to ./results/aspect_distribution.png (Top {top_n})")
    except Exception as e:
        print(f"Could not generate aspect distribution plot: {e}")

    print("--- Aspect Analysis Finished ---")
    return df

# --- Emotion Analysis (Simplified) ---
def analyze_emotions(df):
    """Analyze emotions expressed in reviews (Simplified based on rating)."""
    print("\n--- Analyzing Emotions (Based on Rating) ---")
    if 'rating' not in df.columns:
        print("Warning: 'rating' column missing for emotion analysis. Skipping.")
        df['emotion'] = 'unknown'
        return df

    # Map ratings to emotions
    emotion_map = {
        1: 'angry',
        2: 'disappointed',
        3: 'neutral',
        4: 'satisfied',
        5: 'delighted'
    }
    df['emotion'] = df['rating'].map(emotion_map).fillna('unknown') # Handle potential NaN ratings
    print("Mapped ratings to emotions.")

    # Create visualization
    try:
        plt.figure(figsize=(10, 6))
        emotion_order = ['delighted', 'satisfied', 'neutral', 'disappointed', 'angry', 'unknown']
        # Filter order to only include emotions present in the data
        actual_emotions = df['emotion'].unique()
        ordered_actual_emotions = [e for e in emotion_order if e in actual_emotions]

        sns.countplot(y='emotion', data=df, order=ordered_actual_emotions, palette="coolwarm")
        plt.title('Distribution of Emotions in Reviews (Based on Rating)')
        plt.xlabel('Count')
        plt.ylabel('Emotion')
        plt.tight_layout()
        plt.savefig('./results/emotion_distribution.png')
        plt.close()
        print("Emotion distribution plot saved to ./results/emotion_distribution.png")

        # Emotion trends over time (Check for datetime column)
        if 'datetime' in df.columns and pd.api.types.is_datetime64_any_dtype(df['datetime']):
            print("Analyzing emotion trends over time...")
            df_trends = df.dropna(subset=['datetime']).copy() # Drop rows with NaT datetimes
            df_trends['date'] = df_trends['datetime'].dt.date
            # Resample to monthly if reasonable number of dates
            resample_period = 'M' # 'W' for weekly, 'M' for monthly
            try:
                emotion_trends = df_trends.set_index('datetime').groupby('emotion').resample(resample_period).size().unstack(level=0).fillna(0)

                if not emotion_trends.empty and len(emotion_trends) > 1: # Need at least 2 periods for a trend
                    plt.figure(figsize=(12, 8))
                    emotion_trends.plot(kind='line', marker='.', linestyle='-') # Adjusted style
                    plt.title(f'Emotion Trends Over Time ({resample_period}-based)')
                    plt.xlabel('Date')
                    plt.ylabel('Review Count')
                    plt.legend(title='Emotion', bbox_to_anchor=(1.05, 1), loc='upper left')
                    plt.grid(True, axis='y', linestyle='--')
                    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend
                    plt.savefig('./results/emotion_trends.png')
                    plt.close()
                    print("Emotion trends plot saved to ./results/emotion_trends.png")
                else:
                    print("Not enough data points or time periods for meaningful trend analysis after resampling.")
            except Exception as e:
                 print(f"Could not generate emotion trends plot: {e}")
        else:
            print("Skipping emotion trend analysis: 'datetime' column not found, not datetime type, or empty.")

    except Exception as e:
        print(f"Could not generate emotion distribution plot: {e}")

    print("--- Emotion Analysis Finished ---")
    return df

# --- Model Saving ---
def save_model(vectorizer, model, identifier="sentiment"):
    """Save the trained model and vectorizer."""
    if vectorizer is None or model is None:
        print("Skipping model saving: vectorizer or model is None.")
        return

    try:
        model_filename = f'./results/model_{identifier}.pkl'
        vectorizer_filename = f'./results/vectorizer_{identifier}.pkl'
        with open(model_filename, 'wb') as f:
            pickle.dump(model, f)
        with open(vectorizer_filename, 'wb') as f:
            pickle.dump(vectorizer, f)
        print(f"Model saved to {model_filename}")
        print(f"Vectorizer saved to {vectorizer_filename}")
    except Exception as e:
        print(f"Error saving model/vectorizer: {e}")


# --- Processing Workflows ---
def batch_process(df_input):
    """Process the entire dataset in batch mode."""
    print("\n★★★ Starting Batch Processing Workflow ★★★")
    start_time = time.time()
    if df_input is None or df_input.empty:
        print("Error: Input DataFrame is empty. Cannot perform batch processing.")
        return None
    df_processed = df_input.copy() # Work on a copy

    # --- Steps ---
    # 1. Train sentiment model
    vectorizer, model = train_sentiment_model(df_processed)
    if model is not None:
        save_model(vectorizer, model, identifier="batch_sentiment")

    # 2. Detect fake reviews
    df_processed = detect_fake_reviews(df_processed)

    # 3. Analyze aspects
    df_processed = analyze_aspects(df_processed)

    # 4. Analyze emotions
    df_processed = analyze_emotions(df_processed)
    # --- End Steps ---

    # Save processed data
    try:
        output_file = "./results/processed_amazon_reviews_batch.parquet"
        # Select relevant columns to save if df gets too large
        columns_to_save = [col for col in ['reviewText', 'rating', 'category', 'datetime', 'cleaned_text', 'sentiment_class', 'is_fake', 'suspicious_score', 'detected_aspects', 'emotion'] if col in df_processed.columns]
        df_processed[columns_to_save].to_parquet(output_file, index=False)
        print(f"\nBatch processed data saved to {output_file}")
    except Exception as e:
        print(f"Error saving batch processed data: {e}")

    end_time = time.time()
    print(f"★★★ Batch Processing Completed (Duration: {end_time - start_time:.2f} seconds) ★★★")
    return df_processed

def stream_process(df_input, interval=100):
    """Process the dataset in streaming mode (Simulation)."""
    print(f"\n★★★ Starting Stream Processing Simulation (Chunk Size: {interval}) ★★★")
    start_time = time.time()
    if df_input is None or df_input.empty:
        print("Error: Input DataFrame is empty. Cannot perform stream processing simulation.")
        return None
    df_processed = df_input.copy() # Work on a copy
    df_processed['stream_sentiment_score'] = np.nan # Initialize column for stream results

    # Initialize self-adaptive model (Placeholder - if you had one)
    # adaptive_model = SelfAdaptiveModel() #Commented out

    # Process in chunks to simulate streaming
    num_reviews = len(df_processed)
    chunk_size = min(interval, num_reviews)
    if chunk_size <= 0:
        print("No data to process in stream mode.")
        return df_processed

    num_chunks = (num_reviews + chunk_size - 1) // chunk_size # Handle last partial chunk
    print(f"Total reviews: {num_reviews}, Chunks: {num_chunks}")

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min(start_idx + chunk_size, num_reviews)
        # Use .loc for slicing to ensure we modify the original df_processed copy
        chunk_indices = df_processed.index[start_idx:end_idx]
        chunk = df_processed.loc[chunk_indices]

        print(f"\nProcessing chunk {i+1}/{num_chunks} (Indices {start_idx}-{end_idx-1}, {len(chunk)} reviews)...")

        # --- Stream Processing Logic ---
        # Placeholder: Assign random sentiment score
        # Replace with actual model prediction (e.g., adaptive_model.predict(chunk))
        print("Applying placeholder stream sentiment analysis...")
        chunk_scores = np.random.uniform(-1, 1, size=len(chunk))
        # -----------------------------

        # Update the main DataFrame slice directly using .loc
        df_processed.loc[chunk_indices, 'stream_sentiment_score'] = chunk_scores
        print(f"Updated sentiment scores for chunk {i+1}.")

        # Simulate real-time processing delay (optional)
        # time.sleep(0.05)

    # Save processed data for streaming simulation
    try:
        output_file = "./results/processed_amazon_reviews_stream.parquet"
        columns_to_save = [col for col in ['reviewText', 'rating', 'category', 'datetime', 'stream_sentiment_score'] if col in df_processed.columns]
        df_processed[columns_to_save].to_parquet(output_file, index=False)
        print(f"\nStream processed data (with scores) saved to {output_file}")
    except Exception as e:
        print(f"Error saving stream processed data: {e}")

    end_time = time.time()
    print(f"★★★ Stream Processing Simulation Completed (Duration: {end_time - start_time:.2f} seconds) ★★★")
    return df_processed

# --- Main Function (primarily for Command-Line Execution) ---
# This function uses argparse and is intended for running the code as a script.
# It WILL NOT be executed automatically when this cell is run in Jupyter.
def main():
    """Main function to orchestrate the analysis pipeline via command line."""
    parser = argparse.ArgumentParser(description='Amazon Review Analysis System (Script Mode)')
    parser.add_argument('--categories', type=str, nargs='+',
                        default=['Electronics', 'Books', 'Home', 'Health'], # Default categories for placeholder
                        help='Categories to analyze (used if loading from HF)')
    parser.add_argument('--mode', type=str, choices=['batch', 'stream'], default='batch',
                        help='Processing mode: batch or stream')
    parser.add_argument('--sample_size', type=int, default=1000, # Smaller default for script testing
                        help='Number of reviews to sample (used if loading/generating placeholder)')
    parser.add_argument('--input_file', type=str, default="filtered_amazon_reviews.parquet",
                        help='Path to pre-filtered Parquet file')
    parser.add_argument('--use_hf', action='store_true',
                        help='Force loading from Hugging Face (placeholder generation)')
    parser.add_argument('--stream_interval', type=int, default=100,
                        help='Chunk size for stream processing simulation')

    # This line WILL CAUSE AN ERROR if main() is called directly within Jupyter
    # because Jupyter passes its own arguments like -f
    args = parser.parse_args()

    print(f"\n--- Running Analysis via main() [Script Mode] ---")
    print(f"Mode: {args.mode}, Categories: {args.categories}, Sample Size: {args.sample_size}")
    print(f"Input File: {args.input_file}, Use Placeholder/HF: {args.use_hf}, Stream Interval: {args.stream_interval}")

    # --- Data Loading (Script Mode) ---
    df_script = None
    if not args.use_hf:
        try:
            print(f"Attempting to load script data from: {args.input_file}")
            df_script = pd.read_parquet(args.input_file)
            print(f"Loaded {len(df_script)} reviews from '{args.input_file}'.")
        except FileNotFoundError:
            print(f"File '{args.input_file}' not found.")
            if not args.use_hf:
                print("Falling back to generating placeholder data.")
                args.use_hf = True
        except Exception as e:
            print(f"Error loading file '{args.input_file}': {e}")
            print("Falling back to generating placeholder data.")
            args.use_hf = True

    if args.use_hf or df_script is None:
        print(f"Generating placeholder data for script execution (Size: {args.sample_size})...")
        n_samples = args.sample_size
        texts = [f"Script review {i+1} about a {np.random.choice(args.categories)} item." for i in range(n_samples)]
        ratings = np.random.randint(1, 6, size=n_samples)
        cats = np.random.choice(args.categories, size=n_samples)
        df_script = pd.DataFrame({
            'reviewText': texts,
            'rating': ratings,
            'category': cats,
            'datetime': pd.to_datetime(np.random.randint(1640995200, 1672531199, size=n_samples), unit='s') # Random dates
            })
        print(f"Generated {len(df_script)} placeholder reviews for script.")
        # Optionally save this generated data
        # try:
        #     df_script.to_parquet(args.input_file)
        #     print(f"Saved generated placeholder data to '{args.input_file}'.")
        # except Exception as e:
        #      print(f"Could not save generated placeholder data: {e}")

    if df_script is None or df_script.empty:
        print("Error: No data loaded or generated. Exiting script mode.")
        return # Exit if data loading failed

    print(f"Proceeding with {len(df_script)} reviews in script mode.")

    # --- Process data based on mode (Script Mode) ---
    if args.mode == 'batch':
        df_processed_script = batch_process(df_script)
    elif args.mode == 'stream':
        df_processed_script = stream_process(df_script, interval=args.stream_interval)
    else:
        print(f"Error: Unknown mode '{args.mode}' in script execution.")
        return

    print("\n--- Analysis via main() [Script Mode] completed successfully. ---")


# =============================================================================
# Section 3: Interactive Execution Configuration
# =============================================================================
# Configure parameters for running the analysis interactively within this notebook cell

# --- Data Loading Config ---
interactive_input_parquet_file = "filtered_amazon_reviews.parquet"
interactive_use_placeholder_data = False # Set to True to generate placeholder data if file is not found
interactive_placeholder_sample_size = 500 # Number of placeholder reviews to generate
interactive_placeholder_categories = ['Electronics', 'Books', 'Home', 'Health', 'Toys']

# --- Analysis Config ---
# Choose the mode: 'batch' or 'stream'
interactive_analysis_mode = 'batch'
# interactive_analysis_mode = 'stream'

# Set stream interval if using stream mode
interactive_stream_interval = 50 # Process in smaller chunks for faster feedback

# =============================================================================
# Section 4: Interactive Data Loading (Runs when cell executes)
# =============================================================================
print("\n===============================================")
print("=== Starting Interactive Data Loading Section ===")
print("===============================================")

df_interactive = None # Initialize dataframe variable

try:
    print(f"\nAttempting to load interactive data from: {interactive_input_parquet_file}")
    df_interactive = pd.read_parquet(interactive_input_parquet_file)
    print(f"Successfully loaded {len(df_interactive)} reviews from '{interactive_input_parquet_file}'.")
except FileNotFoundError:
    print(f"File '{interactive_input_parquet_file}' not found.")
    if interactive_use_placeholder_data:
        print(f"\nGenerating {interactive_placeholder_sample_size} placeholder reviews for interactive use...")
        n_samples = interactive_placeholder_sample_size
        categories = interactive_placeholder_categories
        texts = [f"Interactive placeholder review {i+1} about a {categories[i%len(categories)]} item." for i in range(n_samples)]
        ratings = np.random.randint(1, 6, size=n_samples)
        cats = np.random.choice(categories, size=n_samples)
        df_interactive = pd.DataFrame({
            'reviewText': texts,
            'rating': ratings,
            'category': cats,
            'datetime': pd.to_datetime(np.random.randint(1640995200, 1704067199, size=n_samples), unit='s') # Random dates 2022-2023
            })
        print(f"Generated {len(df_interactive)} placeholder reviews.")
        # Optionally save the generated data
        # try:
        #     df_interactive.to_parquet(interactive_input_parquet_file)
        #     print(f"Saved generated placeholder data to '{interactive_input_parquet_file}'.")
        # except Exception as e:
        #     print(f"Could not save generated placeholder data: {e}")
    else:
        print("\nPlaceholder data generation is disabled ('interactive_use_placeholder_data' is False).")
        print(f"Please create the file '{interactive_input_parquet_file}' or set the flag to True.")
except Exception as e:
    print(f"\nAn error occurred loading '{interactive_input_parquet_file}': {e}")

# Display the first few rows if data was loaded/generated
if df_interactive is not None and not df_interactive.empty:
    print("\nInteractive Data loaded/generated successfully:")
    display(df_interactive.head())
    print(f"\nInteractive DataFrame Info:")
    df_interactive.info()
else:
    print("\nNo interactive data available for processing.")

print("\n=== Finished Interactive Data Loading Section ===")


# =============================================================================
# Section 5: Interactive Analysis Execution (Runs when cell executes)
# =============================================================================
print("\n===================================================")
print("=== Starting Interactive Analysis Execution Section ===")
print("===================================================")

# --- Check if data is loaded ---
if df_interactive is not None and not df_interactive.empty:
    print(f"\nStarting interactive analysis in '{interactive_analysis_mode}' mode...")
    df_processed_interactive = None # Initialize result df

    if interactive_analysis_mode == 'batch':
        # Run the batch process function
        df_processed_interactive = batch_process(df_interactive) # Pass the loaded interactive df

    elif interactive_analysis_mode == 'stream':
        # Run the stream process function
        df_processed_interactive = stream_process(df_interactive, interval=interactive_stream_interval) # Pass the loaded df and interval

    else:
        print(f"Error: Unknown interactive mode '{interactive_analysis_mode}'. Choose 'batch' or 'stream' in Section 3.")

    # Display results if processing occurred
    if df_processed_interactive is not None:
         print(f"\n--- Interactive '{interactive_analysis_mode}' Analysis Finished ---")
         print("Processed DataFrame head:")
         display(df_processed_interactive.head())
         # Optional: display more details
         # print("\nColumns added/modified:")
         # display(df_processed_interactive.columns)
         # if 'sentiment_class' in df_processed_interactive.columns: display(df_processed_interactive[['rating', 'sentiment_class']].head())
         # if 'is_fake' in df_processed_interactive.columns: display(df_processed_interactive[['rating', 'is_fake', 'suspicious_score']].head())
         # if 'detected_aspects' in df_processed_interactive.columns: display(df_processed_interactive[['detected_aspects']].head())
         # if 'emotion' in df_processed_interactive.columns: display(df_processed_interactive[['rating', 'emotion']].head())
         # if 'stream_sentiment_score' in df_processed_interactive.columns: display(df_processed_interactive[['stream_sentiment_score']].head())
    else:
        print("\nAnalysis function did not return a DataFrame (likely due to an error during processing).")

else:
    print("\nCannot run analysis: Interactive data ('df_interactive') was not loaded successfully in Section 4.")

print("\n=== Finished Interactive Analysis Execution Section ===")
print("===================================================")
print("=== End of Cell Execution ===")
print("===================================================")


# =============================================================================
# Section 6: Guard for Command-Line Execution
# =============================================================================
# This block ensures main() is ONLY called when the script is run directly
# from the command line (e.g., `python your_script_name.py --mode batch`).
# It WILL NOT run automatically when this cell is executed in Jupyter Notebook/Lab.
if __name__ == "__main__":
    # Note: If run as a script, the interactive sections 4 and 5 above will still execute
    # before this main() function is called. Consider refactoring further if cleaner
    # separation between interactive and script execution is needed for a .py file.
    main()

Downloading NLTK resources (if necessary)...
NLTK resources checked/downloaded.
Results directory './results' checked/created.

=== Starting Interactive Data Loading Section ===

Attempting to load interactive data from: filtered_amazon_reviews.parquet
Successfully loaded 10000 reviews from 'filtered_amazon_reviews.parquet'.

Interactive Data loaded/generated successfully:


Unnamed: 0,rating,review_title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,category,datetime,reviewText,main_category,product_title,average_rating,store
0,5.0,A fabulous new installment to the series!,I LOVE this game. Really looking forward to ne...,[],B01N3NNPAB,B01GY35HKE,AF4H3UFUUEDLMESMC3SCBPQWUIQA,1494468180000,0,False,Video Games,2017-05-11 07:33:00.000,I LOVE this game. Really looking forward to ne...,Video Games,Mass Effect Andromeda Deluxe - Xbox One,4.3,Electronic Arts
1,5.0,Waterloo: Tabletop Wargaming in the Age of Nap...,My son has been a Warhammer enthusiast for the...,[],1907964177,1907964177,AE4PES27ANXCRHV3NSYDL3Z62ZZA,1325389401000,1,False,Video Games,2012-01-01 09:13:21.000,My son has been a Warhammer enthusiast for the...,Books,Warhammer Historical: Waterloo,5.0,Mark A. Latham (Author)
2,5.0,Very easy to instal,Refrigerator,[],B06VW9HKXF,B06VW9HKXF,AH4ULRXVQOWLDB2DWIVIGK3WFEHQ,1674241249612,0,True,Appliances,2023-01-21 00:30:49.612,Refrigerator,Appliances,GOLDEN ICEPURE DA29-00020B Refrigerator Water ...,4.7,GOLDEN ICEPURE
3,5.0,Easy to install.,Easy to install.,[],B07P3B366X,B07P3B366X,AGJJVKCCOTLJC7AGIFDNP4AECTRA,1680651776518,0,True,Appliances,2023-04-05 05:12:56.518,Easy to install.,Tools & Home Improvement,LIFETIME WARRANTY WR57X10051 Refrigerator Dual...,4.6,BlueStars
4,1.0,sucks!,Cheap plastic. Soft case. My son broke in with...,[],B007PX6MFM,B007PX6MFM,AHB66TQ4PPSZG2EDUUBEGTRZCOCQ,1419384559000,0,True,Video Games,2014-12-24 06:59:19.000,Cheap plastic. Soft case. My son broke in with...,Computers,Insten 28-in-1 Game Card Case Compatible with ...,4.5,eForCity



Interactive DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   rating             10000 non-null  float64       
 1   review_title       10000 non-null  object        
 2   text               10000 non-null  object        
 3   images             10000 non-null  object        
 4   asin               10000 non-null  object        
 5   parent_asin        10000 non-null  object        
 6   user_id            10000 non-null  object        
 7   timestamp          10000 non-null  int64         
 8   helpful_vote       10000 non-null  int64         
 9   verified_purchase  10000 non-null  bool          
 10  category           10000 non-null  object        
 11  datetime           10000 non-null  datetime64[ns]
 12  reviewText         10000 non-null  object        
 13  main_category      9858 non-null 


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='is_fake', y='category', data=fake_by_category, palette="viridis")


Category distribution plot saved to ./results/fake_category_distribution.png
--- Fake Review Detection Finished ---

--- Analyzing Review Aspects (Placeholder: Nouns) ---
Applying placeholder aspect extraction (extracting nouns)...
Error during batch POS tagging: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\Manvi Bhala/nltk_data'
    - 'C:\\Users\\Manvi Bhala\\anaconda3\\envs\\tf_env\\nltk_data'
    - 'C:\\Users\\Manvi Bhala\\anaconda3\\envs\\tf_env\\share\\nltk_data'
    - 'C:\\Users\\Manvi Bhala\\anaconda3\\envs\\tf_env\\lib\\nltk_data'
    - 'C:\\Users\\Manvi Bhala\\AppData\\Roaming\\nltk_data'
 


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y='emotion', data=df, order=ordered_actual_emotions, palette="coolwarm")
  emotion_trends = df_trends.set_index('datetime').groupby('emotion').resample(resample_period).size().unstack(level=0).fillna(0)


Emotion trends plot saved to ./results/emotion_trends.png
--- Emotion Analysis Finished ---

Batch processed data saved to ./results/processed_amazon_reviews_batch.parquet
★★★ Batch Processing Completed (Duration: 15.70 seconds) ★★★

--- Interactive 'batch' Analysis Finished ---
Processed DataFrame head:


Unnamed: 0,rating,review_title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,main_category,product_title,average_rating,store,cleaned_text,reviewText_str,is_fake,suspicious_score,detected_aspects,emotion
0,5.0,A fabulous new installment to the series!,I LOVE this game. Really looking forward to ne...,[],B01N3NNPAB,B01GY35HKE,AF4H3UFUUEDLMESMC3SCBPQWUIQA,1494468180000,0,False,...,Video Games,Mass Effect Andromeda Deluxe - Xbox One,4.3,Electronic Arts,love game really looking forward new content s...,I LOVE this game. Really looking forward to ne...,False,0.14232,[],delighted
1,5.0,Waterloo: Tabletop Wargaming in the Age of Nap...,My son has been a Warhammer enthusiast for the...,[],1907964177,1907964177,AE4PES27ANXCRHV3NSYDL3Z62ZZA,1325389401000,1,False,...,Books,Warhammer Historical: Waterloo,5.0,Mark A. Latham (Author),son warhammer enthusiast past years christmas ...,My son has been a Warhammer enthusiast for the...,False,0.177134,[],delighted
2,5.0,Very easy to instal,Refrigerator,[],B06VW9HKXF,B06VW9HKXF,AH4ULRXVQOWLDB2DWIVIGK3WFEHQ,1674241249612,0,True,...,Appliances,GOLDEN ICEPURE DA29-00020B Refrigerator Water ...,4.7,GOLDEN ICEPURE,refrigerator,Refrigerator,True,0.520588,[],delighted
3,5.0,Easy to install.,Easy to install.,[],B07P3B366X,B07P3B366X,AGJJVKCCOTLJC7AGIFDNP4AECTRA,1680651776518,0,True,...,Tools & Home Improvement,LIFETIME WARRANTY WR57X10051 Refrigerator Dual...,4.6,BlueStars,easy install,Easy to install.,False,0.264731,[],delighted
4,1.0,sucks!,Cheap plastic. Soft case. My son broke in with...,[],B007PX6MFM,B007PX6MFM,AHB66TQ4PPSZG2EDUUBEGTRZCOCQ,1419384559000,0,True,...,Computers,Insten 28-in-1 Game Card Case Compatible with ...,4.5,eForCity,cheap plastic soft case son broke week waste m...,Cheap plastic. Soft case. My son broke in with...,False,0.141644,[],angry



=== Finished Interactive Analysis Execution Section ===
=== End of Cell Execution ===


usage: ipykernel_launcher.py [-h] [--categories CATEGORIES [CATEGORIES ...]] [--mode {batch,stream}]
                             [--sample_size SAMPLE_SIZE] [--input_file INPUT_FILE] [--use_hf]
                             [--stream_interval STREAM_INTERVAL]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\Manvi Bhala\AppData\Roaming\jupyter\runtime\kernel-65983b55-de1f-49a5-a21e-907b1fcb1742.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


<Figure size 1200x800 with 0 Axes>

In [30]:
import pandas as pd
import numpy as np
from transformers import pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import os

class EmotionAnalyzer:
    def __init__(self, model_name="j-hartmann/emotion-english-distilroberta-base"):
        """
        Initialize the EmotionAnalyzer with a pre-trained emotion detection model.
        
        Args:
            model_name (str): The name of the pre-trained model to use.
        """
        try:
            self.classifier = pipeline("text-classification", 
                                      model=model_name, 
                                      return_all_scores=True)
            self.emotions = ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]
            print(f"Emotion analyzer initialized with model: {model_name}")
        except Exception as e:
            print(f"Error initializing emotion analyzer: {e}")
            self.classifier = None
    
    def analyze_text(self, text):
        """
        Analyze the emotions in a text.
        
        Args:
            text (str): The text to analyze.
            
        Returns:
            dict: A dictionary mapping emotions to their scores.
        """
        if not self.classifier or not isinstance(text, str) or len(text.strip()) == 0:
            return {emotion: 0.0 for emotion in self.emotions}
        
        try:
            # Truncate text if it's too long
            max_length = 512
            if len(text) > max_length:
                text = text[:max_length]
            
            # Get emotion scores
            result = self.classifier(text)[0]
            
            # Convert to dictionary
            emotion_scores = {item['label']: item['score'] for item in result}
            
            return emotion_scores
        except Exception as e:
            print(f"Error analyzing text: {e}")
            return {emotion: 0.0 for emotion in self.emotions}
    
    def analyze_batch(self, texts, batch_size=32):
        """
        Analyze emotions in a batch of texts.
        
        Args:
            texts (list): A list of texts to analyze.
            batch_size (int): The batch size for processing.
            
        Returns:
            list: A list of dictionaries mapping emotions to their scores.
        """
        if not self.classifier:
            return [{emotion: 0.0 for emotion in self.emotions} for _ in texts]
        
        results = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            # Filter out non-string or empty texts
            batch = [text for text in batch if isinstance(text, str) and len(text.strip()) > 0]
            
            if not batch:
                continue
                
            try:
                # Truncate texts if they're too long
                batch = [text[:512] if len(text) > 512 else text for text in batch]
                
                # Get emotion scores for batch
                batch_results = self.classifier(batch)
                
                # Convert to dictionaries
                for result in batch_results:
                    emotion_scores = {item['label']: item['score'] for item in result}
                    results.append(emotion_scores)
            except Exception as e:
                print(f"Error analyzing batch: {e}")
                results.extend([{emotion: 0.0 for emotion in self.emotions} for _ in batch])
        
        return results
    
    def analyze_reviews(self, reviews_df, text_column='reviewText'):
        """
        Analyze emotions in a DataFrame of reviews.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews.
            text_column (str): The name of the column containing review text.
            
        Returns:
            pd.DataFrame: The input DataFrame with additional emotion columns.
        """
        if text_column not in reviews_df.columns:
            print(f"Column '{text_column}' not found in DataFrame.")
            return reviews_df
        
        # Get texts to analyze
        texts = reviews_df[text_column].tolist()
        
        # Analyze emotions
        print(f"Analyzing emotions in {len(texts)} reviews...")
        emotion_results = self.analyze_batch(texts)
        
        # Add emotion scores to DataFrame
        result_df = reviews_df.copy()
        
        # Initialize emotion columns
        for emotion in self.emotions:
            result_df[f'emotion_{emotion}'] = 0.0
        
        # Fill in emotion scores
        for i, emotion_dict in enumerate(emotion_results):
            for emotion, score in emotion_dict.items():
                result_df.loc[i, f'emotion_{emotion}'] = score
        
        # Add dominant emotion column
        if emotion_results:
            result_df['dominant_emotion'] = result_df[[f'emotion_{e}' for e in self.emotions]].idxmax(axis=1)
            result_df['dominant_emotion'] = result_df['dominant_emotion'].str.replace('emotion_', '')
        
        print("Emotion analysis completed.")
        return result_df
    
    def visualize_emotions(self, reviews_df, output_dir='./results'):
        """
        Create visualizations of emotion analysis results.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews with emotion scores.
            output_dir (str): Directory to save visualizations.
        """
        if 'dominant_emotion' not in reviews_df.columns:
            print("No emotion analysis results found in DataFrame.")
            return
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Emotion distribution
        plt.figure(figsize=(12, 8))
        sns.countplot(y='dominant_emotion', data=reviews_df, 
                     order=reviews_df['dominant_emotion'].value_counts().index)
        plt.title('Distribution of Emotions in Reviews')
        plt.xlabel('Count')
        plt.ylabel('Emotion')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'emotion_distribution.png'))
        plt.close()
        
        # Emotion by category
        if 'category' in reviews_df.columns:
            plt.figure(figsize=(14, 10))
            emotion_by_category = pd.crosstab(reviews_df['category'], reviews_df['dominant_emotion'])
            emotion_by_category_pct = emotion_by_category.div(emotion_by_category.sum(axis=1), axis=0)
            
            emotion_by_category_pct.plot(kind='bar', stacked=True, colormap='viridis')
            plt.title('Emotion Distribution by Category')
            plt.xlabel('Category')
            plt.ylabel('Percentage')
            plt.legend(title='Emotion')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'emotion_by_category.png'))
            plt.close()
        
        # Emotion by rating
        if 'rating' in reviews_df.columns:
            plt.figure(figsize=(14, 8))
            emotion_by_rating = pd.crosstab(reviews_df['rating'], reviews_df['dominant_emotion'])
            emotion_by_rating_pct = emotion_by_rating.div(emotion_by_rating.sum(axis=1), axis=0)
            
            emotion_by_rating_pct.plot(kind='bar', stacked=True, colormap='viridis')
            plt.title('Emotion Distribution by Rating')
            plt.xlabel('Rating')
            plt.ylabel('Percentage')
            plt.legend(title='Emotion')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'emotion_by_rating.png'))
            plt.close()
        
        print(f"Emotion visualizations saved to {output_dir}")


# Example usage
if __name__ == "__main__":
    # Initialize analyzer
    analyzer = EmotionAnalyzer()
    
    # Example text
    text = "I absolutely love this product! It exceeded all my expectations."
    result = analyzer.analyze_text(text)
    print(f"Emotion analysis for '{text}':")
    for emotion, score in result.items():
        print(f"  {emotion}: {score:.4f}")
    
    # Load reviews
    try:
        reviews_df = pd.read_parquet("filtered_amazon_reviews.parquet")
        print(f"Loaded {len(reviews_df)} reviews.")
        
        # Analyze a sample of reviews
        sample_df = reviews_df.sample(min(100, len(reviews_df)))
        result_df = analyzer.analyze_reviews(sample_df)
        
        # Visualize results
        analyzer.visualize_emotions(result_df)
        
    except FileNotFoundError:
        print("Reviews file not found. Run data_loader.py first.")

Device set to use cpu


Emotion analyzer initialized with model: j-hartmann/emotion-english-distilroberta-base
Emotion analysis for 'I absolutely love this product! It exceeded all my expectations.':
  anger: 0.0045
  disgust: 0.0012
  fear: 0.0012
  joy: 0.9558
  neutral: 0.0092
  sadness: 0.0016
  surprise: 0.0264
Loaded 10000 reviews.
Analyzing emotions in 100 reviews...
Emotion analysis completed.
Emotion visualizations saved to ./results


<Figure size 1400x1000 with 0 Axes>

<Figure size 1400x800 with 0 Axes>

In [31]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class ReviewNetworkAnalyzer:
    def __init__(self):
        """
        Initialize the ReviewNetworkAnalyzer.
        """
        pass
    
    def build_reviewer_network(self, reviews_df):
        print(reviews_df.columns)
        """
        Build a network of reviewers based on product co-reviewing.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews with reviewer_id and product_id.
            
        Returns:
            nx.Graph: A NetworkX graph representing the reviewer network.
        """
        if 'user_id' not in reviews_df.columns or 'asin' not in reviews_df.columns:
            print("Required columns 'user_id' and 'asin' not found in DataFrame.")
            return nx.Graph()
        
        # Create a graph
        G = nx.Graph()
        
        # Add reviewers as nodes
        reviewers = reviews_df['user_id'].unique()
        G.add_nodes_from(reviewers)
        
        # Group reviews by product
        product_reviewers = reviews_df.groupby('asin')['user_id'].apply(list)
        
        # Add edges between reviewers who reviewed the same product
        for reviewers_list in product_reviewers:
            if len(reviewers_list) > 1:
                for i in range(len(reviewers_list)):
                    for j in range(i+1, len(reviewers_list)):
                        reviewer1 = reviewers_list[i]
                        reviewer2 = reviewers_list[j]
                        if G.has_edge(reviewer1, reviewer2):
                            G[reviewer1][reviewer2]['weight'] += 1
                        else:
                            G.add_edge(reviewer1, reviewer2, weight=1)
        
        return G
    
    def build_product_network(self, reviews_df):
        """
        Build a network of products based on reviewer overlap.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews with reviewer_id and product_id.
            
        Returns:
            nx.Graph: A NetworkX graph representing the product network.
        """
        if 'user_id' not in reviews_df.columns or 'asin' not in reviews_df.columns:
            print("Required columns 'user_id' and 'asin' not found in DataFrame.")
            return nx.Graph()
        
        # Create a graph
        G = nx.Graph()
        
        # Add products as nodes
        products = reviews_df['asin'].unique()
        G.add_nodes_from(products)
        
        # Group reviews by reviewer
        reviewer_products = reviews_df.groupby('user_id')['asin'].apply(list)
        
        # Add edges between products reviewed by the same reviewer
        for products_list in reviewer_products:
            if len(products_list) > 1:
                for i in range(len(products_list)):
                    for j in range(i+1, len(products_list)):
                        product1 = products_list[i]
                        product2 = products_list[j]
                        if G.has_edge(product1, product2):
                            G[product1][product2]['weight'] += 1
                        else:
                            G.add_edge(product1, product2, weight=1)
        
        return G
    
    def identify_suspicious_clusters(self, G, threshold=0.8):
        """
        Identify suspicious clusters in the network.
        
        Args:
            G (nx.Graph): A NetworkX graph.
            threshold (float): Threshold for considering a cluster suspicious.
            
        Returns:
            list: List of suspicious clusters (lists of node IDs).
        """
        # Find connected components
        components = list(nx.connected_components(G))
        
        # Calculate density for each component
        suspicious_clusters = []
        for component in components:
            if len(component) < 3:
                continue
            
            subgraph = G.subgraph(component)
            density = nx.density(subgraph)
            
            if density > threshold:
                suspicious_clusters.append(list(component))
        
        return suspicious_clusters
    
    def visualize_network(self, G, suspicious_clusters=None, output_dir='./results', filename='review_network.png'):
        """
        Visualize the network with suspicious clusters highlighted.
        
        Args:
            G (nx.Graph): A NetworkX graph.
            suspicious_clusters (list): List of suspicious clusters.
            output_dir (str): Directory to save the visualization.
            filename (str): Filename for the visualization.
        """
        if len(G.nodes()) == 0:
            print("Empty graph, nothing to visualize.")
            return
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Limit visualization to a manageable size
        if len(G.nodes()) > 1000:
            print(f"Graph too large ({len(G.nodes())} nodes), sampling for visualization.")
            # Sample nodes
            sampled_nodes = list(G.nodes())[:1000]
            G = G.subgraph(sampled_nodes)
        
        plt.figure(figsize=(12, 12))
        
        # Set node colors
        node_colors = ['#1f77b4'] * len(G.nodes())  # Default blue
        
        # Highlight suspicious clusters
        if suspicious_clusters:
            for i, cluster in enumerate(suspicious_clusters):
                for node in cluster:
                    if node in G.nodes():
                        node_idx = list(G.nodes()).index(node)
                        node_colors[node_idx] = '#d62728'  # Red for suspicious
        
        # Set node sizes based on degree
        node_sizes = [10 + 5 * G.degree(node) for node in G.nodes()]
        
        # Set edge widths based on weight
        edge_widths = [0.5 + 0.5 * G[u][v].get('weight', 1) for u, v in G.edges()]
        
        # Draw the network
        pos = nx.spring_layout(G, seed=42)
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.7)
        nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.3, edge_color='gray')
        
        plt.title('Review Network Analysis')
        plt.axis('off')
        plt.tight_layout()
        
        # Save the figure
        plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f"Network visualization saved to {os.path.join(output_dir, filename)}")
    
    def analyze_review_network(self, reviews_df, output_dir='./results'):
        """
        Perform a complete network analysis on the reviews.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews.
            output_dir (str): Directory to save results.
            
        Returns:
            dict: Dictionary containing analysis results.
        """
        results = {}
        
        # Build reviewer network
        reviewer_network = self.build_reviewer_network(reviews_df)
        results['reviewer_network'] = reviewer_network
        
        # Identify suspicious clusters
        suspicious_clusters = self.identify_suspicious_clusters(reviewer_network)
        results['suspicious_clusters'] = suspicious_clusters
        
        # Visualize network
        self.visualize_network(reviewer_network, suspicious_clusters, output_dir)
        
        # Calculate network metrics
        if len(reviewer_network.nodes()) > 0:
            results['network_metrics'] = {
                'num_nodes': len(reviewer_network.nodes()),
                'num_edges': len(reviewer_network.edges()),
                'avg_degree': sum(dict(reviewer_network.degree()).values()) / len(reviewer_network.nodes()),
                'density': nx.density(reviewer_network),
                'num_components': nx.number_connected_components(reviewer_network),
                'num_suspicious_clusters': len(suspicious_clusters)
            }
        
        return results


# Example usage
if __name__ == "__main__":
    # Initialize analyzer
    analyzer = ReviewNetworkAnalyzer()
    
    # Load reviews
    try:
        reviews_df = pd.read_parquet("filtered_amazon_reviews.parquet")
        print(f"Loaded {len(reviews_df)} reviews.")
        
        # Analyze network
        results = analyzer.analyze_review_network(reviews_df)
        
        # Print results
        if 'network_metrics' in results:
            print("\nNetwork Metrics:")
            for metric, value in results['network_metrics'].items():
                print(f"  {metric}: {value}")
        
        if 'suspicious_clusters' in results:
            print(f"\nFound {len(results['suspicious_clusters'])} suspicious clusters.")
            
    except FileNotFoundError:
        print("Reviews file not found. Run data_loader.py first.")

Loaded 10000 reviews.
Index(['rating', 'review_title', 'text', 'images', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'category',
       'datetime', 'reviewText', 'main_category', 'product_title',
       'average_rating', 'store'],
      dtype='object')
Graph too large (9976 nodes), sampling for visualization.
Network visualization saved to ./results\review_network.png

Network Metrics:
  num_nodes: 9976
  num_edges: 4479
  avg_degree: 0.8979550922213312
  density: 9.002056062369235e-05
  num_components: 7992
  num_suspicious_clusters: 383

Found 383 suspicious clusters.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import time # For timing aspect extraction

# ==============================================================
# IMPORTANT ASSUMPTION:
# The 'AspectDetector' class MUST be defined and executed
# in a previous cell or part of the script BEFORE this code runs.
# We are removing the `from gnn_context_model import AspectDetector` line
# because AspectDetector is assumed to be already available in the scope.
# ==============================================================

# Download NLTK resources if needed
print("Checking/downloading NLTK resources (punkt, stopwords)...")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
print("NLTK resources checked.")


class AspectAnalyzer:
    def __init__(self):
        """
        Initialize the AspectAnalyzer with an AspectDetector.
        Assumes AspectDetector class is already defined in the environment.
        """
        try:
            # Instantiate the detector defined in a previous cell
            self.aspect_detector = AspectDetector()
            print("AspectDetector instance created successfully within AspectAnalyzer.")
        except NameError:
            print("\n *** ERROR: The 'AspectDetector' class is not defined. ***")
            print("Please ensure the cell defining 'AspectDetector' has been executed before this one.")
            raise # Re-raise the error to stop execution if AspectDetector isn't available
        except Exception as e:
            print(f"\n *** ERROR: An unexpected error occurred initializing AspectDetector: {e} ***")
            raise

        self.stop_words = set(stopwords.words('english'))

    def extract_aspects(self, reviews_df, text_column='reviewText', category_column='category'):
        """
        Extract aspects from review texts using the loaded AspectDetector.

        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews.
            text_column (str): The name of the column containing review text.
            category_column (str): The name of the column containing the category.

        Returns:
            pd.DataFrame: The input DataFrame with an added 'detected_aspects' column (list of strings).
                          Returns the original DataFrame if required columns are missing or an error occurs.
        """
        print(f"\n--- Starting Aspect Extraction (using '{text_column}' and '{category_column}') ---")
        start_time = time.time()

        if text_column not in reviews_df.columns or category_column not in reviews_df.columns:
            print(f"Error: Required columns '{text_column}' or '{category_column}' not found in DataFrame.")
            return reviews_df

        # Ensure the AspectDetector was initialized correctly
        if not hasattr(self, 'aspect_detector'):
             print("Error: AspectDetector was not properly initialized. Cannot extract aspects.")
             return reviews_df

        # Work on a copy to avoid modifying the original DataFrame directly
        result_df = reviews_df.copy()
        # Initialize the column with empty lists for flexibility
        result_df['detected_aspects'] = [[] for _ in range(len(result_df))]

        # Process each review
        processed_count = 0
        error_count = 0
        print(f"Processing {len(result_df)} reviews...")
        for index, row in result_df.iterrows():
            text = row[text_column]
            category = row[category_column]

            # Basic check for valid input types
            if not isinstance(text, str) or pd.isna(text) or not isinstance(category, str) or pd.isna(category):
                # Keep the empty list in 'detected_aspects' for this row
                continue

            try:
                # Detect aspects using the instance created in __init__
                aspects = self.aspect_detector.detect_aspects(text, category)
                # Ensure aspects is a list before assigning
                if isinstance(aspects, list):
                     result_df.loc[index, 'detected_aspects'] = aspects
                else:
                     # Handle cases where detect_aspects might not return a list
                     result_df.loc[index, 'detected_aspects'] = []
                processed_count += 1
            except Exception as e:
                # print(f"Warning: Error detecting aspects for review index {index}: {e}")
                error_count += 1
                # Keep the empty list on error

            # Optional: Add progress indicator for long runs
            if (index + 1) % 1000 == 0:
                print(f"  Processed {index + 1}/{len(result_df)} reviews...")

        end_time = time.time()
        print(f"--- Aspect Extraction Finished ---")
        print(f"Successfully processed: {processed_count} reviews.")
        if error_count > 0:
             print(f"Encountered errors in: {error_count} reviews.")
        print(f"Duration: {end_time - start_time:.2f} seconds.")
        return result_df

    def analyze_aspects_by_category(self, reviews_df_with_aspects, category_column='category'):
        """
        Analyze aspects by category. Requires 'detected_aspects' column.

        Args:
            reviews_df_with_aspects (pd.DataFrame): DataFrame containing reviews with a 'detected_aspects' column.
            category_column (str): The name of the category column.

        Returns:
            dict: Dictionary containing aspect analysis results by category. Keys are categories.
                  Returns empty dict if required columns are missing.
        """
        print("\n--- Analyzing Aspects by Category ---")
        if 'detected_aspects' not in reviews_df_with_aspects.columns or category_column not in reviews_df_with_aspects.columns:
            print(f"Error: Required columns 'detected_aspects' or '{category_column}' not found in DataFrame.")
            return {}

        results = {}
        valid_categories = reviews_df_with_aspects.dropna(subset=[category_column])[category_column].unique()
        print(f"Analyzing for categories: {list(valid_categories)}")

        # Group by category
        for category, group in reviews_df_with_aspects.groupby(category_column):
            # Count aspects
            aspect_counts = Counter()
            # Iterate safely, checking for None or non-list types
            for aspects in group['detected_aspects']:
                if isinstance(aspects, list):
                    aspect_counts.update(aspects)

            if not aspect_counts: # Skip category if no aspects found
                 print(f"  No aspects found for category: {category}")
                 continue

            # Calculate aspect frequencies
            total_reviews_in_group = len(group)
            # Calculate frequency based on reviews *that had aspects detected* if desired,
            # or total reviews in group. Using total reviews here.
            aspect_freq = {aspect: count / total_reviews_in_group for aspect, count in aspect_counts.items()}

            # Store results, sorting counts for easier viewing
            results[category] = {
                'aspect_counts': dict(sorted(aspect_counts.items(), key=lambda item: item[1], reverse=True)),
                'aspect_frequencies': dict(sorted(aspect_freq.items(), key=lambda item: item[1], reverse=True)),
                'total_reviews_in_category': total_reviews_in_group
            }
            print(f"  Analyzed category: {category} ({len(aspect_counts)} unique aspects found)")

        print("--- Finished Aspect Analysis by Category ---")
        return results

    def analyze_aspects_by_rating(self, reviews_df_with_aspects, rating_column='rating'):
        """
        Analyze aspects by rating. Requires 'detected_aspects' column.

        Args:
            reviews_df_with_aspects (pd.DataFrame): DataFrame containing reviews with a 'detected_aspects' column.
            rating_column (str): The name of the rating column.

        Returns:
            dict: Dictionary containing aspect analysis results by rating. Keys are ratings.
                  Returns empty dict if required columns are missing.
        """
        print("\n--- Analyzing Aspects by Rating ---")
        if 'detected_aspects' not in reviews_df_with_aspects.columns or rating_column not in reviews_df_with_aspects.columns:
            print(f"Error: Required columns 'detected_aspects' or '{rating_column}' not found in DataFrame.")
            return {}

        results = {}
        valid_ratings = reviews_df_with_aspects.dropna(subset=[rating_column])[rating_column].unique()
        print(f"Analyzing for ratings: {sorted(list(valid_ratings))}")

        # Group by rating
        for rating, group in reviews_df_with_aspects.groupby(rating_column):
            # Count aspects
            aspect_counts = Counter()
            for aspects in group['detected_aspects']:
                if isinstance(aspects, list):
                    aspect_counts.update(aspects)

            if not aspect_counts:
                print(f"  No aspects found for rating: {rating}")
                continue

            # Calculate aspect frequencies
            total_reviews_in_group = len(group)
            aspect_freq = {aspect: count / total_reviews_in_group for aspect, count in aspect_counts.items()}

            # Store results
            results[rating] = {
                'aspect_counts': dict(sorted(aspect_counts.items(), key=lambda item: item[1], reverse=True)),
                'aspect_frequencies': dict(sorted(aspect_freq.items(), key=lambda item: item[1], reverse=True)),
                'total_reviews_with_rating': total_reviews_in_group
            }
            print(f"  Analyzed rating: {rating} ({len(aspect_counts)} unique aspects found)")

        print("--- Finished Aspect Analysis by Rating ---")
        return results

    def visualize_aspects(self, reviews_df_with_aspects, output_dir='./results', top_n=20):
        """
        Create visualizations of aspect analysis results. Requires 'detected_aspects' column.

        Args:
            reviews_df_with_aspects (pd.DataFrame): DataFrame containing reviews with 'detected_aspects'.
            output_dir (str): Directory to save visualizations.
            top_n (int): Number of top aspects to show in plots.
        """
        print(f"\n--- Visualizing Aspects (Top {top_n}) ---")
        if 'detected_aspects' not in reviews_df_with_aspects.columns:
            print("Error: 'detected_aspects' column not found. Cannot create visualizations.")
            return
        if reviews_df_with_aspects['detected_aspects'].isnull().all():
             print("Warning: 'detected_aspects' column contains only null/empty values. No visualizations generated.")
             return

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # --- Overall Aspect Distribution ---
        print("Generating overall aspect distribution plot...")
        all_aspects_list = []
        for aspects in reviews_df_with_aspects['detected_aspects']:
            if isinstance(aspects, list):
                all_aspects_list.extend(aspects)

        if not all_aspects_list:
            print("No aspects found in any reviews. Skipping overall distribution plot.")
        else:
            aspect_counts = Counter(all_aspects_list)
            aspect_df = pd.DataFrame(aspect_counts.items(), columns=['aspect', 'count'])\
                          .sort_values('count', ascending=False)

            plt.figure(figsize=(12, max(6, len(aspect_df.head(top_n)) * 0.4))) # Adjust height
            sns.barplot(x='count', y='aspect', data=aspect_df.head(top_n), palette='viridis')
            plt.title(f'Top {min(top_n, len(aspect_df))} Aspects Mentioned Overall')
            plt.xlabel('Total Count')
            plt.ylabel('Aspect')
            plt.tight_layout()
            try:
                plt.savefig(os.path.join(output_dir, 'aspect_distribution_overall.png'))
                print(f"Saved: aspect_distribution_overall.png")
            except Exception as e:
                print(f"Error saving aspect_distribution_overall.png: {e}")
            plt.close()


        # --- Aspect by Category ---
        if 'category' in reviews_df_with_aspects.columns:
            print("\nGenerating aspect distribution by category plot...")
            category_results = self.analyze_aspects_by_category(reviews_df_with_aspects)

            if not category_results:
                print("No category analysis results. Skipping aspect by category plot.")
            else:
                category_aspect_data = []
                for category, results_dict in category_results.items():
                    # Take top N aspects for this specific category
                    top_aspects_cat = list(results_dict['aspect_counts'].items())[:top_n]
                    for aspect, count in top_aspects_cat:
                        category_aspect_data.append({
                            'category': category,
                            'aspect': aspect,
                            'count': count
                        })

                if not category_aspect_data:
                     print("No aspect data found across categories. Skipping category plot.")
                else:
                    category_aspect_df = pd.DataFrame(category_aspect_data)

                    # Determine plot dimensions based on number of categories and aspects
                    num_categories = category_aspect_df['category'].nunique()
                    plot_height = max(6, num_categories * 1.5) # Adjust height based on categories
                    plot_width = max(10, top_n * 0.6) # Adjust width based on aspects shown

                    plt.figure(figsize=(plot_width, plot_height))
                    # Use barplot directly for better control over figure size
                    sns.barplot(x='count', y='category', hue='aspect', data=category_aspect_df,
                                dodge=False) # Use dodge=False if too many aspects overlap, or adjust width
                    plt.title(f'Top {top_n} Aspects by Category')
                    plt.xlabel('Count within Category')
                    plt.ylabel('Category')
                    plt.legend(title='Aspect', bbox_to_anchor=(1.05, 1), loc='upper left')
                    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend
                    try:
                        plt.savefig(os.path.join(output_dir, 'aspect_by_category.png'))
                        print(f"Saved: aspect_by_category.png")
                    except Exception as e:
                        print(f"Error saving aspect_by_category.png: {e}")
                    plt.close()


        # --- Aspect by Rating ---
        if 'rating' in reviews_df_with_aspects.columns:
            print("\nGenerating aspect distribution by rating plot...")
            rating_results = self.analyze_aspects_by_rating(reviews_df_with_aspects)

            if not rating_results:
                print("No rating analysis results. Skipping aspect by rating plot.")
            else:
                rating_aspect_data = []
                for rating, results_dict in rating_results.items():
                    # Take top N aspects for this specific rating
                    top_aspects_rat = list(results_dict['aspect_counts'].items())[:top_n]
                    for aspect, count in top_aspects_rat:
                        rating_aspect_data.append({
                            'rating': str(rating), # Convert rating to string for categorical plotting
                            'aspect': aspect,
                            'count': count
                        })

                if not rating_aspect_data:
                    print("No aspect data found across ratings. Skipping rating plot.")
                else:
                    rating_aspect_df = pd.DataFrame(rating_aspect_data)

                    plt.figure(figsize=(12, 7)) # Adjust size as needed
                    sns.barplot(x='rating', y='count', hue='aspect', data=rating_aspect_df, dodge=True)
                    plt.title(f'Top {top_n} Aspects by Rating')
                    plt.xlabel('Rating')
                    plt.ylabel('Count within Rating')
                    plt.legend(title='Aspect', bbox_to_anchor=(1.05, 1), loc='upper left')
                    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend
                    try:
                        plt.savefig(os.path.join(output_dir, 'aspect_by_rating.png'))
                        print(f"Saved: aspect_by_rating.png")
                    except Exception as e:
                        print(f"Error saving aspect_by_rating.png: {e}")
                    plt.close()

        print("--- Finished Aspect Visualizations ---")


# ==============================================================
# Example usage in the `if __name__ == "__main__":` block
# ==============================================================
if __name__ == "__main__":
    print("\n--- Running Aspect Analysis Example ---")

    # Define input file and output directory
    input_file = "filtered_amazon_reviews.parquet"
    output_directory = "./results"

    # --- Try loading data ---
    reviews_df = None
    try:
        print(f"Attempting to load reviews from: {input_file}")
        reviews_df = pd.read_parquet(input_file)
        # Optional: Sample the data for faster testing
        # sample_size = 1000
        # if len(reviews_df) > sample_size:
        #     print(f"Sampling {sample_size} reviews for testing...")
        #     reviews_df = reviews_df.sample(n=sample_size, random_state=42)
        print(f"Successfully loaded {len(reviews_df)} reviews.")

    except FileNotFoundError:
        print(f"Error: Input reviews file not found at '{input_file}'.")
        print("Please ensure the file exists or run the data preparation step first.")
    except Exception as e:
         print(f"Error loading Parquet file '{input_file}': {e}")


    # --- Proceed only if data is loaded and AspectDetector is likely available ---
    if reviews_df is not None and not reviews_df.empty:
        analyzer = None
        try:
            # --- Initialize Analyzer (this requires AspectDetector) ---
            analyzer = AspectAnalyzer()

            # --- Extract Aspects ---
            # Use the analyzer instance to call extract_aspects
            result_df_with_aspects = analyzer.extract_aspects(reviews_df) # Modify the dataframe in place (via copy)

            # --- Visualize Aspects ---
            # Check if aspect extraction was successful before visualizing
            if 'detected_aspects' in result_df_with_aspects.columns:
                 analyzer.visualize_aspects(result_df_with_aspects, output_dir=output_directory)

                 # --- Print Sample Results ---
                 print("\n--- Sample Results with Detected Aspects ---")
                 # Ensure 'detected_aspects' column exists before sampling/printing
                 sample = result_df_with_aspects.sample(min(5, len(result_df_with_aspects))) # Sample up to 5 rows
                 required_cols = ['reviewText', 'category', 'detected_aspects']
                 if all(col in sample.columns for col in required_cols):
                     for index, row in sample.iterrows():
                         text_preview = row['reviewText'][:100] if isinstance(row['reviewText'], str) else "[No Text]"
                         aspects_list = row['detected_aspects'] if isinstance(row['detected_aspects'], list) else []
                         print(f"\nReview Index {index}: {text_preview}...")
                         print(f"  Category: {row['category']}")
                         print(f"  Detected aspects: {aspects_list}")
                 else:
                      print("Could not display sample results as required columns are missing after processing.")

            else:
                print("\nAspect extraction did not produce the 'detected_aspects' column. Skipping visualization and sample results.")

        except NameError:
            # This catches the error if AspectDetector wasn't defined before AspectAnalyzer was initialized
            print("\nExecution stopped: AspectAnalyzer could not be initialized because AspectDetector was not found.")
            print("Please ensure the cell defining AspectDetector is run first.")
        except Exception as e:
            print(f"\nAn unexpected error occurred during analysis: {e}")
            import traceback
            traceback.print_exc()

    else:
        print("\nAnalysis cannot proceed as review data was not loaded.")

    print("\n--- Aspect Analysis Example Finished ---")

Checking/downloading NLTK resources (punkt, stopwords)...
NLTK resources checked.

--- Running Aspect Analysis Example ---
Attempting to load reviews from: filtered_amazon_reviews.parquet
Successfully loaded 10000 reviews.
AspectDetector instance created successfully within AspectAnalyzer.

--- Starting Aspect Extraction (using 'reviewText' and 'category') ---
Processing 10000 reviews...


In [None]:
import pandas as pd
import numpy as np
import os
import argparse
import time
from datetime import datetime
from data_loader import load_amazon_reviews_hf, preprocess_reviews
from aspect_analysis import AspectAnalyzer
from emotion_analyzer import EmotionAnalyzer
from network_analysis import ReviewNetworkAnalyzer
from domain_adapter import FakeReviewDetector
from context_aware_sentiment import SelfAdaptiveModel

class BatchProcessor:
    def __init__(self, output_dir='./results'):
        """
        Initialize the BatchProcessor.
        
        Args:
            output_dir (str): Directory to save results.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize analyzers
        self.aspect_analyzer = AspectAnalyzer()
        self.emotion_analyzer = EmotionAnalyzer()
        self.network_analyzer = ReviewNetworkAnalyzer()
        self.fake_detector = FakeReviewDetector()
        self.sentiment_model = SelfAdaptiveModel()
        
        print(f"BatchProcessor initialized. Results will be saved to {output_dir}")
    
    def process_reviews(self, reviews_df, sample_size=None):
        """
        Process reviews in batch mode.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews.
            sample_size (int): Number of reviews to sample (optional).
            
        Returns:
            pd.DataFrame: Processed DataFrame with analysis results.
        """
        start_time = time.time()
        print(f"Starting batch processing of {len(reviews_df)} reviews...")
        
        # Sample if needed
        if sample_size and len(reviews_df) > sample_size:
            reviews_df = reviews_df.sample(sample_size, random_state=42)
            print(f"Sampled {sample_size} reviews for processing.")
        
        # 1. Aspect Analysis
        print("Performing aspect analysis...")
        reviews_df = self.aspect_analyzer.extract_aspects(reviews_df)
        self.aspect_analyzer.visualize_aspects(reviews_df, self.output_dir)
        
        # 2. Emotion Analysis
        print("Performing emotion analysis...")
        reviews_df = self.emotion_analyzer.analyze_reviews(reviews_df)
        self.emotion_analyzer.visualize_emotions(reviews_df, self.output_dir)
        
        # 3. Network Analysis
        print("Performing network analysis...")
        network_results = self.network_analyzer.analyze_review_network(reviews_df, self.output_dir)
        
        # 4. Fake Review Detection
        print("Performing fake review detection...")
        if 'text' in reviews_df.columns and 'reviewer_id' in reviews_df.columns and 'timestamp' in reviews_df.columns:
            is_fake = self.fake_detector.analyze_fake_review_patterns(reviews_df)
            reviews_df['is_fake'] = is_fake
            
            # Calculate suspicious score
            suspicious_scores = np.zeros(len(reviews_df))
            if 'suspicious_clusters' in network_results:
                for cluster in network_results['suspicious_clusters']:
                    for reviewer_id in cluster:
                        suspicious_scores[reviews_df['reviewer_id'] == reviewer_id] += 1
            
            reviews_df['suspicious_score'] = suspicious_scores
            
            # Visualize fake review distribution
            self._visualize_fake_reviews(reviews_df)
        
        # 5. Context-Aware Sentiment Analysis
        print("Performing context-aware sentiment analysis...")
        reviews_df['sentiment_score'] = reviews_df.apply(
            lambda row: self.sentiment_model.analyze_with_context(
                row['reviewText'] if 'reviewText' in row else row['text'],
                row['category'] if 'category' in row else None
            ),
            axis=1
        )
        
        # Map sentiment scores to labels
        reviews_df['sentiment'] = reviews_df['sentiment_score'].apply(
            lambda x: 'positive' if x > 0.6 else ('negative' if x < 0.4 else 'neutral')
        )
        
        # Save processed data
        output_file = os.path.join(self.output_dir, 'processed_reviews.parquet')
        reviews_df.to_parquet(output_file)
        print(f"Processed data saved to {output_file}")
        
        elapsed_time = time.time() - start_time
        print(f"Batch processing completed in {elapsed_time:.2f} seconds.")
        
        return reviews_df
    
    def _visualize_fake_reviews(self, reviews_df):
        """
        Create visualizations for fake review analysis.
        
        Args:
            reviews_df (pd.DataFrame): DataFrame containing reviews with fake detection results.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        
        if 'is_fake' not in reviews_df.columns:
            return
        
        # Rating distribution: fake vs genuine
        plt.figure(figsize=(10, 6))
        sns.histplot(
            data=reviews_df, 
            x='rating', 
            hue='is_fake',
            multiple='dodge',
            shrink=0.8,
            discrete=True
        )
        plt.title('Rating Distribution: Fake vs. Genuine Reviews')
        plt.xlabel('Rating')
        plt.ylabel('Count')
        plt.legend(['Genuine', 'Fake'])
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'rating_distribution.png'))
        plt.close()
        
        # Fake review percentage by category
        if 'category' in reviews_df.columns:
            plt.figure(figsize=(12, 6))
            fake_by_category = reviews_df.groupby('category')['is_fake'].mean() * 100
            fake_by_category = fake_by_category.sort_values(ascending=False)
            
            sns.barplot(x=fake_by_category.index, y=fake_by_category.values)
            plt.title('Fake Review Percentage by Category')
            plt.xlabel('Category')
            plt.ylabel('Fake Review Percentage')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, 'category_distribution.png'))
            plt.close()


def main():
    parser = argparse.ArgumentParser(description='Batch process Amazon reviews for analysis.')
    parser.add_argument('--categories', type=str, nargs='+', 
                        default=['All Beauty', 'Health and Personal Care', 'Appliances', 'Video_Games'],
                        help='Categories to analyze')
    parser.add_argument('--sample_size', type=int, default=1000, 
                        help='Number of reviews to sample for analysis')
    parser.add_argument('--output_dir', type=str, default='./results',
                        help='Directory to save results')
    args = parser.parse_args()
    
    # Load reviews
    print(f"Loading reviews for categories: {args.categories}")
    reviews_df = load_amazon_reviews_hf(args.categories)
    
    # Preprocess reviews
    reviews_df = preprocess_reviews(reviews_df)
    
    # Process reviews
    processor = BatchProcessor(output_dir=args.output_dir)
    processed_df = processor.process_reviews(reviews_df, sample_size=args.sample_size)
    
    print(f"Batch processing complete. Processed {len(processed_df)} reviews.")


if __name__ == "__main__":
    main()