# SentiCompare - Data Exploration

This notebook demonstrates how to load and explore sentiment analysis datasets for the SentiCompare benchmark.

## Setup

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.loader import SentimentDataLoader, list_available_datasets
from src.data.preprocessor import TextPreprocessor, validate_dataset, print_validation_report

# Plot styling shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)  # default size; later cells override it with figsize=

# U+2713 (check mark) is written as an escape so the message survives files that get
# re-saved with the wrong encoding (it previously rendered as mojibake).
print("\u2713 Imports successful")

ModuleNotFoundError: No module named 'datasets'

## 1. List Available Datasets

In [None]:
# Ask the data module which dataset names it exposes, then list them one per line.
datasets = list_available_datasets()
print("Available datasets:")
for dataset_name in datasets:
    print(f"  - {dataset_name}")

## 2. Load a Dataset

Let's load the IMDB dataset as an example:

In [None]:
# Pair the IMDB dataset with the DistilBERT tokenizer.
loader = SentimentDataLoader(dataset_name="imdb", tokenizer_name="distilbert-base-uncased")

# Fetch the raw data, then report the container type and the names of its splits.
dataset = loader.load()
print(f"Dataset loaded: {type(dataset)}")
split_names = list(dataset.keys())
print(f"Splits: {split_names}")

## 3. Explore Dataset Structure

In [None]:
# Metadata as reported by the loader, printed as one "key: value" pair per line.
info = loader.get_dataset_info()
print("Dataset Information:")
for field, field_value in info.items():
    print(f"  {field}: {field_value}")

# First record of the raw training split, printed unmodified.
print("\nSample from training set:")
first_example = dataset['train'][0]
print(first_example)

## 4. Create Train/Val/Test Splits

In [None]:
# Request reduced train/val/test splits so the exploration cells below stay quick.
split_sizes = {"train_size": 1000, "val_size": 200, "test_size": 500}
train_data, val_data, test_data = loader.prepare_splits(dataset, **split_sizes)

# Report how many examples each returned split actually holds.
print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

## 5. Text Preprocessing

In [None]:
# Text cleaner with the remove_urls and remove_extra_spaces options switched on.
preprocessor = TextPreprocessor(remove_urls=True, remove_extra_spaces=True)

# Show the first 200 characters of one training example before and after cleaning.
sample_text = dataset['train'][0]['text']
print("Original:")
print(sample_text[:200])
print("\nCleaned:")
cleaned_text = preprocessor.clean_text(sample_text)
print(cleaned_text[:200])

## 6. Label Distribution Analysis

In [None]:
# Materialise the training split as a DataFrame for pandas-based analysis.
train_df = pd.DataFrame(train_data)

# Bar chart of the number of examples per label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
label_counts = train_df['label'].value_counts().sort_index()
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Label Distribution in Training Set')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the tick labels horizontal
fig.tight_layout()
plt.show()

# Same counts as plain text, ordered by label value.
print("\nLabel distribution:")
print(label_counts)

## 7. Text Length Analysis

In [None]:
# Character count of every training text (adds or overwrites the 'text_length' column).
train_df['text_length'] = train_df['text'].str.len()

# Two panels on ONE figure: overall histogram (left) and per-label box plot (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(train_df['text_length'], bins=50, edgecolor='black')
ax_hist.set_title('Text Length Distribution')
ax_hist.set_xlabel('Text Length (characters)')
ax_hist.set_ylabel('Frequency')

# DataFrame.boxplot(by=...) opens a brand-new figure unless it is handed an Axes,
# which would leave the right-hand panel empty and send the labels below to the
# wrong figure. Passing ax= draws the box plot in place.
train_df.boxplot(column='text_length', by='label', ax=ax_box)
ax_box.set_title('Text Length by Label')
ax_box.set_xlabel('Label')
ax_box.set_ylabel('Text Length (characters)')
fig.suptitle('')  # Remove the figure-level title pandas adds for grouped box plots

fig.tight_layout()
plt.show()

print("\nText length statistics:")
print(train_df['text_length'].describe())

## 8. Tokenization Analysis

In [None]:
# Run the loader's tokenizer over the first five training texts, truncating and
# padding every sequence to 512 positions.
sample_texts = train_df['text'].head(5).tolist()
tokenized = loader.tokenizer(
    sample_texts,
    truncation=True,
    padding='max_length',
    max_length=512,
)

# Sum each attention mask to get a per-sample token count
# (presumably 1 = real token, 0 = padding; confirm against the tokenizer in use).
token_lengths = list(map(sum, tokenized['attention_mask']))

print("Token lengths for first 5 samples:")
for sample_no, n_tokens in enumerate(token_lengths, start=1):
    print(f"  Sample {sample_no}: {n_tokens} tokens")

# Show the start of the first text next to its first 20 token strings.
print("\nTokenization example:")
print(f"Text: {sample_texts[0][:100]}...")
first_token_ids = tokenized['input_ids'][0][:20]
print(f"\nTokens: {loader.tokenizer.convert_ids_to_tokens(first_token_ids)}")

## 9. Dataset Validation

In [None]:
# Run the project's validator over the training frame, naming the text and label
# columns and declaring two label classes. Note that train_df also carries the
# 'text_length' column added in the text-length analysis above.
report = validate_dataset(train_df, text_column='text', label_column='label', num_labels=2)

# Print the resulting report with the project's own formatter.
print_validation_report(report)

## 10. Compare Multiple Datasets

In [None]:
# Collect one summary row per dataset for a side-by-side comparison.
dataset_stats = []

for ds_name in ['imdb', 'sst2']:  # Add more datasets as needed
    try:
        # Loop-local names on purpose: rebinding the notebook-wide `loader` here would
        # make earlier cells (splits, tokenization) silently use the last dataset's
        # loader if they are re-run after this cell.
        ds_loader = SentimentDataLoader(ds_name, "distilbert-base-uncased")
        ds_splits = ds_loader.load()

        # A dict-like result holds several splits; anything else is counted as one split.
        if isinstance(ds_splits, dict):
            total_samples = sum(len(split) for split in ds_splits.values())
        else:
            total_samples = len(ds_splits)

        dataset_stats.append({
            'dataset': ds_name,
            'total_samples': total_samples,
            'num_labels': ds_loader.dataset_config.get('num_labels', 2),
            'max_length': ds_loader.dataset_config.get('max_length', 512)
        })
    except Exception as e:
        # Best-effort comparison: report the failure and continue with the next dataset.
        print(f"Could not load {ds_name}: {e}")

# Display comparison
comparison_df = pd.DataFrame(dataset_stats)
print("\nDataset Comparison:")
if comparison_df.empty:
    # Every load failed; say so instead of printing an "Empty DataFrame" repr.
    print("  (no datasets could be loaded)")
else:
    print(comparison_df.to_string(index=False))

## Summary

This notebook demonstrated:
1. Listing the available datasets and loading one using `SentimentDataLoader`
2. Creating train/val/test splits
3. Text preprocessing and cleaning
4. Analyzing label distributions
5. Examining text lengths and tokenization
6. Validating dataset integrity
7. Comparing summary statistics across multiple datasets

Next steps:
- Explore other datasets (Disney, Amazon, Yelp)
- Experiment with different preprocessing options
- Prepare data for model training