## Cell 1: Install Dependencies & Clone Repository

Run this cell first to set up the environment.

In [3]:
# Install required packages
!pip install -q torch transformers vaderSentiment pandas matplotlib seaborn jupyter kaggle

# Clone the repository (for code modules)
!git clone https://github.com/annageiser/Sentiment-Analyzer.git 2>/dev/null || (cd Sentiment-Analyzer && git pull -q)

# Change to project directory
%cd /content/Sentiment-Analyzer
!mkdir -p data

# Option 1: Download from Kaggle (if credentials available)
import kagglehub
path = kagglehub.dataset_download("pranjalverma08/sec-edgar-annual-financial-filings-2021")

# Option 2: Use GitHub sample data (fallback)
print("\nüì• Using sample data from GitHub repository...")

# Verify project structure
!echo "\n=== Project Files ==="
!ls -la *.py *.txt *.ipynb 2>/dev/null | grep -v total
!echo "\n=== Data Files ==="
!ls -la data/ | head -10

‚ö†Ô∏è  Kaggle credentials not found in Colab secrets
   Add KAGGLE_USERNAME and KAGGLE_KEY in the üîë Secrets panel
   Or use GitHub sample data instead
/content/Sentiment-Analyzer
Downloading from https://www.kaggle.com/api/v1/datasets/download/pranjalverma08/sec-edgar-annual-financial-filings-2021?dataset_version_number=1...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62.8M/62.8M [00:00<00:00, 115MB/s]

Extracting files...






üì• Using sample data from GitHub repository...
\n=== Project Files ===
-rw-r--r-- 1 root root  22561 Dec  8 14:17 Financial_Sentiment_Analysis_Colab.ipynb
-rw-r--r-- 1 root root 137659 Dec  8 14:17 financial_sentiment_analysis.ipynb
-rw-r--r-- 1 root root    226 Dec  8 14:17 requirements.txt
\n=== Data Files ===
total 8
drwxr-xr-x 2 root root 4096 Dec  8 14:17 .
drwxr-xr-x 6 root root 4096 Dec  8 14:18 ..


## Cell 2: Setup Environment & Import Modules

In [4]:
import warnings
warnings.filterwarnings('ignore')

import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Tuple

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configure paths for Colab (relative to the directory the setup cell changed into)
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "output"
OUTPUT_DIR.mkdir(exist_ok=True)

# The local modules are not guaranteed to sit directly in the working directory:
# a plain import fails with ModuleNotFoundError when they live in a sub-folder.
# Search the project tree for each module file and put the directory of the
# shallowest match on sys.path before importing. If nothing is found, the import
# below still raises the usual ModuleNotFoundError.
for _module_name in ("sentiment_analysis", "batch_analysis"):
    _matches = list(PROJECT_ROOT.rglob(f"{_module_name}.py"))
    if _matches:
        _module_dir = str(min(_matches, key=lambda p: (len(p.parts), str(p))).parent)
        if _module_dir not in sys.path:
            sys.path.insert(0, _module_dir)

# Import from local modules
from sentiment_analysis import FinancialSentimentAnalyzer, load_filing_data
from batch_analysis import process_batch

# Configure visualization
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("=" * 70)
print("Financial Sentiment Analysis - Google Colab Demo")
print(f"Session started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Project root: {PROJECT_ROOT}")
print("=" * 70)
print("‚úÖ Environment ready!")

ModuleNotFoundError: No module named 'sentiment_analysis'

## Cell 3: Load Sample SEC Filing

In [None]:
# Find and load the first available SEC filing.
# Path.glob yields matches in arbitrary filesystem order, so sort them: the
# "first" sample is then the same file on every run.
available_samples = sorted(DATA_DIR.glob("*.json"))

if available_samples:
    SAMPLE_FILE = str(available_samples[0])
    print(f"‚úì Using sample file: {available_samples[0].name}")
else:
    print(f"‚ö†Ô∏è  Warning: No sample files found in {DATA_DIR}")
    SAMPLE_FILE = None

# Section keys requested from the filing
SECTIONS_OF_INTEREST = ['item_7', 'item_1A']

# filing_data stays an empty dict whenever nothing could be loaded, so later
# cells can simply guard on its truthiness.
filing_data = {}
if SAMPLE_FILE:
    try:
        filing_data = load_filing_data(SAMPLE_FILE, SECTIONS_OF_INTEREST)
        print("=" * 70)
        print(f"‚úì Loaded filing data with {len(filing_data)} sections")
        print(f"  Available sections: {', '.join(filing_data.keys())}")
        print("=" * 70)

        # Display section sizes
        for section, text in filing_data.items():
            print(f"  {section}: {len(text):,} characters")
    except FileNotFoundError:
        # The file vanished between the glob above and the load
        print(f"‚ö†Ô∏è  File not found: {SAMPLE_FILE}")
        filing_data = {}

## Cell 4: Initialize FinBERT Analyzer

In [None]:
# Create the sentiment analyzer (FinBERT model) and frame the step with a banner
banner = "=" * 70
print(banner)
for notice in (
    "\nüîß Initializing Sentiment Analyzer...",
    "(First run will download FinBERT model ~500MB)",
):
    print(notice)
print()

analyzer = FinancialSentimentAnalyzer()

print("‚úÖ Ready to analyze filings")
print(banner)

## Cell 5: Analyze Sentiment Per Section

**Key Question**: Is the tone consistent across sections?

In [None]:
if not filing_data:
    print("‚ö†Ô∏è  No filing data available")
else:
    rule = "=" * 70
    print(rule)
    print("\nüìä SENTIMENT COMPARISON - Cross-Section Analysis")
    print(rule)

    # One summary row per section; score and confidence are stored as
    # three-decimal strings so the printed table is uniformly formatted.
    comparison_data = []
    for section_name, section_text in filing_data.items():
        print(f"\n  Analyzing {section_name}...")
        section_results = analyzer.analyze_text(section_text)
        section_agg = analyzer.aggregate_sentiment(section_results)

        row = {}
        row['Section'] = section_name.replace('item_', 'Item ').upper()
        row['Sentiment'] = section_agg['label']
        row['Score'] = f"{section_agg['score']:.3f}"
        row['Confidence'] = f"{section_agg['confidence']:.3f}"
        comparison_data.append(row)

    comparison_df = pd.DataFrame(comparison_data)
    print("\n" + comparison_df.to_string(index=False))
    print("\n" + rule)

    # Check for inconsistencies: every section should carry the same label
    sentiments = [entry['Sentiment'] for entry in comparison_data]
    distinct_labels = set(sentiments)
    if len(distinct_labels) <= 1:
        print(f"‚úÖ Consistent tone: All sections show {sentiments[0]} sentiment")
    else:
        print(f"‚ö†Ô∏è  ALERT: Sentiment differs across sections - potential inconsistency detected!")
    print(rule)

## Cell 6: Visualize Sentiment Scores

In [None]:
def _bar_color(x):
    """Pick the bar colour: green above zero, red below zero, grey otherwise."""
    if x > 0:
        return '#2ecc71'
    if x < 0:
        return '#e74c3c'
    return '#95a5a6'


if 'comparison_df' not in locals() or len(comparison_df) <= 0:
    print("‚ö†Ô∏è  Insufficient data for visualization")
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    score_ax, confidence_ax = axes[0], axes[1]

    # Score and Confidence arrive as strings; add numeric copies for plotting
    for text_column in ('Score', 'Confidence'):
        comparison_df[f'{text_column}_float'] = comparison_df[text_column].astype(float)

    # Left panel: sentiment score per section, coloured by sign
    colors = list(map(_bar_color, comparison_df['Score_float']))
    score_ax.bar(comparison_df['Section'], comparison_df['Score_float'], color=colors)
    score_ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    score_ax.set_title('Sentiment Score by Section', fontsize=12, fontweight='bold')
    score_ax.set_ylabel('Score (Positive ‚Üí Negative)')
    score_ax.tick_params(axis='x', rotation=45)

    # Right panel: confidence per section on a fixed 0..1 axis
    confidence_ax.bar(comparison_df['Section'], comparison_df['Confidence_float'], color='#3498db')
    confidence_ax.set_title('Analysis Confidence by Section', fontsize=12, fontweight='bold')
    confidence_ax.set_ylabel('Confidence Score')
    confidence_ax.set_ylim([0, 1.0])
    confidence_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## Cell 7: Risk Assessment

**Key Question**: Are there red flags or risk indicators?

In [None]:
def assess_business_risks(sentiment_results: Dict[str, Any]) -> Dict[str, Any]:
    """
    Assess financial risks from business perspective.

    Args:
        sentiment_results: Aggregated sentiment for one section. Keys read here:
            'positive_ratio' / 'negative_ratio' (fractions in 0..1), 'confidence',
            and - only as a fallback when a ratio key is absent -
            'sentiment_distribution' (mapping of label -> count).
            NOTE(review): the fallback assumes the distribution labels are
            'positive' / 'negative' in any letter case - confirm against the
            analyzer's output.

    Returns: Dictionary with
        'severity'   - "LOW", "MEDIUM" or "HIGH"
        'risks'      - list of human-readable findings (a single low-risk
                       message when nothing was flagged)
        'confidence' - the confidence value that was evaluated
    """

    def _ratio(label: str) -> float:
        """Return the share of `label`, preferring the explicit '<label>_ratio' key."""
        explicit = sentiment_results.get(f'{label}_ratio')
        if explicit is not None:
            return explicit
        # Without this fallback a missing ratio silently becomes 0 and the
        # ratio-based checks below can never fire.
        distribution = sentiment_results.get('sentiment_distribution') or {}
        total = sum(distribution.values())
        if total <= 0:
            return 0
        matching = sum(
            count for name, count in distribution.items()
            if str(name).lower() == label
        )
        return matching / total

    risks = []
    severity = "LOW"

    positive_ratio = _ratio('positive')
    negative_ratio = _ratio('negative')
    confidence = sentiment_results.get('confidence', 0)

    # Risk 1: Over-optimism (potential misleading statements)
    if positive_ratio > 0.70:
        risks.append("üî¥ OVER-OPTIMISM: Unusually positive tone may mask underlying issues")
        severity = "HIGH"

    # Risk 2: Distress signals
    if negative_ratio > 0.60:
        risks.append("üî¥ DISTRESS SIGNALS: High negative sentiment may indicate financial problems")
        severity = "HIGH"

    # Risk 3: Low confidence (vague/unclear disclosures)
    # Raises the level to MEDIUM at most; never downgrades an existing HIGH.
    if confidence < 0.30:
        risks.append("üü† VAGUE LANGUAGE: Low confidence suggests unclear or inconsistent disclosures")
        severity = "MEDIUM" if severity == "LOW" else severity

    # Risk 4: Mixed messaging (positive and negative shares both mid-range)
    if 0.35 < positive_ratio < 0.65 and 0.35 < negative_ratio < 0.65:
        risks.append("üü° MIXED SIGNALS: Conflicting narratives - company is hedging statements")
        severity = "MEDIUM" if severity == "LOW" else severity

    return {
        'severity': severity,
        'risks': risks if risks else ["‚úÖ LOW RISK: Consistent, clear, moderate tone"],
        'confidence': confidence
    }

# Run the risk assessment on every loaded section and summarise the worst case
if not filing_data:
    print("‚ö†Ô∏è  No filing data available")
else:
    print("\nüö® RISK ASSESSMENT - Executive Summary")
    print("=" * 70)

    all_risks = []
    for section_name, section_text in filing_data.items():
        section_results = analyzer.analyze_text(section_text)
        section_agg = analyzer.aggregate_sentiment(section_results)
        section_risks = assess_business_risks(section_agg)
        all_risks.append(section_risks)

        print(f"\nüìå {section_name.replace('item_', 'Item ').upper()}")
        print(f"   Risk Level: {section_risks['severity']} | Confidence: {section_risks['confidence']:.1%}")
        for risk in section_risks['risks']:
            print(f"   {risk}")

    # Overall assessment: the highest severity found in any section
    print("\n" + "=" * 70)
    print("üéØ OVERALL ASSESSMENT")
    severity_rank = {'LOW': 0, 'MEDIUM': 1, 'HIGH': 2}
    worst_section = max(all_risks, key=lambda entry: severity_rank[entry['severity']])
    overall_severity = worst_section['severity']
    print(f"Risk Level: {overall_severity}")

    # Anything other than HIGH / MEDIUM gets the standard-review message
    recommendations = {
        "HIGH": "‚ö†Ô∏è  RECOMMENDATION: Conduct thorough audit of disclosure statements",
        "MEDIUM": "‚ö†Ô∏è  RECOMMENDATION: Review specific sections flagged above",
    }
    print(recommendations.get(
        overall_severity,
        "‚úÖ RECOMMENDATION: Standard review procedures sufficient",
    ))

## Cell 8: Detailed MD&A Analysis

In [None]:
if not filing_data or 'item_7' not in filing_data:
    print("‚ö†Ô∏è  MD&A section not available")
else:
    print("\nüìã DETAILED MD&A ANALYSIS (Management Discussion & Analysis)")
    print("=" * 70)

    mdna_text = filing_data['item_7']
    print("\n  Analyzing MD&A section...")
    mdna_results = analyzer.analyze_text(mdna_text)
    mdna_aggregate = analyzer.aggregate_sentiment(mdna_results)

    print(f"\nText Length: {len(mdna_text):,} characters")
    print(f"Chunks Analyzed: {len(mdna_results)}")

    # Per-label chunk counts, most frequent label first
    print(f"\nüìä Sentiment Breakdown:")
    distribution = mdna_aggregate['sentiment_distribution']
    total = sum(distribution.values())
    ranked_labels = sorted(distribution.items(), key=lambda pair: -pair[1])
    for sentiment, count in ranked_labels:
        if total > 0:
            percentage = count / total * 100
        else:
            percentage = 0
        print(f"   {sentiment:12s}: {count:3d} chunks ({percentage:5.1f}%)")

    print(f"\nüéØ Overall Assessment:")
    print(f"   Sentiment: {mdna_aggregate['label']}")
    print(f"   Score: {mdna_aggregate['score']:.3f} (positive=1.0, negative=-1.0)")
    print(f"   Confidence: {mdna_aggregate['confidence']:.1%}")

    # Keep both the per-chunk results and the aggregate for downstream analysis
    mdna_analysis = dict(results=mdna_results, aggregate=mdna_aggregate)

## Cell 9: Business Actionable Insights

In [None]:
print("\nüìå KEY BUSINESS INSIGHTS")
print("=" * 70)

# Use cases per stakeholder group, in presentation order
auditor_cases = [
    "Use sentiment inconsistency detection to identify sections requiring deeper audit",
    "Flag unusual optimism in problematic business segments",
    "Verify claims in sections with vague or conflicting language",
]
investor_cases = [
    "Compare sentiment tone to financial metrics (income, cash flow) for coherence",
    "Watch for over-optimism relative to risk factor disclosures",
    "Identify companies hedging negative news with cautious language",
]
regulator_cases = [
    "Monitor for systematically misleading narratives across filings",
    "Compare sentiment trends year-over-year for consistency",
    "Cross-check narrative claims against quantitative financial data",
]
insights = [
    ("AUDITORS", auditor_cases),
    ("INVESTORS", investor_cases),
    ("REGULATORS", regulator_cases),
]

# Assemble the whole report first, then emit it in one go:
# a heading per stakeholder followed by its numbered use cases.
report_lines = []
for stakeholder, use_cases in insights:
    report_lines.append(f"\nüë• {stakeholder}")
    report_lines.extend(f"   {i}. {case}" for i, case in enumerate(use_cases, 1))
print("\n".join(report_lines))

## Cell 10: Batch Processing (Multiple Companies)

In [None]:
print("\nüöÄ BATCH PROCESSING - Multiple Companies")
print("=" * 70)

# Demo: First show single filing analysis (already done above)
print("\nüìå SINGLE FILING (Already Analyzed)")
print("-" * 70)
if SAMPLE_FILE:
    print(f"File: {SAMPLE_FILE}")
    # overall_severity only exists if the risk-assessment cell ran with data
    if 'overall_severity' in locals():
        print(f"Overall Risk Level: {overall_severity}")
    print("‚úÖ Use this when auditing or analyzing a specific company\n")

# Demo: Now show batch processing capability
data_dir = Path("data")
output_dir = Path("output")
available_files = sorted(data_dir.glob("*.json"))

# Ensure output directory exists
output_dir.mkdir(exist_ok=True)

if len(available_files) > 1:
    print(f"üìå BATCH PROCESSING ({len(available_files)} companies found)")
    print("-" * 70)
    print("Processing all filings in data/ directory...\n")

    # Generate timestamped output filename so earlier runs are not overwritten
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"output/batch_analysis_{timestamp}.csv"

    # Run batch analysis
    process_batch(
        input_dir='data/',
        output_file=output_file,
        sections=['item_7', 'item_1A']
    )

    # Load and display results
    batch_df = pd.read_csv(output_file)
    print("\n‚úÖ Batch Analysis Results:")
    print("-" * 70)

    # Show summary statistics
    if 'overall_sentiment' in batch_df.columns:
        # company_id is checked like the display columns below, so a CSV
        # without it degrades gracefully instead of raising KeyError.
        if 'company_id' in batch_df.columns:
            print(f"\nProcessed {len(batch_df['company_id'].unique())} companies")
        print(f"Analyzed {len(batch_df)} section-level entries")
        print(f"\nResults saved to: {output_file}")

        # Display sample results (only the expected columns that are present)
        print("\nSample Results (first 10 entries):")
        display_cols = ['company_id', 'section', 'overall_sentiment', 'sentiment_score', 'confidence']
        available_cols = [col for col in display_cols if col in batch_df.columns]
        print(batch_df[available_cols].head(10).to_string(index=False))

        print("\n‚úÖ Use batch processing when:")
        print("   - Analyzing industry trends")
        print("   - Comparing multiple companies")
        print("   - Identifying outliers (unusually optimistic/pessimistic)")
        print("   - Generating regulatory reports")
    else:
        # The CSV was written either way: say where it is and why no summary
        # is shown, rather than ending the cell silently.
        print(f"\nResults saved to: {output_file}")
        print("Column 'overall_sentiment' not found in results; columns present: "
              + ", ".join(str(col) for col in batch_df.columns))
else:
    print("üìå BATCH PROCESSING")
    print("-" * 70)
    print(f"Only {len(available_files)} file(s) available in data/ directory")
    print("\n‚úÖ To run batch processing:")
    print("   1. Add more SEC filings to data/ directory")
    print("   2. Run: process_batch(")
    print("         input_dir='data/',")
    print("         output_file='output/batch_results.csv',")
    print("         sections=['item_7', 'item_1A']")
    print("      )")
    print("\n‚úÖ Batch processing provides:")
    print("   - Cross-company sentiment comparison")
    print("   - Industry trend analysis")
    print("   - Risk outlier identification")
    print("   - Exportable results (CSV in output/ folder)")

## Summary

✅ **Demo Complete!**

This notebook demonstrated:
1. **Single Filing Analysis** - Load and analyze a SEC 10-K filing
2. **Sentiment Comparison** - Compare tone across sections
3. **Risk Assessment** - Flag business risks and inconsistencies
4. **Detailed Analysis** - Deep dive into MD&A section
5. **Stakeholder Insights** - Business value for Auditors, Investors, Regulators
6. **Batch Processing** - Scale analysis to multiple companies

### For Your Assignment:
- **Deliverable 1 (Prototype)**: This notebook + `sentiment_analysis.py` + `batch_analysis.py`
- **Deliverable 2 (Presentation)**: See `Documentations/PRESENTATION_GUIDE.md`

### To Share This Demo:
1. Save this notebook to your Google Colab
2. Click "Share" and set permissions
3. Copy the link and include in your presentation slides
4. During the presentation, just click "Run all" to execute every cell in order

---

**HS25 Big Data Assignment - Group Work Part 2**