# Multi-lingual News Event Clustering with Amazon Nova

This solution demonstrates how to cluster related news stories from multiple languages into coherent events using the Amazon Nova Multimodal Embeddings model. The notebook processes a diverse dataset of news articles in German, Spanish, and English, automatically grouping similar stories and generating meaningful event summaries.

## Key Features:
- **Multi-language support**: Handles news articles in German (deu), Spanish (spa), and English (eng)
- **Semantic clustering**: Uses Amazon Nova embeddings to group semantically similar articles
- **Automated summarization**: Generates event descriptions using Amazon Nova's language model
- **Visual analysis**: Provides 2D visualization of article clusters and event distributions

## Use Cases:
- News aggregation and organization
- Multi-lingual content analysis
- Event detection and monitoring
- Information overload reduction

In [None]:
import boto3
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime, timedelta
import random

In [None]:
class NewsEventProcessor:
    """Embed news articles and summarize article groups through Amazon Bedrock.

    Attributes (all assigned in ``__init__``):
        bedrock: boto3 ``bedrock-runtime`` client used for every model call.
        embedding_model: model ID passed to ``invoke_model`` by ``get_embedding``.
        llm_model: model ID passed to ``invoke_model`` by ``generate_event_summary``.
    """

    def __init__(self, region_name="us-east-1"):
        """Create the Bedrock runtime client for ``region_name``."""
        self.bedrock = boto3.client('bedrock-runtime', region_name=region_name)
        self.embedding_model = 'amazon.nova-2-multimodal-embeddings-v1:0'
        self.llm_model = "us.amazon.nova-lite-v1:0"

    def get_embedding(self, text):
        """Return the embedding of ``text`` as a NumPy array.

        Sends one ``SINGLE_EMBEDDING`` request (dimension 1024, purpose
        ``CLUSTERING``, truncation at the end of over-long text) and reads the
        first entry of the response's ``embeddings`` list.
        """
        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingDimension": 1024,
                "embeddingPurpose": "CLUSTERING",
                "text": {"truncationMode": "END", "value": text}
            }
        }
        response = self.bedrock.invoke_model(modelId=self.embedding_model, body=json.dumps(request_body))
        return np.array(json.loads(response['body'].read())["embeddings"][0]["embedding"])

    def get_batch_embeddings(self, texts):
        """Embed each text in ``texts`` (one request per text) and stack the results."""
        return np.array([self.get_embedding(text) for text in texts])

    def generate_event_summary(self, articles):
        """Ask the LLM for a topic category and description of ``articles``.

        Only the first 300 characters of the first three articles are sent.
        The model is forced to answer through the ``extract_topic`` tool.

        Returns:
            A dict that always contains ``"category"`` and ``"description"``.
            When ``articles`` is empty, the request fails, or the response has
            no tool-use block, the placeholder values "Unknown Topic" /
            "Unable to analyze articles" are returned instead of raising.
        """
        fallback = {"category": "Unknown Topic", "description": "Unable to analyze articles"}
        if not articles:
            # Nothing to describe; skip the model call entirely.
            return fallback

        sample_text = "\n\n".join(article[:300] for article in articles[:3])
        prompt = f"""Analyze these news articles and extract the main topic category and description.

Articles:
{sample_text}"""

        tools = [{
            "toolSpec": {
                "name": "extract_topic",
                "description": "Extract topic category and description from news articles",
                "inputSchema": {
                    "json": {
                        "type": "object",
                        "properties": {
                            "category": {
                                "type": "string",
                                "description": "Main topic category (max 3 words)"
                            },
                            "description": {
                                "type": "string",
                                "description": "Brief description (1-2 sentences)"
                            }
                        },
                        "required": ["category", "description"]
                    }
                }
            }
        }]

        request_body = {
            "messages": [{"role": "user", "content": [{"text": prompt}]}],
            "toolConfig": {"tools": tools, "toolChoice": {"tool": {"name": "extract_topic"}}},
            "inferenceConfig": {"maxTokens": 150, "temperature": 0.1}
        }

        try:
            response = self.bedrock.invoke_model(modelId=self.llm_model, body=json.dumps(request_body))
            result = json.loads(response['body'].read())
            # The tool call is not guaranteed to be the first content block,
            # so scan for it instead of indexing position 0.
            for block in result['output']['message']['content']:
                if 'toolUse' in block:
                    # Backfill missing keys: callers index both keys directly.
                    return {**fallback, **block['toolUse']['input']}
            failure = "response contained no toolUse block"
        except Exception as exc:
            # Best-effort by design, but Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate so the run can be stopped.
            failure = repr(exc)

        # Surface the reason so a systematic failure (e.g. missing model
        # access) is not mistaken for a genuinely unknown topic.
        print(f"Warning: could not summarize event ({failure})")
        return fallback

# Shared processor instance used by the cells below; uses the default region (us-east-1).
processor = NewsEventProcessor()

## Load News Data

In [None]:
# Read the public news dataset from disk.
# Dataset source: https://github.com/aws-samples/news-clustering-and-summarization/tree/main
print("Loading public dataset...")
with open('public_dataset.json', 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

# The demo works on the first 100 records only; slice once and reuse.
demo_items = dataset[:100]

# Article bodies plus a reference label built from each record's cluster field.
articles = [record['text'] for record in demo_items]
categories = [f"Cluster_{record['cluster']}" for record in demo_items]

# Texts longer than 150 characters are cut and suffixed with an ellipsis.
previews = [
    body[:150] + '...' if len(body) > 150 else body
    for body in articles
]

# Overview table, one row per article.
df = pd.DataFrame({
    'Article_ID': [record['id'] for record in demo_items],
    'Title': [record['title'] for record in demo_items],
    'Category': categories,
    'Language': [record['lang'] for record in demo_items],
    'Length': [len(body) for body in articles],
    'Preview': previews,
})

languages = sorted({record['lang'] for record in demo_items})

print(f"Loaded {len(articles)} articles")
print(f"Languages: {languages}")
print(f"Unique clusters: {len(set(categories))}")
print("\nDataset Overview:")
display(df.head(10))

## Event Detection

In [None]:
# Embed every article, then cluster the embedding vectors with k-means.
print("Generating embeddings...")
embeddings = processor.get_batch_embeddings(articles)

# The number of clusters is taken from the dataset's reference labels,
# not inferred from the embeddings.
n_events = len(set(categories))
# n_init is pinned: its default differs between scikit-learn releases
# (and triggers a FutureWarning on some), which would make the clustering
# depend on the installed version despite the fixed random_state.
kmeans = KMeans(n_clusters=n_events, n_init=10, random_state=42)
event_labels = kmeans.fit_predict(embeddings)
# tolist() converts the NumPy integer labels to plain ints so the printed
# dict reads {0: 12, ...} instead of showing NumPy scalar reprs.
event_sizes = Counter(event_labels.tolist())

print(f"Detected {n_events} topics: {dict(event_sizes)}")

## Generate Event Summaries

In [None]:
# Build one LLM-generated summary per non-empty cluster, keyed by cluster id.
event_summaries = {}
for event_id in range(n_events):
    # Articles whose k-means label equals this cluster id.
    members = [text for text, label in zip(articles, event_labels) if label == event_id]
    if not members:
        # Empty clusters get no summary entry.
        continue
    summary = processor.generate_event_summary(members)
    event_summaries[event_id] = summary
    print(f"Event {event_id}: {summary['category']} ({len(members)} articles)")
    print(f"  Description: {summary['description']}")

## Visualization

In [None]:
# Two-panel figure: PCA scatter of the clusters (left) and
# article counts per summarized event (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
# One colour per event id, shared by both panels.
colors = plt.cm.Set3(np.linspace(0, 1, n_events))

# Project the embeddings to 2-D for plotting only; clustering used the full vectors.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

for event_id in range(n_events):
    mask = event_labels == event_id
    # Events without a summary fall back to a generic label.
    category = event_summaries[event_id]['category'] if event_id in event_summaries else f"Event {event_id}"
    ax1.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
               c=[colors[event_id]], label=category, alpha=0.7, s=60)

ax1.set_title('News Event Clusters by Category')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Category distribution: only events that have a summary get a bar.
# Labels, counts and colours are all derived from the same id list so each
# bar keeps its own event's count and colour even when an event in the
# middle of the range has no summary.
summarized_ids = [i for i in range(n_events) if i in event_summaries]
categories_list = [event_summaries[i]['category'] for i in summarized_ids]
event_counts = [event_sizes[i] for i in summarized_ids]
bar_colors = [colors[i] for i in summarized_ids]

bars = ax2.bar(range(len(categories_list)), event_counts, color=bar_colors)
ax2.set_title('Event Coverage by Category')
ax2.set_xlabel('Categories')
ax2.set_ylabel('Articles')
ax2.set_xticks(range(len(categories_list)))
ax2.set_xticklabels(categories_list, rotation=45, ha='right')

# Print the article count just above each bar.
for bar, count in zip(bars, event_counts):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
             str(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Event Report

In [None]:
# Text report: events ordered by article count, largest first.
# Each entry is (event id, article count, summary dict or {} when none exists).
event_data = sorted(
    [(event_id, event_sizes[event_id], event_summaries.get(event_id, {}))
     for event_id in range(n_events)],
    key=lambda entry: entry[1],
    reverse=True,
)

print("NEWS EVENT SUMMARY")
print("=" * 50)
print(f"Total articles: {len(articles)}")
print(f"Events detected: {n_events}")

print("\nTOP EVENTS BY CATEGORY:")
for rank, (event_id, count, summary) in enumerate(event_data, 1):
    if not summary:
        # Events without a summary are skipped but still consume a rank number.
        continue
    category = summary.get('category', f'Event {event_id}')
    description = summary.get('description', 'No description available')
    print(f"\n{rank}. {category.upper()} ({count} articles)")
    print(f"   📝 {description}")