In [3]:
import boto3
import sagemaker

# --- Session and client setup ----------------------------------------------
# The SageMaker session is used only to look up the default bucket name;
# object access itself goes through the plain boto3 S3 client.
sess = sagemaker.Session()
bucket_name = sess.default_bucket()
s3 = boto3.client('s3')

# Object key inside the bucket: no "s3://" scheme and no bucket name.
document_key = "document-analysis/uploads/sample_feedback.txt"

print(f"Reading from: s3://{bucket_name}/{document_key}")

# --- Fetch the object and decode its body as UTF-8 text ---------------------
response = s3.get_object(Bucket=bucket_name, Key=document_key)
body_bytes = response['Body'].read()
extracted_text = body_bytes.decode('utf-8')

# Heading first, then the document text itself.
for output_line in ("\n‚úÖ Extracted Text:", extracted_text):
    print(output_line)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Reading from: s3://sagemaker-us-east-2-854757836160/document-analysis/uploads/sample_feedback.txt

‚úÖ Extracted Text:

Customer Feedback Form

Name: John Smith
Date: November 6, 2024
Product: Widget Pro 2000

Feedback:
I absolutely love this product! The quality exceeded my expectations and 
the customer service was outstanding. I had a small issue with setup but 
the support team resolved it within 30 minutes. Highly recommend to anyone 
considering this purchase. Five stars!

Would you recommend to others? Yes
Overall satisfaction: 10/10



***Run Amazon Comprehend***

In [8]:
import boto3

# One Comprehend client serves all three detections in this cell.
comprehend = boto3.client('comprehend')
section_rule = "-" * 50

# `extracted_text` was populated by the S3 read cell above.
print("üîç Analyzing text with Amazon Comprehend...\n")

# --- 1. Sentiment ------------------------------------------------------------
print("1Ô∏è‚É£ SENTIMENT ANALYSIS")
print(section_rule)
sentiment_response = comprehend.detect_sentiment(Text=extracted_text, LanguageCode='en')

sentiment = sentiment_response['Sentiment']
scores = sentiment_response['SentimentScore']

# Overall label followed by the per-class confidence scores.
print(
    f"Overall Sentiment: {sentiment}",
    f"  Positive: {scores['Positive']:.1%}",
    f"  Negative: {scores['Negative']:.1%}",
    f"  Neutral:  {scores['Neutral']:.1%}",
    f"  Mixed:    {scores['Mixed']:.1%}",
    sep="\n",
)

# --- 2. Entities -------------------------------------------------------------
print("\n2Ô∏è‚É£ ENTITY RECOGNITION")
print(section_rule)
entities_response = comprehend.detect_entities(Text=extracted_text, LanguageCode='en')

entities = entities_response['Entities']
print(f"Found {len(entities)} entities:\n")

for entity in entities:
    # Three lines per entity; the last one carries the trailing blank line.
    print(
        f"  ‚Ä¢ {entity['Text']}",
        f"    Type: {entity['Type']}",
        f"    Confidence: {entity['Score']:.1%}\n",
        sep="\n",
    )

# --- 3. Key phrases ----------------------------------------------------------
print("3Ô∏è‚É£ KEY PHRASES")
print(section_rule)
phrases_response = comprehend.detect_key_phrases(Text=extracted_text, LanguageCode='en')

key_phrases = phrases_response['KeyPhrases']
print(f"Found {len(key_phrases)} key phrases:\n")

# Only the first ten phrases are displayed; all of them are stored below.
for phrase in key_phrases[:10]:
    print(f"  ‚Ä¢ {phrase['Text']} ({phrase['Score']:.1%})")

# --- Collect everything for the storage cell ---------------------------------
entity_records = [
    {'text': e['Text'], 'type': e['Type'], 'score': e['Score']}
    for e in entities
]
phrase_records = [
    {'text': p['Text'], 'score': p['Score']}
    for p in key_phrases
]
analysis_result = {
    'document': document_key,
    'sentiment': {'overall': sentiment, 'scores': scores},
    'entities': entity_records,
    'key_phrases': phrase_records,
    'metadata': {
        'text_length': len(extracted_text),
        'word_count': len(extracted_text.split()),
    },
}

print("\n‚úÖ Analysis complete!")

üîç Analyzing text with Amazon Comprehend...

1Ô∏è‚É£ SENTIMENT ANALYSIS
--------------------------------------------------
Overall Sentiment: POSITIVE
  Positive: 100.0%
  Negative: 0.0%
  Neutral:  0.0%
  Mixed:    0.0%

2Ô∏è‚É£ ENTITY RECOGNITION
--------------------------------------------------
Found 7 entities:

  ‚Ä¢ John Smith
    Type: PERSON
    Confidence: 99.9%

  ‚Ä¢ November 6, 2024
    Type: DATE
    Confidence: 100.0%

  ‚Ä¢ Widget
    Type: COMMERCIAL_ITEM
    Confidence: 58.7%

  ‚Ä¢ Pro 2000
    Type: TITLE
    Confidence: 52.6%

  ‚Ä¢ 30 minutes
    Type: QUANTITY
    Confidence: 94.7%

  ‚Ä¢ Five stars
    Type: QUANTITY
    Confidence: 99.7%

  ‚Ä¢ 10/10
    Type: QUANTITY
    Confidence: 95.5%

3Ô∏è‚É£ KEY PHRASES
--------------------------------------------------
Found 17 key phrases:

  ‚Ä¢ Customer Feedback Form

Name (95.0%)
  ‚Ä¢ John Smith
Date (96.5%)
  ‚Ä¢ November 6, 2024 (99.9%)
  ‚Ä¢ Product (96.1%)
  ‚Ä¢ Widget Pro 2000

Feedback (94.7%)
  ‚Ä¢ this p

**Store analysis to S3**

In [10]:
import json
from datetime import datetime
import boto3

# Persist the extracted text, the analysis JSON and a human-readable summary
# for `document_key`, using `analysis_result` built by the Comprehend cell.
s3 = boto3.client('s3')

# Save extracted text
text_key = document_key.replace('uploads/', 'processed/text/')
s3.put_object(
    Bucket=bucket_name,
    Key=text_key,
    Body=extracted_text.encode('utf-8')
)
print(f"‚úÖ Text saved: s3://{bucket_name}/{text_key}")

# Save analysis JSON.
# Strip whatever extension the document has (same rule as
# process_document_pipeline) instead of replacing the literal '.txt': with a
# non-.txt document the old replace() was a no-op, and the summary key derived
# from it then collided with the analysis key.
analysis_key = document_key.replace('uploads/', 'processed/analysis/')
analysis_key = analysis_key.rsplit('.', 1)[0] + '_analysis.json'
s3.put_object(
    Bucket=bucket_name,
    Key=analysis_key,
    Body=json.dumps(analysis_result, indent=2).encode('utf-8'),
    ContentType='application/json'
)
print(f"‚úÖ Analysis saved: s3://{bucket_name}/{analysis_key}")

# Create and save summary report
summary = f"""
Document Analysis Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
==========================================================

Document: {document_key}
Word Count: {analysis_result['metadata']['word_count']}

SENTIMENT ANALYSIS
------------------
Overall Sentiment: {analysis_result['sentiment']['overall']}
Confidence Scores:
  Positive: {analysis_result['sentiment']['scores']['Positive']:.1%}
  Negative: {analysis_result['sentiment']['scores']['Negative']:.1%}
  Neutral:  {analysis_result['sentiment']['scores']['Neutral']:.1%}
  Mixed:    {analysis_result['sentiment']['scores']['Mixed']:.1%}

ENTITIES DETECTED
-----------------
"""

# Group entity texts by entity type, keeping first-seen order of the types.
entity_types = {}
for entity in analysis_result['entities']:
    entity_types.setdefault(entity['type'], []).append(entity['text'])

for etype, items in entity_types.items():
    summary += f"\n{etype}:\n"
    # dict.fromkeys removes duplicates while keeping detection order, so the
    # report is identical from run to run (set() ordering is not).
    for item in dict.fromkeys(items):
        summary += f"  - {item}\n"

summary += "\nKEY PHRASES\n-----------\n"
for phrase in analysis_result['key_phrases'][:10]:
    summary += f"  - {phrase['text']}\n"

# Save summary. Only the trailing '.json' suffix is swapped, so a '.json'
# occurring elsewhere in the key is left alone.
summary_key = analysis_key[:-len('.json')] + '_summary.txt'
s3.put_object(
    Bucket=bucket_name,
    Key=summary_key,
    Body=summary.encode('utf-8')
)
print(f"‚úÖ Summary saved: s3://{bucket_name}/{summary_key}")

# Display the summary
print("\n" + "="*60)
print(summary)
print("="*60)

‚úÖ Text saved: s3://sagemaker-us-east-2-854757836160/document-analysis/processed/text/sample_feedback.txt
‚úÖ Analysis saved: s3://sagemaker-us-east-2-854757836160/document-analysis/processed/analysis/sample_feedback_analysis.json
‚úÖ Summary saved: s3://sagemaker-us-east-2-854757836160/document-analysis/processed/analysis/sample_feedback_analysis_summary.txt


Document Analysis Report
Generated: 2025-11-08 18:55:46

Document: document-analysis/uploads/sample_feedback.txt
Word Count: 65

SENTIMENT ANALYSIS
------------------
Overall Sentiment: POSITIVE
Confidence Scores:
  Positive: 100.0%
  Negative: 0.0%
  Neutral:  0.0%
  Mixed:    0.0%

ENTITIES DETECTED
-----------------

PERSON:
  - John Smith

DATE:
  - November 6, 2024

COMMERCIAL_ITEM:
  - Widget

TITLE:
  - Pro 2000

QUANTITY:
  - Five stars
  - 30 minutes
  - 10/10

KEY PHRASES
-----------
  - Customer Feedback Form

Name
  - John Smith
Date
  - November 6, 2024
  - Product
  - Widget Pro 2000

Feedback
  - this product
  -

**COMPLETE PIPELINE**

In [12]:
import boto3
import json
import time
from datetime import datetime

# Amazon Comprehend's synchronous DetectSentiment call limits its input by
# UTF-8 *bytes*, not characters; the same cap is applied to all three calls.
_COMPREHEND_MAX_BYTES = 5000


def _truncate_utf8(text, max_bytes):
    """Return ``text`` shortened so its UTF-8 encoding fits in ``max_bytes`` bytes."""
    encoded = text.encode('utf-8')
    if len(encoded) <= max_bytes:
        return text
    # The cut may land inside a multi-byte character; errors='ignore' drops
    # the dangling partial character instead of raising UnicodeDecodeError.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')


def process_document_pipeline(bucket, document_key):
    """
    Complete pipeline: extract text -> analyze with Comprehend -> store results.

    Files whose key ends in ``.txt`` (any letter case) are read directly from
    S3 and decoded as UTF-8; every other key is sent to Textract's
    ``detect_document_text``. The text is then analyzed for sentiment,
    entities and key phrases, and three objects are written back to the same
    bucket: the extracted text, the analysis JSON and a plain-text summary.

    Args:
        bucket: S3 bucket name
        document_key: Path to document in S3
            (e.g., 'document-analysis/uploads/file.txt')

    Returns:
        dict: Analysis results, or None if extraction, analysis or saving
        raised an exception (the error is printed, not re-raised).
    """
    print(f"üöÄ Starting pipeline for: {document_key}")
    print("="*60)

    s3 = boto3.client('s3')
    textract = boto3.client('textract')
    comprehend = boto3.client('comprehend')

    # Step 1: Extract text
    print("\nüìÑ Step 1: Extracting text...")

    try:
        # Case-insensitive so that 'REPORT.TXT' is not sent to Textract.
        if document_key.lower().endswith('.txt'):
            # Simple text file - read directly
            response = s3.get_object(Bucket=bucket, Key=document_key)
            extracted_text = response['Body'].read().decode('utf-8')
            print(f"   ‚úì Extracted {len(extracted_text)} characters from text file")
        else:
            # PDF or image - use Textract
            print(f"   Using Textract for: {document_key.split('.')[-1].upper()}")
            response = textract.detect_document_text(
                Document={'S3Object': {'Bucket': bucket, 'Name': document_key}}
            )
            # Keep only LINE blocks, one per output line.
            extracted_text = "".join(
                block['Text'] + '\n'
                for block in response['Blocks']
                if block['BlockType'] == 'LINE'
            )
            print(f"   ‚úì Extracted {len(extracted_text)} characters with Textract")

    except Exception as e:
        print(f"   ‚úó Error extracting text: {e}")
        return None

    # Step 2: Analyze with Comprehend
    print("\nüîç Step 2: Analyzing with Comprehend...")

    try:
        # Cap the request by encoded size: slicing by characters can still
        # exceed the service limit when the text contains multi-byte characters.
        text_to_analyze = _truncate_utf8(extracted_text, _COMPREHEND_MAX_BYTES)
        if len(text_to_analyze) < len(extracted_text):
            print(f"   Note: analyzing only the first {_COMPREHEND_MAX_BYTES} bytes of text")

        # Sentiment
        sentiment_response = comprehend.detect_sentiment(
            Text=text_to_analyze,
            LanguageCode='en'
        )
        print(f"   ‚úì Sentiment: {sentiment_response['Sentiment']}")

        # Entities
        entities_response = comprehend.detect_entities(
            Text=text_to_analyze,
            LanguageCode='en'
        )
        print(f"   ‚úì Found {len(entities_response['Entities'])} entities")

        # Key phrases
        phrases_response = comprehend.detect_key_phrases(
            Text=text_to_analyze,
            LanguageCode='en'
        )
        print(f"   ‚úì Found {len(phrases_response['KeyPhrases'])} key phrases")

    except Exception as e:
        print(f"   ‚úó Error analyzing text: {e}")
        return None

    # Step 3: Compile results
    print("\nüìä Step 3: Compiling results...")

    analysis_result = {
        'document': document_key,
        'timestamp': datetime.now().isoformat(),
        # The full extracted text is stored, even if only a prefix was analyzed.
        'extracted_text': extracted_text,
        'sentiment': {
            'overall': sentiment_response['Sentiment'],
            'scores': sentiment_response['SentimentScore']
        },
        'entities': [
            {
                'text': e['Text'],
                'type': e['Type'],
                'score': e['Score']
            }
            for e in entities_response['Entities']
        ],
        'key_phrases': [
            {
                'text': p['Text'],
                'score': p['Score']
            }
            for p in phrases_response['KeyPhrases']
        ],
        'metadata': {
            'text_length': len(extracted_text),
            'word_count': len(extracted_text.split())
        }
    }

    # Step 4: Save results
    print("\nüíæ Step 4: Saving results to S3...")

    try:
        # Save extracted text
        text_key = document_key.replace('uploads/', 'processed/text/')
        if text_key == document_key:
            # The key is not under 'uploads/', so the replace() was a no-op and
            # writing to text_key would overwrite the source document itself.
            text_key = document_key + '.extracted.txt'
        s3.put_object(
            Bucket=bucket,
            Key=text_key,
            Body=extracted_text.encode('utf-8')
        )
        print(f"   ‚úì Text: {text_key}")

        # Save analysis JSON
        analysis_key = document_key.replace('uploads/', 'processed/analysis/')
        # Remove extension and add _analysis.json
        analysis_key = analysis_key.rsplit('.', 1)[0] + '_analysis.json'
        s3.put_object(
            Bucket=bucket,
            Key=analysis_key,
            Body=json.dumps(analysis_result, indent=2).encode('utf-8'),
            ContentType='application/json'
        )
        print(f"   ‚úì Analysis: {analysis_key}")

        # Create and save summary report. Only the trailing '.json' suffix is
        # swapped, so a '.json' elsewhere in the key is left untouched.
        summary = create_summary_report(analysis_result)
        summary_key = analysis_key[:-len('.json')] + '_summary.txt'
        s3.put_object(
            Bucket=bucket,
            Key=summary_key,
            Body=summary.encode('utf-8')
        )
        print(f"   ‚úì Summary: {summary_key}")

    except Exception as e:
        print(f"   ‚úó Error saving results: {e}")
        return None

    print("\n" + "="*60)
    print("‚úÖ Pipeline complete!")
    print("="*60)

    return analysis_result


def create_summary_report(analysis_result):
    """Generate a human-readable, plain-text summary of one analysis result.

    Args:
        analysis_result: dict with the keys read below: 'timestamp',
            'document', 'metadata' ('word_count', 'text_length'),
            'sentiment' ('overall' plus 'scores' for Positive / Negative /
            Neutral / Mixed), 'entities' (dicts with 'type' and 'text') and
            'key_phrases' (dicts with 'text').

    Returns:
        str: The report. Entities are grouped by type; within a type each
        distinct text is listed once, in first-seen order. At most the first
        ten key phrases are listed.
    """
    report = f"""
Document Analysis Report
Generated: {analysis_result['timestamp']}
==========================================================

Document: {analysis_result['document']}
Word Count: {analysis_result['metadata']['word_count']}
Characters: {analysis_result['metadata']['text_length']}

SENTIMENT ANALYSIS
------------------
Overall Sentiment: {analysis_result['sentiment']['overall']}
Confidence Scores:
  Positive: {analysis_result['sentiment']['scores']['Positive']:.1%}
  Negative: {analysis_result['sentiment']['scores']['Negative']:.1%}
  Neutral:  {analysis_result['sentiment']['scores']['Neutral']:.1%}
  Mixed:    {analysis_result['sentiment']['scores']['Mixed']:.1%}

ENTITIES DETECTED
-----------------
"""

    # Group entity texts by type (dicts keep first-seen order of the types).
    entity_types = {}
    for entity in analysis_result['entities']:
        entity_types.setdefault(entity['type'], []).append(entity['text'])

    if entity_types:
        for etype, items in entity_types.items():
            report += f"\n{etype}:\n"
            # dict.fromkeys de-duplicates while preserving order; iterating a
            # set() here made the line order vary between runs.
            for item in dict.fromkeys(items):
                report += f"  - {item}\n"
    else:
        report += "\nNo entities detected.\n"

    report += "\nKEY PHRASES\n-----------\n"
    if analysis_result['key_phrases']:
        for phrase in analysis_result['key_phrases'][:10]:
            report += f"  - {phrase['text']}\n"
    else:
        report += "No key phrases detected.\n"

    return report


# Smoke test: run the whole pipeline on the sample document read earlier.
print("Testing complete pipeline...\n")

result = process_document_pipeline(bucket=bucket_name, document_key=document_key)

# The pipeline returns None when any stage fails, so summarize only on success.
if result:
    print(
        "\nüìã Quick Summary:",
        f"  Sentiment: {result['sentiment']['overall']}",
        f"  Entities: {len(result['entities'])}",
        f"  Key Phrases: {len(result['key_phrases'])}",
        sep="\n",
    )

Testing complete pipeline...

üöÄ Starting pipeline for: document-analysis/uploads/sample_feedback.txt

üìÑ Step 1: Extracting text...
   ‚úì Extracted 428 characters from text file

üîç Step 2: Analyzing with Comprehend...
   ‚úì Sentiment: POSITIVE
   ‚úì Found 7 entities
   ‚úì Found 17 key phrases

üìä Step 3: Compiling results...

üíæ Step 4: Saving results to S3...
   ‚úì Text: document-analysis/processed/text/sample_feedback.txt
   ‚úì Analysis: document-analysis/processed/analysis/sample_feedback_analysis.json
   ‚úì Summary: document-analysis/processed/analysis/sample_feedback_analysis_summary.txt

‚úÖ Pipeline complete!

üìã Quick Summary:
  Sentiment: POSITIVE
  Entities: 7
  Key Phrases: 17


**Adding a second document**

In [13]:
# Create a second sample document (negative feedback)
negative_feedback = """
Customer Complaint

Name: Jane Doe
Date: November 6, 2024
Product: Widget Pro 2000

Feedback:
I'm very disappointed with this product. The quality is poor and it broke 
after just two days of use. Customer service was unhelpful and took forever 
to respond. I would not recommend this to anyone. Complete waste of money.

Would you recommend to others? No
Overall satisfaction: 2/10
"""

# Write with an explicit encoding: the pipeline decodes the object as UTF-8,
# so the file must not depend on the platform's default encoding.
with open('negative_feedback.txt', 'w', encoding='utf-8') as f:
    f.write(negative_feedback)

# Upload to S3
s3.upload_file(
    'negative_feedback.txt',
    bucket_name,
    'document-analysis/uploads/negative_feedback.txt'
)

print("‚úÖ Second document uploaded")

# Process both documents
documents = [
    'document-analysis/uploads/sample_feedback.txt',
    'document-analysis/uploads/negative_feedback.txt'
]

results = []
for doc in documents:
    print(f"\n{'='*70}\n")
    result = process_document_pipeline(bucket_name, doc)
    # The pipeline returns None on failure; keep only successful analyses.
    if result:
        results.append(result)
    time.sleep(1)  # Brief pause between documents

# Aggregate analysis
print(f"\n\n{'='*70}")
print("AGGREGATE ANALYSIS")
print("="*70)
print(f"Total documents processed: {len(results)}")

# Tally every label the pipeline reported. MIXED is included so that the
# distribution always adds up to the number of processed documents.
sentiment_tally = {'POSITIVE': 0, 'NEGATIVE': 0, 'NEUTRAL': 0, 'MIXED': 0}
for r in results:
    label = r['sentiment']['overall']
    sentiment_tally[label] = sentiment_tally.get(label, 0) + 1

positive_count = sentiment_tally['POSITIVE']
negative_count = sentiment_tally['NEGATIVE']
neutral_count = sentiment_tally['NEUTRAL']
mixed_count = sentiment_tally['MIXED']

print("\nSentiment Distribution:")
print(f"  Positive: {positive_count}")
print(f"  Negative: {negative_count}")
print(f"  Neutral:  {neutral_count}")
print(f"  Mixed:    {mixed_count}")

# Guard the average: if every document failed, `results` is empty and the
# division would raise ZeroDivisionError.
if results:
    avg_entities = sum(len(r['entities']) for r in results) / len(results)
    print(f"\nAverage entities per document: {avg_entities:.1f}")
else:
    print("\nNo documents were processed successfully; skipping averages.")

‚úÖ Second document uploaded


üöÄ Starting pipeline for: document-analysis/uploads/sample_feedback.txt

üìÑ Step 1: Extracting text...
   ‚úì Extracted 428 characters from text file

üîç Step 2: Analyzing with Comprehend...
   ‚úì Sentiment: POSITIVE
   ‚úì Found 7 entities
   ‚úì Found 17 key phrases

üìä Step 3: Compiling results...

üíæ Step 4: Saving results to S3...
   ‚úì Text: document-analysis/processed/text/sample_feedback.txt
   ‚úì Analysis: document-analysis/processed/analysis/sample_feedback_analysis.json
   ‚úì Summary: document-analysis/processed/analysis/sample_feedback_analysis_summary.txt

‚úÖ Pipeline complete!


üöÄ Starting pipeline for: document-analysis/uploads/negative_feedback.txt

üìÑ Step 1: Extracting text...
   ‚úì Extracted 384 characters from text file

üîç Step 2: Analyzing with Comprehend...
   ‚úì Sentiment: NEGATIVE
   ‚úì Found 5 entities
   ‚úì Found 12 key phrases

üìä Step 3: Compiling results...

üíæ Step 4: Saving results to S3...
   

**More Sample Documents**

In [14]:
import boto3

# S3 client used by the upload loop that follows these definitions.
s3 = boto3.client('s3')

# The literals below are the raw text of five sample documents. Each one
# starts and ends with a newline because of the triple-quote layout.

# Sample Document 2: Negative Feedback
# NOTE: rebinds `negative_feedback` from the previous cell with a longer text;
# it is uploaded under the same file name, replacing the earlier version.
negative_feedback = """
Customer Complaint Form

Name: Jane Doe
Date: November 6, 2024
Product: Widget Pro 2000
Order ID: WP-98765

Complaint:
I'm extremely disappointed with this product. The quality is terrible and it 
broke after just two days of normal use. I contacted customer service three 
times and received no helpful response. The wait time was over an hour each call.

This is completely unacceptable for a product at this price point. I demand a 
full refund immediately. I will never purchase from this company again and will 
be sharing my experience on social media.

Would you recommend to others? Absolutely not
Overall satisfaction: 1/10
Status: URGENT - Awaiting refund
"""

# Sample Document 3: Neutral/Mixed Review (pros and cons in equal measure)
neutral_review = """
Product Review

Reviewer: Mike Johnson
Date: November 5, 2024
Product: Widget Pro 2000
Rating: 3/5 stars

Review:
The Widget Pro 2000 is an okay product with both strengths and weaknesses. 

Pros:
- Easy to set up and use
- Reasonable price point
- Nice design and build quality

Cons:
- Battery life could be better
- Limited features compared to competitors
- Instructions were unclear in some sections

Overall, it's a decent product but nothing exceptional. It gets the job done 
for basic use cases. I might consider other options if I were buying again, 
but it's not bad for the price.

Would I recommend? Maybe, depends on your needs
Overall: Average product, meets expectations but doesn't exceed them
"""

# Sample Document 4: Technical Support Ticket (IDs, versions, a location)
support_ticket = """
Technical Support Ticket

Ticket ID: SUP-45123
Customer: Sarah Williams
Product: Widget Pro 2000
Date Opened: November 4, 2024
Priority: High

Issue Description:
Device fails to connect to WiFi network after latest firmware update (v2.3.1).
Error message displays: "Connection timeout - unable to reach server"

Steps already taken:
1. Restarted device multiple times
2. Reset network settings
3. Tried different WiFi networks
4. Uninstalled and reinstalled app

System Information:
- Firmware: v2.3.1
- App Version: 4.2.0
- Device Model: WP2000-X
- Location: Seattle, Washington

Customer is frustrated as device is needed for work. Requesting urgent 
technical assistance or replacement unit.

Status: Open - Awaiting Technical Team Response
"""

# Sample Document 5: Positive Review from Employee (quantities, organizations)
employee_feedback = """
Internal Product Feedback - Q4 2024

Employee: Robert Chen
Department: Sales
Region: West Coast
Date: November 1, 2024

Product Assessment: Widget Pro 2000

I've been demonstrating the Widget Pro 2000 to clients for the past quarter 
and the response has been overwhelmingly positive. Customers love the intuitive 
interface and the reliability of the device.

Key Strengths:
- Easy to demonstrate features
- Strong value proposition
- Excellent customer satisfaction post-purchase
- Low return rate (under 2%)

The product has helped me close 15 deals this quarter, representing $250,000 
in revenue. The Amazon integration and Microsoft Office compatibility are 
particularly impressive selling points.

Recommendation: Continue investment in this product line. Consider expanding 
the lineup with additional models for enterprise customers.

Overall Assessment: Excellent product with strong market potential
Confidence Level: Very High
"""

# Sample Document 6: Short Customer Email (the shortest of the samples)
customer_email = """
Subject: Quick Question

Hi there,

I received my Widget Pro 2000 yesterday. Setup was super easy! 

One question - does it work with Google Home? The manual doesn't mention it.

Thanks!
Lisa Martinez
New York, NY
"""

# Map of local file name -> document text for the upload loop.
# NOTE: `documents` was a list of S3 keys in the previous cell; from here on
# it is this dict.
documents = {
    'negative_feedback.txt': negative_feedback,
    'neutral_review.txt': neutral_review,
    'support_ticket.txt': support_ticket,
    'employee_feedback.txt': employee_feedback,
    'customer_email.txt': customer_email
}

# Save and upload all documents
print("Creating and uploading sample documents...\n")

for filename, content in documents.items():
    # Save locally. The encoding is explicit because the pipeline later
    # decodes these objects as UTF-8, whatever the platform default is.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

    # Upload to S3
    s3_key = f'document-analysis/uploads/{filename}'
    s3.upload_file(
        filename,
        bucket_name,
        s3_key
    )

    print(f"‚úÖ Uploaded: {filename}")

print(f"\n‚úÖ Created and uploaded {len(documents)} new documents")
print("\nAll documents in S3:")

# List all documents in uploads folder. A paginator is used because a single
# list_objects_v2 call returns at most one page of keys.
uploads_paginator = s3.get_paginator('list_objects_v2')
for page in uploads_paginator.paginate(
    Bucket=bucket_name,
    Prefix='document-analysis/uploads/'
):
    # Pages for an empty prefix carry no 'Contents' key at all.
    for obj in page.get('Contents', []):
        # Skip zero-byte "folder" placeholder keys, as the other listing cells do.
        if obj['Key'].endswith('/'):
            continue
        object_name = obj['Key'].split('/')[-1]
        size_kb = obj['Size'] / 1024
        print(f"  ‚Ä¢ {object_name} ({size_kb:.1f} KB)")

Creating and uploading sample documents...

‚úÖ Uploaded: negative_feedback.txt
‚úÖ Uploaded: neutral_review.txt
‚úÖ Uploaded: support_ticket.txt
‚úÖ Uploaded: employee_feedback.txt
‚úÖ Uploaded: customer_email.txt

‚úÖ Created and uploaded 5 new documents

All documents in S3:
  ‚Ä¢ customer_email.txt (0.2 KB)
  ‚Ä¢ employee_feedback.txt (0.9 KB)
  ‚Ä¢ negative_feedback.txt (0.7 KB)
  ‚Ä¢ neutral_review.txt (0.7 KB)
  ‚Ä¢ sample_feedback.txt (0.4 KB)
  ‚Ä¢ support_ticket.txt (0.7 KB)


**Process Documents**

In [16]:
import time

# Get all documents in uploads folder. A paginator is used so that prefixes
# holding more than one page of keys are processed completely.
document_keys = []
uploads_paginator = s3.get_paginator('list_objects_v2')
for page in uploads_paginator.paginate(
    Bucket=bucket_name,
    Prefix='document-analysis/uploads/'
):
    # Pages for an empty prefix carry no 'Contents' key; keys ending in '/'
    # are folder placeholders, not documents.
    document_keys.extend(
        obj['Key'] for obj in page.get('Contents', [])
        if not obj['Key'].endswith('/')
    )

# Defined up front so the aggregate cell finds `results` even when the
# uploads folder is empty.
results = []

if not document_keys:
    print("No documents found!")
else:
    print(f"Found {len(document_keys)} documents to process\n")
    print("="*70)

    for i, doc_key in enumerate(document_keys, 1):
        print(f"\n\nüìÑ Processing Document {i}/{len(document_keys)}")
        print("="*70)

        try:
            result = process_document_pipeline(bucket_name, doc_key)
            # The pipeline returns None on failure; keep only successful runs.
            if result:
                results.append(result)
            time.sleep(1)  # Brief pause between documents
        except Exception as e:
            # Anything the pipeline did not handle itself: report it and move
            # on to the next document.
            print(f"‚ùå Error processing {doc_key}: {e}")

    print("\n\n" + "="*70)
    print("üéâ BATCH PROCESSING COMPLETE")
    print("="*70)
    print(f"\n‚úÖ Successfully processed {len(results)}/{len(document_keys)} documents")

Found 6 documents to process



üìÑ Processing Document 1/6
üöÄ Starting pipeline for: document-analysis/uploads/customer_email.txt

üìÑ Step 1: Extracting text...
   ‚úì Extracted 215 characters from text file

üîç Step 2: Analyzing with Comprehend...
   ‚úì Sentiment: POSITIVE
   ‚úì Found 6 entities
   ‚úì Found 9 key phrases

üìä Step 3: Compiling results...

üíæ Step 4: Saving results to S3...
   ‚úì Text: document-analysis/processed/text/customer_email.txt
   ‚úì Analysis: document-analysis/processed/analysis/customer_email_analysis.json
   ‚úì Summary: document-analysis/processed/analysis/customer_email_analysis_summary.txt

‚úÖ Pipeline complete!


üìÑ Processing Document 2/6
üöÄ Starting pipeline for: document-analysis/uploads/employee_feedback.txt

üìÑ Step 1: Extracting text...
   ‚úì Extracted 941 characters from text file

üîç Step 2: Analyzing with Comprehend...
   ‚úì Sentiment: POSITIVE
   ‚úì Found 14 entities
   ‚úì Found 39 key phrases

üìä Step 3: Compili

**Aggregate Analysis**

In [17]:
# Aggregate statistics over `results` (the list of per-document analysis dicts
# collected by the batch-processing cell) and save a combined report to S3.
print("\n\n" + "="*70)
print("üìä AGGREGATE ANALYSIS")
print("="*70)

if not results:
    # Every statistic below divides by len(results) or takes max(results);
    # both fail on an empty list, so stop here with an explanation instead.
    print("\nNo processed documents to aggregate - run the batch processing cell first.")
else:
    # Sentiment distribution
    sentiment_counts = {}
    for r in results:
        sentiment = r['sentiment']['overall']
        sentiment_counts[sentiment] = sentiment_counts.get(sentiment, 0) + 1

    print("\nüé≠ Sentiment Distribution:")
    for sentiment, count in sorted(sentiment_counts.items()):
        percentage = (count / len(results)) * 100
        print(f"  {sentiment}: {count} documents ({percentage:.1f}%)")

    # Entity analysis: count occurrences of each entity type across documents
    all_entity_types = {}
    for r in results:
        for entity in r['entities']:
            etype = entity['type']
            all_entity_types[etype] = all_entity_types.get(etype, 0) + 1

    print("\nüè∑Ô∏è Most Common Entity Types:")
    sorted_entities = sorted(all_entity_types.items(), key=lambda x: x[1], reverse=True)
    for etype, count in sorted_entities[:5]:
        print(f"  {etype}: {count} occurrences")

    # Document statistics
    total_words = sum(r['metadata']['word_count'] for r in results)
    avg_words = total_words / len(results)
    total_entities = sum(len(r['entities']) for r in results)
    avg_entities = total_entities / len(results)
    total_phrases = sum(len(r['key_phrases']) for r in results)
    avg_phrases = total_phrases / len(results)

    print("\nüìà Document Statistics:")
    print(f"  Total documents: {len(results)}")
    print(f"  Total words: {total_words:,}")
    print(f"  Average words per document: {avg_words:.0f}")
    print(f"  Average entities per document: {avg_entities:.1f}")
    print(f"  Average key phrases per document: {avg_phrases:.1f}")

    # Identify extremes: highest Positive and highest Negative confidence
    most_positive = max(results, key=lambda x: x['sentiment']['scores']['Positive'])
    most_negative = max(results, key=lambda x: x['sentiment']['scores']['Negative'])

    print("\nüîç Notable Documents:")
    print(f"  Most positive: {most_positive['document'].split('/')[-1]}")
    print(f"    Positive confidence: {most_positive['sentiment']['scores']['Positive']:.1%}")
    print(f"  Most negative: {most_negative['document'].split('/')[-1]}")
    print(f"    Negative confidence: {most_negative['sentiment']['scores']['Negative']:.1%}")

    # Build the aggregate report; the sections mirror the printed output above.
    aggregate_report = f"""
AGGREGATE DOCUMENT ANALYSIS REPORT
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
==========================================================

SUMMARY
-------
Total Documents Processed: {len(results)}
Total Words: {total_words:,}
Average Words per Document: {avg_words:.0f}

SENTIMENT DISTRIBUTION
----------------------
"""

    for sentiment, count in sorted(sentiment_counts.items()):
        percentage = (count / len(results)) * 100
        aggregate_report += f"{sentiment}: {count} documents ({percentage:.1f}%)\n"

    aggregate_report += f"""

ENTITY ANALYSIS
---------------
Total Entities Detected: {total_entities}
Average per Document: {avg_entities:.1f}

Most Common Entity Types:
"""

    for etype, count in sorted_entities[:5]:
        aggregate_report += f"  {etype}: {count}\n"

    aggregate_report += f"""

KEY INSIGHTS
------------
- Most positive document: {most_positive['document'].split('/')[-1]} ({most_positive['sentiment']['scores']['Positive']:.1%} confidence)
- Most negative document: {most_negative['document'].split('/')[-1]} ({most_negative['sentiment']['scores']['Negative']:.1%} confidence)
- Average key phrases per document: {avg_phrases:.1f}

DOCUMENTS PROCESSED
-------------------
"""

    for r in results:
        filename = r['document'].split('/')[-1]
        sentiment = r['sentiment']['overall']
        aggregate_report += f"- {filename}: {sentiment}\n"

    # Save aggregate report to S3
    aggregate_key = 'document-analysis/processed/analysis/aggregate_report.txt'
    s3.put_object(
        Bucket=bucket_name,
        Key=aggregate_key,
        Body=aggregate_report.encode('utf-8')
    )

    print(f"\nüíæ Aggregate report saved: s3://{bucket_name}/{aggregate_key}")
    print("\n" + aggregate_report)



üìä AGGREGATE ANALYSIS

üé≠ Sentiment Distribution:
  MIXED: 1 documents (16.7%)
  NEGATIVE: 1 documents (16.7%)
  NEUTRAL: 1 documents (16.7%)
  POSITIVE: 3 documents (50.0%)

üè∑Ô∏è Most Common Entity Types:
  QUANTITY: 14 occurrences
  DATE: 9 occurrences
  COMMERCIAL_ITEM: 6 occurrences
  PERSON: 6 occurrences
  TITLE: 6 occurrences

üìà Document Statistics:
  Total documents: 6
  Total words: 564
  Average words per document: 94
  Average entities per document: 8.5
  Average key phrases per document: 25.3

üîç Notable Documents:
  Most positive: sample_feedback.txt
    Positive confidence: 100.0%
  Most negative: negative_feedback.txt
    Negative confidence: 100.0%

üíæ Aggregate report saved: s3://sagemaker-us-east-2-854757836160/document-analysis/processed/analysis/aggregate_report.txt


AGGREGATE DOCUMENT ANALYSIS REPORT
Generated: 2025-11-08 19:21:52

SUMMARY
-------
Total Documents Processed: 6
Total Words: 564
Average Words per Document: 94

SENTIMENT DISTRIBUTION
-

**Verify Results**

In [20]:
print("\n\n" + "="*70)
print("üìÅ FINAL FILE STRUCTURE")
print("="*70)

# (heading, S3 prefix) for each folder to list, in display order:
# uploads, processed text, analysis results.
listing_sections = (
    ("\nüì§ Uploads:", 'document-analysis/uploads/'),
    ("\nüìÑ Processed Text:", 'document-analysis/processed/text/'),
    ("\nüìä Analysis Results:", 'document-analysis/processed/analysis/'),
)

for heading, prefix in listing_sections:
    print(heading)
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    # 'Contents' is absent from the response when nothing matches the prefix.
    for obj in response.get('Contents', []):
        # Keys ending in '/' are folder placeholders, not files.
        if obj['Key'].endswith('/'):
            continue
        print(f"  ‚Ä¢ {obj['Key'].split('/')[-1]}")

print("\n‚úÖ Project structure complete!")



üìÅ FINAL FILE STRUCTURE

üì§ Uploads:
  ‚Ä¢ customer_email.txt
  ‚Ä¢ employee_feedback.txt
  ‚Ä¢ negative_feedback.txt
  ‚Ä¢ neutral_review.txt
  ‚Ä¢ sample_feedback.txt
  ‚Ä¢ support_ticket.txt

üìÑ Processed Text:
  ‚Ä¢ customer_email.txt
  ‚Ä¢ employee_feedback.txt
  ‚Ä¢ negative_feedback.txt
  ‚Ä¢ neutral_review.txt
  ‚Ä¢ sample_feedback.txt
  ‚Ä¢ support_ticket.txt

üìä Analysis Results:
  ‚Ä¢ aggregate_report.txt
  ‚Ä¢ customer_email_analysis.json
  ‚Ä¢ customer_email_analysis_summary.txt
  ‚Ä¢ employee_feedback_analysis.json
  ‚Ä¢ employee_feedback_analysis_summary.txt
  ‚Ä¢ negative_feedback_analysis.json
  ‚Ä¢ negative_feedback_analysis_summary.txt
  ‚Ä¢ neutral_review_analysis.json
  ‚Ä¢ neutral_review_analysis_summary.txt
  ‚Ä¢ sample_feedback_analysis.json
  ‚Ä¢ sample_feedback_analysis_summary.txt
  ‚Ä¢ support_ticket_analysis.json
  ‚Ä¢ support_ticket_analysis_summary.txt

‚úÖ Project structure complete!
