<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [4]</a>'.</span>

In [1]:
import os
# Report the image identifiers exposed through environment variables;
# a variable that is absent is reported as 'Not set'.
for env_name in ('IMAGE_VERSION', 'SAGEMAKER_INTERNAL_IMAGE_URI'):
    print(f"{env_name}: {os.environ.get(env_name, 'Not set')}")

IMAGE_VERSION: 2.8.5-cpu
SAGEMAKER_INTERNAL_IMAGE_URI: Not set


In [2]:
from datetime import datetime
from zoneinfo import ZoneInfo
# Capture the current wall-clock time in the America/New_York zone, then
# print it with the zone abbreviation that strftime's %Z resolves for that instant.
execution_start = datetime.now(ZoneInfo('America/New_York'))
print(f"Execution Start Time (EST): {execution_start.strftime('%Y-%m-%d %H:%M:%S %Z')}")


Execution Start Time (EST): 2025-11-12 10:26:05 EST


In [3]:
# Package upgrade skipped - using environment default
# No install or upgrade step runs in this cell: it only prints a notice, so the
# notebook uses whichever sagemaker_studio version is already installed.
print('Using default sagemaker_studio package from environment')


Using default sagemaker_studio package from environment


# Test 11: Exploring Movie Ticket Sales with DynamoDB

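This notebook generates a synthetic movie ticket sales dataset, loads it into a DynamoDB table, and queries it with SQL as a starting point for analyzing ticket sales and predicting future sales.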

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [4]:
# NOTE(review): in the recorded run this import raised ImportError because the
# installed sagemaker_studio package does not expose `sqlutils`. Presumably a
# different package version is required - confirm before re-running. The SQL
# cells at the end of the notebook call sqlutils.sql and depend on this import.
from sagemaker_studio import sqlutils

ImportError: cannot import name 'sqlutils' from 'sagemaker_studio' (/opt/conda/lib/python3.11/site-packages/sagemaker_studio/__init__.py)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Fixed anchor for release dates. Using datetime.now() here made release_date,
# the month/weekday flags derived from it, and therefore ticket_sales_millions
# change from one day to the next even though both RNGs are seeded.
REFERENCE_DATE = datetime(2025, 1, 1)

# Generate comprehensive movie ticket sales dataset
n_records = 1000

# Movie attributes
genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Romance', 'Thriller', 'Animation', 'Documentary', 'Sci-Fi', 'Adventure']
ratings = ['G', 'PG', 'PG-13', 'R', 'NR']
studios = ['Disney', 'Warner Bros', 'Universal', 'Sony', 'Paramount', 'Fox', 'MGM', 'Lionsgate', 'Netflix', 'Amazon']
theaters = ['AMC', 'Regal', 'Cinemark', 'Landmark', 'Alamo Drafthouse', 'Marcus', 'Showcase', 'Century']

# Generate base data.
# NOTE: the order of the random draws below determines every generated value;
# reordering or inserting draws changes the whole dataset.
data = []
for i in range(n_records):
    # Movie identifiers (DynamoDB friendly): MOVIE_0001 ... MOVIE_1000
    movie_id = f"MOVIE_{i+1:04d}"

    # Movie attributes
    genre = random.choice(genres)
    rating = random.choice(ratings)
    studio = random.choice(studios)
    theater_chain = random.choice(theaters)

    # Release date: 1 to 730 days before the fixed reference date
    release_date = REFERENCE_DATE - timedelta(days=random.randint(1, 730))

    # Budget and production factors
    budget_millions = np.random.lognormal(mean=2.5, sigma=0.8)  # Log-normal distribution for budget
    star_power_score = np.random.randint(1, 11)  # 1-10 scale
    director_reputation = np.random.randint(1, 11)  # 1-10 scale
    sequel_flag = random.choice([0, 1])
    franchise_flag = random.choice([0, 1])

    # Marketing and distribution
    marketing_spend_millions = budget_millions * np.random.uniform(0.3, 0.8)
    num_theaters = np.random.randint(500, 4500)
    num_screens = num_theaters * np.random.uniform(1.2, 3.5)

    # Seasonal and timing factors
    is_summer_release = 1 if release_date.month in [6, 7, 8] else 0
    is_holiday_release = 1 if release_date.month in [11, 12] else 0
    is_weekend_release = 1 if release_date.weekday() >= 5 else 0  # Saturday=5, Sunday=6

    # Competition factors
    num_competing_movies = np.random.poisson(3)  # Average 3 competing movies
    avg_competitor_rating = np.random.uniform(5.0, 8.5)

    # Social media and reviews
    social_media_mentions_k = np.random.lognormal(mean=3, sigma=1)  # In thousands
    critic_score = np.random.uniform(10, 100)
    audience_score = np.random.uniform(20, 95)

    # Theater and location factors
    avg_ticket_price = np.random.uniform(8.50, 16.00)
    premium_format_pct = np.random.uniform(0.15, 0.45)  # IMAX, Dolby, etc.
    urban_theater_pct = np.random.uniform(0.4, 0.8)

    # Target variable: a weighted sum of the features above. Competition terms
    # are subtracted; everything else adds to sales.
    base_sales = (
        budget_millions * 0.5 +
        star_power_score * 2 +
        director_reputation * 1.5 +
        sequel_flag * 15 +
        franchise_flag * 10 +
        marketing_spend_millions * 0.8 +
        (num_theaters / 100) +
        is_summer_release * 8 +
        is_holiday_release * 6 +
        is_weekend_release * 3 +
        (critic_score / 10) +
        (audience_score / 10) +
        (social_media_mentions_k / 10) +
        premium_format_pct * 20 -
        num_competing_movies * 2 -
        (avg_competitor_rating - 6) * 3
    )

    # Add Gaussian noise (std 5) and floor the result at 0.1 so it stays positive
    ticket_sales_millions = max(0.1, base_sales + np.random.normal(0, 5))

    # Create record (all values as strings or numbers for DynamoDB compatibility)
    record = {
        'movie_id': movie_id,  # Primary key for DynamoDB
        'genre': genre,
        'rating': rating,
        'studio': studio,
        'theater_chain': theater_chain,
        'release_date': release_date.strftime('%Y-%m-%d'),
        'budget_millions': round(budget_millions, 2),
        'star_power_score': star_power_score,
        'director_reputation': director_reputation,
        'sequel_flag': sequel_flag,
        'franchise_flag': franchise_flag,
        'marketing_spend_millions': round(marketing_spend_millions, 2),
        'num_theaters': int(num_theaters),
        'num_screens': int(num_screens),
        'is_summer_release': is_summer_release,
        'is_holiday_release': is_holiday_release,
        'is_weekend_release': is_weekend_release,
        'num_competing_movies': num_competing_movies,
        'avg_competitor_rating': round(avg_competitor_rating, 1),
        'social_media_mentions_k': round(social_media_mentions_k, 1),
        'critic_score': round(critic_score, 1),
        'audience_score': round(audience_score, 1),
        'avg_ticket_price': round(avg_ticket_price, 2),
        'premium_format_pct': round(premium_format_pct, 3),
        'urban_theater_pct': round(urban_theater_pct, 3),
        'ticket_sales_millions': round(ticket_sales_millions, 2)  # Target variable
    }

    data.append(record)

# Create DataFrame (one row per generated record)
df = pd.DataFrame(data)

# Display dataset info
print("Movie Ticket Sales Dataset Created!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nTarget variable statistics:")
print(df['ticket_sales_millions'].describe())

# Show sample records
print("\nSample records:")
df.head()

In [None]:
import boto3
import pandas as pd
from botocore.exceptions import ClientError
import json
from decimal import Decimal

# Initialize DynamoDB resource
# No region or credentials are passed here, so boto3 resolves them itself -
# presumably from the notebook environment; verify for the target account.
dynamodb = boto3.resource('dynamodb')
# Name of the table that the cells below create (if missing), populate and scan
table_name = 'movies'

# Check if table exists, if not create it
def create_table_if_not_exists():
    """Return the DynamoDB table named by ``table_name``, creating it when absent.

    The table is probed with ``load()``. If that fails with a
    ``ResourceNotFoundException``, a new on-demand table keyed on the string
    attribute ``movie_id`` is created and the call blocks until it exists.

    Returns:
        The table object, either the existing one or the newly created one.

    Raises:
        ClientError: any error from the probe other than
            ``ResourceNotFoundException`` is re-raised unchanged.
    """
    try:
        existing_table = dynamodb.Table(table_name)
        existing_table.load()  # Raises ClientError when the table does not exist
    except ClientError as error:
        # Anything other than "table is missing" is not ours to handle
        if error.response['Error']['Code'] != 'ResourceNotFoundException':
            raise error

        print(f"Table '{table_name}' not found. Creating table...")

        # movie_id (string) is the partition key and the only key attribute
        key_schema = [{'AttributeName': 'movie_id', 'KeyType': 'HASH'}]
        attribute_definitions = [{'AttributeName': 'movie_id', 'AttributeType': 'S'}]
        new_table = dynamodb.create_table(
            TableName=table_name,
            KeySchema=key_schema,
            AttributeDefinitions=attribute_definitions,
            BillingMode='PAY_PER_REQUEST',  # On-demand billing
        )

        # Block until the new table is reported as existing
        print("Waiting for table to be created...")
        new_table.wait_until_exists()
        print(f"Table '{table_name}' created successfully!")
        return new_table
    else:
        print(f"Table '{table_name}' already exists")
        return existing_table

# Helper function to convert pandas dtypes to DynamoDB compatible types
def convert_to_dynamodb_format(value):
    """Convert a single dataframe cell into a value suitable for a DynamoDB item.

    Args:
        value: A scalar taken from a dataframe row (Python or NumPy scalar,
            string, or a missing value such as None/NaN).

    Returns:
        None for missing values (callers skip these attributes), a Python
        ``bool`` for booleans, a ``Decimal`` for integers and floats, and
        ``str(value)`` for everything else.
    """
    if pd.isna(value):
        return None
    # Booleans must be tested before numbers: bool is a subclass of int, so the
    # numeric branch would otherwise evaluate Decimal('True') and raise.
    if pd.api.types.is_bool(value):
        return bool(value)
    # is_integer / is_float also recognise NumPy scalars (np.int64, np.float64),
    # which a plain isinstance(value, int) check misses for integers.
    if pd.api.types.is_integer(value) or pd.api.types.is_float(value):
        # Go through str() so floats keep their short repr instead of the
        # full binary expansion Decimal(float) would produce.
        return Decimal(str(value))
    return str(value)

# Resolve the target table, creating it on first use
table = create_table_if_not_exists()

# Build one DynamoDB item per dataframe row; attributes whose converted
# value is None are left out of the item
items_to_write = []
for _, row in df.iterrows():
    converted_pairs = (
        (column, convert_to_dynamodb_format(row[column])) for column in df.columns
    )
    items_to_write.append(
        {column: value for column, value in converted_pairs if value is not None}
    )

# Batch write items to DynamoDB (DynamoDB batch_writer handles batching automatically)
item_count = len(items_to_write)
print(f"Writing {item_count} items to DynamoDB table '{table_name}'...")

with table.batch_writer() as batch:
    for item in items_to_write:
        batch.put_item(Item=item)

print(f"Successfully wrote {item_count} items to DynamoDB table '{table_name}'!")

# Sanity check: scan with a limit of 5 and print whatever comes back
print("\nFirst 5 items in the table:")
response = table.scan(Limit=5)
for position, item in enumerate(response['Items'], start=1):
    print(f"Item {position}: {dict(item)}")

In [None]:
# Preview: fetch ten rows from `movies` through the SQL connection
preview_query = "select * from movies limit 10"
sql_output_d0we = sqlutils.sql(preview_query, connection_id="c2o2qti8uldqxc")
sql_output_d0we

In [None]:
# Filter: rows from `movies` whose budget_millions exceeds 25
high_budget_query = "select * from movies where budget_millions > 25"
sql_output_6fiu = sqlutils.sql(high_budget_query, connection_id="c2o2qti8uldqxc")
sql_output_6fiu