In [None]:
# Check all field lengths against database limits
# exec(open('check_all_field_lengths.py').read())

Checking all text fields against database schema limits...

üìä Found 7 field length violations:

üî¥ Listing 2: amenity_code
   Actual: 300 chars | Max: 255 chars | Excess: 45

üî¥ Listing 4: amenity_code
   Actual: 300 chars | Max: 255 chars | Excess: 45

üî¥ Listing 23: amenity_code
   Actual: 300 chars | Max: 255 chars | Excess: 45

üî¥ Listing 78: amenity_code
   Actual: 300 chars | Max: 255 chars | Excess: 45

üî¥ Listing 93: amenity_code
   Actual: 296 chars | Max: 255 chars | Excess: 41

üî¥ Listing 93: amenity_code
   Actual: 275 chars | Max: 255 chars | Excess: 20

üî¥ Listing 99: amenity_code
   Actual: 285 chars | Max: 255 chars | Excess: 30


üí° Solution: All violations need truncation in etl_airbnb_to_postgres.py


In [None]:
import psycopg2
from psycopg2 import sql
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional
import sys
from dotenv import load_dotenv
import os
load_dotenv()

‚úì All libraries imported successfully


In [None]:
DB_CONFIG = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'database': os.getenv('DB_NAME', 'airbnb_db'),
    'user': os.getenv('DB_USER', 'postgres'),
    'password': os.getenv('DB_PASSWORD'),
    'port': int(os.getenv('DB_PORT', '5432'))
}

# File paths
JSON_FILE = 'Resources/airbnb_beltline_calgary_listings_100.json'
SCHEMA_FILE = 'database_normalized_schema.sql'

print("‚úì Configuration loaded")
print(f"  Host: {DB_CONFIG['host']}")
print(f"  Database: {DB_CONFIG['database']}")
print(f"  User: {DB_CONFIG['user']}")
print(f"  Port: {DB_CONFIG['port']}")

‚úì Configuration loaded
  Host: localhost
  Database: airbnb_db
  User: postgres
  Port: 5432


---
## Create Database

This creates the `airbnb_db` database if it doesn't already exist.

In [4]:
def create_database():
    """
    Create the airbnb_db database if it doesn't exist.
    
    Returns
    -------
    bool
        True if successful, False otherwise
    """
    try:
        # Connect to default postgres database
        conn = psycopg2.connect(
            host=DB_CONFIG['host'],
            database='postgres',  # Connect to default DB first
            user=DB_CONFIG['user'],
            password=DB_CONFIG['password'],
            port=DB_CONFIG['port']
        )
        conn.autocommit = True
        cursor = conn.cursor()
        
        print(f"üîÑ Checking if database '{DB_CONFIG['database']}' exists...")
        
        # Check if database exists
        cursor.execute(
            "SELECT 1 FROM pg_database WHERE datname = %s",
            (DB_CONFIG['database'],)
        )
        
        if cursor.fetchone():
            print(f"‚ÑπÔ∏è  Database '{DB_CONFIG['database']}' already exists")
        else:
            # Create database
            cursor.execute(
                sql.SQL("CREATE DATABASE {}").format(
                    sql.Identifier(DB_CONFIG['database'])
                )
            )
            print(f"‚úÖ Database '{DB_CONFIG['database']}' created successfully")
        
        cursor.close()
        conn.close()
        return True
        
    except psycopg2.Error as e:
        print(f"‚ùå Database error: {e}")
        return False
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return False

# Execute database creation
if create_database():
    print("\n‚úÖ Step 4 Complete: Database ready")
else:
    print("\n‚ùå Step 4 Failed: Check your PostgreSQL credentials")

üîÑ Checking if database 'airbnb_db' exists...
‚ÑπÔ∏è  Database 'airbnb_db' already exists

‚úÖ Step 4 Complete: Database ready

‚ÑπÔ∏è  Database 'airbnb_db' already exists

‚úÖ Step 4 Complete: Database ready


---
##  Create Database Schema (15 Tables)

This executes the SQL script to create all normalized tables with proper relationships.

In [5]:
def create_schema():
    """
    Execute SQL schema file to create database tables.
    
    Returns
    -------
    bool
        True if successful, False otherwise
    """
    try:
        # Check if schema file exists
        schema_path = Path(SCHEMA_FILE)
        if not schema_path.exists():
            print(f"‚ùå Schema file not found: {SCHEMA_FILE}")
            return False
        
        # Connect to airbnb_db
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        
        print(f"üîÑ Creating database schema from '{SCHEMA_FILE}'...")
        
        # Read and execute schema file
        with open(schema_path, 'r', encoding='utf-8') as f:
            schema_sql = f.read()
        
        cursor.execute(schema_sql)
        conn.commit()
        
        # Verify tables created
        cursor.execute("""
            SELECT table_name 
            FROM information_schema.tables 
            WHERE table_schema = 'public'
            ORDER BY table_name
        """)
        tables = cursor.fetchall()
        
        print(f"\n‚úÖ Schema created successfully")
        print(f"\nüìä Tables created ({len(tables)} total):")
        for i, (table_name,) in enumerate(tables, 1):
            print(f"   {i:2d}. {table_name}")
        
        cursor.close()
        conn.close()
        return True
        
    except psycopg2.Error as e:
        print(f"‚ùå Database error: {e}")
        return False
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return False

# Execute schema creation
if create_schema():
    print("\n‚úÖ Step 5 Complete: Database structure ready")
else:
    print("\n‚ùå Step 5 Failed: Could not create schema")

üîÑ Creating database schema from 'database_schema.sql'...

‚úÖ Schema created successfully

üìä Tables created (13 total):
    1. amenities
    2. amenity_groups
    3. hosts
    4. listing_amenities
    5. listing_arrangement_details
    6. listing_cancellation_policies
    7. listing_category_ratings
    8. listing_description_sections
    9. listing_highlights
   10. listing_house_rules
   11. listing_location_details
   12. listing_reviews
   13. listings

‚úÖ Step 5 Complete: Database structure ready

‚úÖ Schema created successfully

üìä Tables created (13 total):
    1. amenities
    2. amenity_groups
    3. hosts
    4. listing_amenities
    5. listing_arrangement_details
    6. listing_cancellation_policies
    7. listing_category_ratings
    8. listing_description_sections
    9. listing_highlights
   10. listing_house_rules
   11. listing_location_details
   12. listing_reviews
   13. listings

‚úÖ Step 5 Complete: Database structure ready


---
## Load and Preview JSON Data

Let's load the JSON file and preview its structure before inserting into the database.

In [6]:
def load_json_data(json_file: str) -> List[Dict[str, Any]]:
    """
    Load Airbnb listings from JSON file.
    
    Parameters
    ----------
    json_file : str
        Path to JSON file
    
    Returns
    -------
    list of dict
        List of listing dictionaries
    """
    try:
        json_path = Path(json_file)
        if not json_path.exists():
            print(f"‚ùå JSON file not found: {json_file}")
            return []
        
        print(f"üîÑ Loading data from '{json_file}'...")
        
        with open(json_path, 'r', encoding='utf-8') as f:
            listings = json.load(f)
        
        print(f"‚úÖ Loaded {len(listings)} listings")
        return listings
        
    except Exception as e:
        print(f"‚ùå Error loading JSON: {e}")
        return []

# Load data
listings = load_json_data(JSON_FILE)

# Preview first listing
if listings:
    print("\nüìã Preview of first listing:")
    first_listing = listings[0]
    print(f"   Name: {first_listing.get('name', 'N/A')}")
    print(f"   Price: ${first_listing.get('price', 'N/A')}")
    print(f"   Location: {first_listing.get('location', 'N/A')}")
    print(f"   Guests: {first_listing.get('guests', 'N/A')}")
    print(f"   Rating: {first_listing.get('ratings', 'N/A')}")
    print(f"   Reviews: {len(first_listing.get('reviews', []))}")
    print(f"   Amenities: {len(first_listing.get('amenities', []))} groups")
    
    print("\n‚úÖ Step 6 Complete: JSON data loaded")
else:
    print("\n‚ùå Step 6 Failed: Could not load JSON data")

üîÑ Loading data from 'Resources/airbnb_beltline_calgary_listings_100.json'...
‚úÖ Loaded 100 listings

üìã Preview of first listing:
   Name: Condo in Calgary ¬∑ ‚òÖ5.0 ¬∑ 1 bedroom ¬∑ 2 beds ¬∑ 1 bath
   Price: $181.5
   Location: Calgary, Alberta, Canada
   Guests: 4
   Rating: 5
   Reviews: 3
   Amenities: 13 groups

‚úÖ Step 6 Complete: JSON data loaded


---
## Load Data Using ETL Script

Now we'll use the existing ETL script to load all the data into the database with proper normalization.

In [None]:
# Reload the ETL module to get the latest fixes
import importlib
import etl_airbnb_normalized_postgres
importlib.reload(etl_airbnb_normalized_postgres)
from etl_airbnb_normalized_postgres import AirbnbETL

print("‚úÖ ETL module reloaded with latest fixes")

‚úÖ ETL module reloaded with latest fixes


In [None]:
# Import the ETL class from the existing script
from etl_airbnb_normalized_postgres import AirbnbETL
import logging

# Create a string buffer to capture error logs
import io
error_log_stream = io.StringIO()
error_handler = logging.StreamHandler(error_log_stream)
error_handler.setLevel(logging.ERROR)
error_handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
logging.getLogger('__main__').addHandler(error_handler)

print("üîÑ Starting ETL process...")
print("   This will insert data into all 15 tables with proper relationships.")
print("   This may take a minute...\n")

try:
    # Initialize ETL
    etl = AirbnbETL(DB_CONFIG)
    
    # Connect to database
    etl.connect()
    
    # Process each listing
    success_count = 0
    failed_count = 0
    
    for idx, listing in enumerate(listings, 1):
        if idx % 10 == 0 or idx == 1:
            print(f"   Processing listing {idx}/{len(listings)}...")
        
        if etl.process_listing(listing):
            success_count += 1
        else:
            failed_count += 1
    
    # No need to commit here - each listing commits individually
    print(f"\n‚úÖ ETL Complete!")
    print(f"   ‚úì Successfully processed: {success_count} listings")
    if failed_count > 0:
        print(f"   ‚ö†Ô∏è  Failed: {failed_count} listings")
    
    # Disconnect
    etl.disconnect()
    
    print("\n‚úÖ Step 7 Complete: All data loaded into database")
    
    # Show first 5 unique error messages
    error_logs = error_log_stream.getvalue()
    if error_logs:
        print("\nüìã Sample Error Messages (first 5 unique errors):")
        print("=" * 80)
        unique_errors = list(dict.fromkeys(error_logs.strip().split('\n')))[:5]
        for err in unique_errors:
            print(f"  ‚Ä¢ {err}")
    
except Exception as e:
    print(f"\n‚ùå ETL Error: {e}")
    print("\nTroubleshooting:")
    print("  1. Check that PostgreSQL is running")
    print("  2. Verify your database password in Step 3")
    print("  3. Ensure database_normalized_schema.sql exists in the project folder")

2025-11-10 20:40:59,192 - INFO - Successfully connected to database


üîÑ Starting ETL process...
   This will insert data into all 15 tables with proper relationships.
   This may take a minute...

   Processing listing 1/100...
   Processing listing 10/100...
   Processing listing 10/100...
   Processing listing 20/100...
   Processing listing 30/100...
   Processing listing 20/100...
   Processing listing 30/100...
   Processing listing 40/100...
   Processing listing 50/100...
   Processing listing 40/100...
   Processing listing 50/100...
   Processing listing 60/100...
   Processing listing 70/100...
   Processing listing 60/100...
   Processing listing 70/100...
   Processing listing 80/100...
   Processing listing 90/100...
   Processing listing 80/100...
   Processing listing 90/100...


2025-11-10 20:41:01,025 - INFO - Database connection closed


   Processing listing 100/100...

‚úÖ ETL Complete!
   ‚úì Successfully processed: 100 listings

‚úÖ Step 7 Complete: All data loaded into database


In [None]:
cur = psycopg2.connect(**DB_CONFIG).cursor()
cur.execute("SELECT listing_id, property_id, name FROM listings ORDER BY listing_id")
successful_listings = cur.fetchall()
cur.close()

print(f"\nüìä Successfully inserted listings ({len(successful_listings)} total):")
print("="  * 80)
for lid, prop_id, name in successful_listings[:10]:  # Show first 10
    print(f"  {lid}. {prop_id}: {name[:60]}...")
if len(successful_listings) > 10:
    print(f"  ... and {len(successful_listings) - 10} more")

# Check which listing indices from our JSON were successful
successful_prop_ids = {prop_id for _, prop_id, _ in successful_listings}
failed_indices = []
for idx, listing in enumerate(listings, 1):
    if listing.get('property_id') not in successful_prop_ids:
        failed_indices.append(idx)

print(f"\n‚ùå Failed listing indices: {failed_indices[:20]}...")  # Show first 20
print(f"\nüí° Total failed: {len(failed_indices)}")


üìä Successfully inserted listings (100 total):
  1. 1426378005713860735: Condo in Calgary ¬∑ ‚òÖ5.0 ¬∑ 1 bedroom ¬∑ 2 beds ¬∑ 1 bath...
  2. 779862525321826168: Rental unit in Calgary ¬∑ ‚òÖ4.85 ¬∑ 2 bedrooms ¬∑ 3 beds ¬∑ 1 bat...
  3. 1375556219860316591: Rental unit in Calgary ¬∑ ‚òÖ4.95 ¬∑ 2 bedrooms ¬∑ 2 beds ¬∑ 1 bat...
  4. 1404688484861443653: Condo in Calgary ¬∑ ‚òÖ4.94 ¬∑ 2 bedrooms ¬∑ 1 bed ¬∑ 1 bath...
  5. 21869477: Rental unit in Calgary ¬∑ ‚òÖ4.93 ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ 1 bath...
  6. 1334815708091669351: Rental unit in Calgary ¬∑ ‚òÖ4.73 ¬∑ 2 bedrooms ¬∑ 2 beds ¬∑ 1 bat...
  7. 1268959458721206253: Rental unit in Calgary ¬∑ ‚òÖ4.81 ¬∑ 1 bedroom ¬∑ 2 beds ¬∑ 1 bath...
  8. 1429387547766358716: Rental unit in Calgary ¬∑ ‚òÖ4.74 ¬∑ 2 bedrooms ¬∑ 2 beds ¬∑ 2 bat...
  9. 877853278978311875: Rental unit in Calgary ¬∑ ‚òÖ4.82 ¬∑ 2 bedrooms ¬∑ 1 bed ¬∑ 1 bath...
  10. 1249574984122191715: Rental unit in Calgary ¬∑ ‚òÖ4.87 ¬∑ 2 bedrooms ¬∑ 2 beds ¬∑ 2 bat...
  ... and 90 mor

---
## Validate the Data

Let's verify that data was loaded correctly by checking row counts in each table.

In [10]:
def validate_data():
    """
    Check row counts in all tables to validate data loading.
    
    Returns
    -------
    dict
        Dictionary with table names and row counts
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        
        # Get all table names
        cursor.execute("""
            SELECT table_name 
            FROM information_schema.tables 
            WHERE table_schema = 'public'
            ORDER BY table_name
        """)
        tables = [row[0] for row in cursor.fetchall()]
        
        print("üìä Database Statistics:\n")
        print(f"{'Table Name':<35} {'Row Count':>10}")
        print("-" * 47)
        
        table_counts = {}
        total_rows = 0
        
        for table in tables:
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            count = cursor.fetchone()[0]
            table_counts[table] = count
            total_rows += count
            print(f"{table:<35} {count:>10,}")
        
        print("-" * 47)
        print(f"{'TOTAL':<35} {total_rows:>10,}")
        
        cursor.close()
        conn.close()
        
        return table_counts
        
    except Exception as e:
        print(f"‚ùå Validation error: {e}")
        return {}

# Run validation
table_counts = validate_data()

if table_counts:
    print("\n‚úÖ Step 8 Complete: Data validation successful")
else:
    print("\n‚ùå Step 8 Failed: Could not validate data")

üìä Database Statistics:

Table Name                           Row Count
-----------------------------------------------
amenities                                  383
amenity_groups                              14
hosts                                       65
listing_amenities                        4,834
listing_arrangement_details                158
listing_cancellation_policies              141
listing_category_ratings                   558
listing_description_sections               412
listing_highlights                         291
listing_house_rules                        300
listing_location_details                    80
listing_reviews                          1,932
listings                                   100
-----------------------------------------------
TOTAL                                    9,268

‚úÖ Step 8 Complete: Data validation successful


---
## Some Sample Queries

### Query 1: Top 10 Highest-Rated Listings

In [11]:
def run_query(query_name: str, query: str):
    """
    Execute a SQL query and display results.
    
    Parameters
    ----------
    query_name : str
        Name of the query for display
    query : str
        SQL query to execute
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        
        print(f"\n{'='*80}")
        print(f"üìä {query_name}")
        print(f"{'='*80}\n")
        
        cursor.execute(query)
        results = cursor.fetchall()
        
        if results:
            # Get column names
            colnames = [desc[0] for desc in cursor.description]
            
            # Print header
            header = " | ".join(f"{col[:15]:<15}" for col in colnames)
            print(header)
            print("-" * len(header))
            
            # Print rows (limit to 10)
            for row in results[:10]:
                row_str = " | ".join(f"{str(val)[:15]:<15}" for val in row)
                print(row_str)
            
            if len(results) > 10:
                print(f"\n... ({len(results) - 10} more rows)")
        else:
            print("No results found.")
        
        cursor.close()
        conn.close()
        
    except Exception as e:
        print(f"‚ùå Query error: {e}")

# Query 1: Top-rated listings
query1 = """
SELECT 
    name,
    price,
    rating,
    guests,
    number_of_reviews
FROM listings
WHERE rating IS NOT NULL
ORDER BY rating DESC, number_of_reviews DESC
LIMIT 10;
"""

run_query("Top 10 Highest-Rated Listings", query1)


üìä Top 10 Highest-Rated Listings

name            | price           | rating          | guests          | number_of_revie
---------------------------------------------------------------------------------------
Condo in Calgar | 160.00          | 5.00            | 4               | 55             
Rental unit in  | 153.97          | 5.00            | 2               | 39             
Rental unit in  | 124.96          | 5.00            | 2               | 29             
Condo in Calgar | 183.37          | 5.00            | 4               | 20             
Rental unit in  | 145.50          | 5.00            | 2               | 19             
Condo in Calgar | 154.90          | 5.00            | 4               | 15             
Condo in Calgar | 292.33          | 5.00            | 4               | 15             
Condo in Calgar | 265.33          | 5.00            | 4               | 10             
Rental unit in  | 179.00          | 5.00            | 2               | 7          

### Query 2: Average Price by Number of Bedrooms

In [12]:
query2 = """
SELECT 
    bedrooms,
    COUNT(*) as listing_count,
    ROUND(AVG(price), 2) as avg_price,
    ROUND(MIN(price), 2) as min_price,
    ROUND(MAX(price), 2) as max_price
FROM listings
WHERE bedrooms IS NOT NULL AND price IS NOT NULL
GROUP BY bedrooms
ORDER BY bedrooms;
"""

run_query("Average Price by Number of Bedrooms", query2)


üìä Average Price by Number of Bedrooms

bedrooms        | listing_count   | avg_price       | min_price       | max_price      
---------------------------------------------------------------------------------------
1               | 36              | 171.63          | 115.67          | 292.33         
2               | 41              | 199.65          | 140.94          | 345.00         


### Query 3: Most Common Amenities

In [13]:
query3 = """
SELECT 
    ag.group_name,
    a.amenity_name,
    COUNT(DISTINCT la.listing_id) as listing_count,
    ROUND(COUNT(DISTINCT la.listing_id) * 100.0 / 
          (SELECT COUNT(*) FROM listings), 1) as percentage
FROM amenity_groups ag
JOIN amenities a ON ag.group_id = a.group_id
JOIN listing_amenities la ON a.amenity_id = la.amenity_id
GROUP BY ag.group_name, a.amenity_name
ORDER BY listing_count DESC;
"""

run_query("Most Common Amenities (Top 15)", query3)



üìä Most Common Amenities (Top 15)

group_name      | amenity_name    | listing_count   | percentage     
---------------------------------------------------------------------
Kitchen and din | Kitchen         | 99              | 99.0           
Home safety     | Smoke alarm     | 97              | 97.0           
Bathroom        | Hair dryer      | 93              | 93.0           
Bedroom and lau | Iron            | 92              | 92.0           
Kitchen and din | Cooking basics  | 91              | 91.0           
Internet and of | Wifi            | 90              | 90.0           
Bathroom        | Hot water       | 90              | 90.0           
Kitchen and din | Microwave       | 89              | 89.0           
Home safety     | Carbon monoxide | 89              | 89.0           
Bedroom and lau | Hangers         | 89              | 89.0           

... (309 more rows)


### Query 4: Superhost Analysis

In [14]:
query4 = """
SELECT 
    h.is_superhost,
    COUNT(DISTINCT h.host_id) as host_count,
    COUNT(l.listing_id) as total_listings,
    ROUND(AVG(l.rating), 2) as avg_rating,
    ROUND(AVG(l.price), 2) as avg_price,
    ROUND(AVG(h.response_rate), 1) as avg_response_rate
FROM hosts h
LEFT JOIN listings l ON h.host_id = l.host_id
GROUP BY h.is_superhost
ORDER BY h.is_superhost DESC;
"""

run_query("Superhost vs Regular Host Comparison", query4)


üìä Superhost vs Regular Host Comparison

is_superhost    | host_count      | total_listings  | avg_rating      | avg_price       | avg_response_ra
---------------------------------------------------------------------------------------------------------
True            | 43              | 71              | 4.89            | 186.36          | 99.7           
False           | 22              | 29              | 4.11            | 185.90          | 98.9           
is_superhost    | host_count      | total_listings  | avg_rating      | avg_price       | avg_response_ra
---------------------------------------------------------------------------------------------------------
True            | 43              | 71              | 4.89            | 186.36          | 99.7           
False           | 22              | 29              | 4.11            | 185.90          | 98.9           


### Query 5: Category Ratings Breakdown

In [15]:
query5 = """
SELECT 
    category_name,
    ROUND(AVG(rating_value), 2) as avg_rating,
    ROUND(MIN(rating_value), 2) as min_rating,
    ROUND(MAX(rating_value), 2) as max_rating,
    COUNT(DISTINCT listing_id) as listing_count
FROM listing_category_ratings
GROUP BY category_name
ORDER BY avg_rating DESC;
"""

run_query("Category Ratings Analysis", query5)

print("\n‚úÖ Step 9 Complete: Sample queries executed successfully")


üìä Category Ratings Analysis

category_name   | avg_rating      | min_rating      | max_rating      | listing_count  
---------------------------------------------------------------------------------------
Communication   | 4.91            | 4.00            | 5.00            | 93             
Location        | 4.90            | 4.00            | 5.00            | 93             
Accuracy        | 4.89            | 4.40            | 5.00            | 93             
Cleanliness     | 4.84            | 4.00            | 5.00            | 93             
Check-in        | 4.83            | 4.30            | 5.00            | 93             
Value           | 4.80            | 4.20            | 5.00            | 93             

‚úÖ Step 9 Complete: Sample queries executed successfully
category_name   | avg_rating      | min_rating      | max_rating      | listing_count  
---------------------------------------------------------------------------------------
Communication   | 4.91      