In [1]:
import json
import os

import psycopg2
import requests
from psycopg2.extras import execute_values



In [2]:
# 1. Fetch data from Dati Lombardia API
def fetch_data_from_api(api_url, timeout=30):
    """Download a JSON document from ``api_url`` and return it decoded.

    Args:
        api_url: URL to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled server
            would block the notebook indefinitely.

    Returns:
        The response body as parsed by ``response.json()``.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(api_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")
    return response.json()

In [17]:
# 2. Process API data and filter out problematic columns
def process_api_data(data_list):
    """Return a cleaned copy of the API records.

    For every record in ``data_list`` a new dict is built that:

    * leaves out the keys listed in ``excluded_columns``;
    * replaces dict values with their JSON string representation;
    * keeps every other value unchanged.

    The input records are not modified.
    """
    # Columns that must not be carried over into the processed records
    excluded_columns = (":@computed_region_6hky_swhk",)

    def _serialise(raw_value):
        # Nested dictionaries are stored as JSON text; anything else passes through
        return json.dumps(raw_value) if isinstance(raw_value, dict) else raw_value

    return [
        {
            field: _serialise(raw_value)
            for field, raw_value in record.items()
            if field not in excluded_columns
        }
        for record in data_list
    ]

In [18]:
# 3. Connect to PostgreSQL database
def connect_to_postgres():
    """Open and return a psycopg2 connection to the PostgreSQL database.

    Each setting is read from an environment variable (``PGHOST``,
    ``PGDATABASE``, ``PGUSER``, ``PGPASSWORD``) so that credentials do not
    have to be edited into the notebook. When a variable is not set, the
    value that used to be hard-coded here is used instead, so existing local
    setups keep working unchanged.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    return psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "lombardia_air_quality"),
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE(review): the literal fallback keeps the old behaviour for local
        # runs; set PGPASSWORD and drop this default before sharing the notebook.
        password=os.environ.get("PGPASSWORD", "Milano"),
    )

In [19]:
# 4. (Re)create the table: any existing table with this name is dropped first
def create_table_if_not_exists(conn, table_name, data_sample):
    """(Re)create ``table_name`` with one column per key of ``data_sample``.

    NOTE: despite the function name, an existing table with the same name is
    dropped first (``DROP TABLE IF EXISTS``), so every call starts from an
    empty table.

    The column type is inferred from the sample value:

    * ``bool``  -> BOOLEAN
    * ``int``   -> INTEGER
    * ``float`` -> NUMERIC
    * ``dict``  -> JSONB
    * keys ``datastart`` / ``datastop`` (with any other value type) -> TIMESTAMP
    * anything else -> TEXT

    An ``id SERIAL PRIMARY KEY`` column and a ``created_at`` timestamp column
    are always added.

    Args:
        conn: Open database connection (provides ``cursor()`` and ``commit()``).
        table_name: Name of the table; interpolated into the SQL as-is, so it
            must come from trusted code, not from user input.
        data_sample: Mapping of column name -> representative value.
    """
    column_defs = ["id SERIAL PRIMARY KEY"]
    for key, value in data_sample.items():
        # bool must be tested before int: bool is a subclass of int, so the
        # int check would otherwise claim True/False and map them to INTEGER.
        if isinstance(value, bool):
            column_type = "BOOLEAN"
        elif isinstance(value, int):
            column_type = "INTEGER"
        elif isinstance(value, float):
            column_type = "NUMERIC"
        elif isinstance(value, dict):
            column_type = "JSONB"  # Use JSONB for nested structures
        elif key in ("datastart", "datastop"):
            # Special case for timestamp fields
            column_type = "TIMESTAMP"
        else:
            column_type = "TEXT"  # Default type

        # Double any embedded quote so the name stays a single quoted identifier
        quoted_name = str(key).replace('"', '""')
        column_defs.append(f'"{quoted_name}" {column_type}')
    column_defs.append("created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP")

    # Joining the complete list keeps the statement valid for an empty sample
    joined_defs = ",\n        ".join(column_defs)
    create_table_sql = f"""
    DROP TABLE IF EXISTS {table_name};
    CREATE TABLE {table_name} (
        {joined_defs}
    );
    """

    cursor = conn.cursor()
    try:
        cursor.execute(create_table_sql)
        conn.commit()
    finally:
        # Release the cursor even when the statement fails
        cursor.close()

In [20]:
# 5. Insert data into table
def insert_data(conn, table_name, data_list):
    """Bulk-insert ``data_list`` into ``table_name`` and commit.

    The column list is taken from the keys of the FIRST record only; for the
    other records a missing key becomes NULL and any key that the first
    record does not have is ignored.

    Args:
        conn: Open database connection (provides ``cursor()`` and ``commit()``).
        table_name: Target table; interpolated into the SQL as-is, so it must
            come from trusted code, not from user input.
        data_list: List of dicts, one per row. When empty, a message is
            printed and nothing is sent to the database.
    """
    # Check for the empty case before opening a cursor, so that the early
    # return does not leave an unclosed cursor behind.
    if not data_list:
        print("No data to insert")
        return

    # Get column names from the first data item
    columns = list(data_list[0].keys())

    # One row per record, values ordered like `columns`
    values = [[item.get(col) for col in columns] for item in data_list]

    # Quote every column name, doubling embedded quotes so each name stays a
    # single identifier
    quoted_columns = ", ".join(
        '"' + str(col).replace('"', '""') + '"' for col in columns
    )
    insert_query = f"""
    INSERT INTO {table_name} ({quoted_columns})
    VALUES %s
    """

    cursor = conn.cursor()
    try:
        # execute_values expands the single %s into all rows
        execute_values(cursor, insert_query, values)
        conn.commit()
    finally:
        # Release the cursor even when the insert fails
        cursor.close()

    print(f"Inserted {len(data_list)} records into {table_name}")

In [None]:

# API URL
api_url = "https://www.dati.lombardia.it/resource/ib47-atvt.json"

# Define the table name for your data
table_name = "sensors"

# Set only once the connection is open, so `finally` knows whether to close it
conn = None

try:
    # Fetch data from API
    print("Fetching data from API...")
    raw_data = fetch_data_from_api(api_url)

    # Debug: Inspect the data structure
    print("Sample data item structure:")
    if raw_data:
        print(json.dumps(raw_data[0], indent=2))

    # Process the data to handle nested structures and filter out problematic columns
    print("Processing data...")
    processed_data = process_api_data(raw_data)

    # Records do not necessarily share the same keys (the schema code expects a
    # "datastop" field that the first record may lack -- presumably the API
    # leaves out empty fields). Both the table schema and the INSERT column
    # list are derived from the first record, so give every record the union
    # of all keys (missing ones become None) to avoid silently dropping fields.
    all_columns = list(
        dict.fromkeys(column for record in processed_data for column in record)
    )
    processed_data = [
        {column: record.get(column) for column in all_columns}
        for record in processed_data
    ]

    # Connect to PostgreSQL
    print("Connecting to PostgreSQL...")
    conn = connect_to_postgres()

    # (Re)create the table, using actual data to infer the schema
    print("Creating table if it doesn't exist...")
    if processed_data:
        create_table_if_not_exists(conn, table_name, processed_data[0])

    # Insert data into table
    print("Inserting data into table...")
    insert_data(conn, table_name, processed_data)

    print("Process completed successfully!")

except Exception as e:
    print(f"Error: {str(e)}")

finally:
    # Close the connection whether or not the steps above succeeded
    if conn is not None:
        conn.close()


Fetching data from API...
Sample data item structure:
{
  "idsensore": "12691",
  "nometiposensore": "Arsenico",
  "unitamisura": "ng/m\u00b3",
  "idstazione": "560",
  "nomestazione": "Varese v.Copelli",
  "quota": "383",
  "provincia": "VA",
  "comune": "Varese",
  "storico": "N",
  "datastart": "2008-04-01T00:00:00.000",
  "utm_nord": "5073728",
  "utm_est": "486035",
  "lat": "45.81697450",
  "lng": "8.82024911",
  "location": {
    "type": "Point",
    "coordinates": [
      8.82024911,
      45.8169745
    ]
  },
  ":@computed_region_6hky_swhk": "1",
  ":@computed_region_ttgh_9sm5": "1",
  ":@computed_region_af5v_nc64": "3"
}
Processing data...
Connecting to PostgreSQL...
Creating table if it doesn't exist...
Inserting data into table...
Inserted 984 records into sensors
Process completed successfully!
