In [6]:
%load_ext autoreload
%autoreload 2
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ETL Pipeline

### Run the Extraction and Transformation (save to CSV file)

In [7]:
# Run the extraction + transformation step.
# The entry point is imported under a distinct alias: several cells in this
# notebook import a function called `main` from different modules, and a bare
# `main` in the kernel namespace would silently refer to whichever cell ran last.
from scripts.run_etl import main as run_etl

run_etl()
# NOTE(review): short pause, presumably so pending log output is rendered under
# this cell before the next one starts — confirm it is still needed.
time.sleep(0.1)

2025-05-09 11:55:24 - root - INFO - Starting Museum Data ETL Process
2025-05-09 11:55:24 - root - INFO - Fetching Wikipedia page: List_of_most_visited_museums
2025-05-09 11:55:24 - root - INFO - Requesting HTML for page: List_of_most_visited_museums from https://en.wikipedia.org/w/api.php
2025-05-09 11:55:24 - root - INFO - Successfully retrieved HTML for List_of_most_visited_museums
2025-05-09 11:55:24 - root - INFO - Extracting museum data table from HTML...
2025-05-09 11:55:24 - root - INFO - Attempting table extraction matching pattern: 'Visitors in 2024'
2025-05-09 11:55:24 - root - INFO - Found table using match='Visitors in 2024'.
2025-05-09 11:55:24 - root - INFO - Processing extracted table with shape: (81, 4)
2025-05-09 11:55:24 - root - INFO - Successfully extracted table. Shape: (81, 4)
2025-05-09 11:55:24 - root - INFO - Cleaning and filtering museum data...
2025-05-09 11:55:24 - root - INFO - Using column 'visitors_in_2024' for visitor data.
2025-05-09 11:55:24 - root - I

### Load the Data into a Database

In [8]:
# Load the CSV produced by the ETL step into the database.
# Imported under a distinct alias so it cannot be confused with the other
# `main` entry points that this notebook imports from sibling scripts.
from scripts.load_data_from_csv_to_db import main as load_csv_to_db

load_csv_to_db()
# NOTE(review): short pause, presumably so pending log output is rendered under
# this cell before the next one starts — confirm it is still needed.
time.sleep(0.1)

2025-05-09 11:55:26 - root - INFO - Initializing database at sqlite:////Users/dev/visitum/data/visitum.db
2025-05-09 11:55:26 - root - INFO - Database tables checked/created by init_db.
2025-05-09 11:55:26 - root - INFO - Loading data from /Users/dev/visitum/data/enriched_museum_data.csv...
2025-05-09 11:55:26 - root - INFO - Successfully committed data: 0 new cities, 0 new museums.
2025-05-09 11:55:26 - root - INFO - Database session closed.


### Get the Data from the Database (just to check)

In [9]:
from db.queries import get_museums_with_city_population
from db.database import get_db

# Maximum number of fetched records echoed to the cell output.
PREVIEW_LIMIT = 50

# Sanity check: read the museum/city rows back from the database and print them.
with get_db() as db:
    print("Fetching museum data...")
    museums = get_museums_with_city_population(db)

    if museums:
        print(f"Successfully fetched {len(museums)} records.")
        # Echo at most PREVIEW_LIMIT records so the output stays bounded.
        for record in museums[:PREVIEW_LIMIT]:
            print(record)
    else:
        print("No museum data found or an error occurred.")

# Reported only after the `with` block has exited, so the message is accurate:
# inside the block the context manager's exit logic has not run yet.
print("Database session closed.")


2025-05-09 11:55:26 - root - INFO - Fetched 53 museums with city population data.


Fetching museum data...
Successfully fetched 53 records.
('Louvre', 8700000, 'Paris', 2138551)
('Vatican Museums', 6825436, 'Vatican City, Rome', 2318895)
('National Museum of China', 6300000, 'Beijing', 18960744)
('British Museum', 6479952, 'London', 8961989)
('Natural History Museum, South Kensington', 6301972, 'London', 8961989)
('Metropolitan Museum of Art', 5727258, 'New York City', 8804190)
('American Museum of Natural History', 5400000, 'New York City', 8804190)
('Galleria degli Uffizi', 2908828, 'Florence', 367150)
("Musée d'Orsay", 3751000, 'Paris', 2138551)
('Tate Modern', 4603025, 'London', 8961989)
('National Gallery of Art', 3936543, 'Washington, D.C.', 689545)
('State Hermitage Museum', 3563590, 'Saint Petersburg', 5351935)
('Prado Museum', 3457057, 'Madrid', 3255944)
('Victoria and Albert Museum', 3525700, 'London, South Kensington', 8961989)
('National Air and Space Museum[a]', 3100000, 'Washington, D.C.', 689545)
('National Gallery', 3203451, 'London', 8961989)
('Natio

## Model Training

In [10]:
# Run the model training pipeline.
# Imported under a distinct alias so it cannot be confused with the other
# `main` entry points that this notebook imports from sibling scripts.
from scripts.train_model import main as run_training

run_training()
# NOTE(review): short pause, presumably so pending log output is rendered under
# this cell before the next one starts — confirm it is still needed.
time.sleep(0.1)


2025-05-09 11:55:26 - root - INFO - Running training pipeline as standalone script...
2025-05-09 11:55:26 - root - INFO - Starting training pipeline...
2025-05-09 11:55:26 - root - INFO - Connecting to the database...
2025-05-09 11:55:26 - root - INFO - Fetching model features from the database...
2025-05-09 11:55:26 - root - INFO - Fetching features for model training (population vs visitors).
2025-05-09 11:55:26 - root - INFO - Successfully fetched 53 records for model training.
2025-05-09 11:55:26 - root - INFO - Fetched 53 records. Proceeding to model training.
2025-05-09 11:55:26 - root - INFO - Starting model training with 53 records.
2025-05-09 11:55:26 - root - INFO - Model R-squared score: -0.03; it's... CATASTROPHIC :(
2025-05-09 11:55:26 - root - INFO - Model training complete. Intercept: 2590829.84, Coefficient: 0.01
2025-05-09 11:55:26 - root - INFO - Model equation: visitors_count ≈ 0.01 * population + 2590829.84
2025-05-09 11:55:26 - root - INFO - Model training complete