### Data exploration

##### Required Python packages

In [None]:
import importlib
import subprocess
import sys

# Packages the notebook needs that are not part of the Jupyter installation,
# written as {importable module name: name to pass to pip}
special_required_packages = {
    "duckdb": "duckdb"
}

# Try to import each package; install it only when the import fails
for module_name, pip_name in special_required_packages.items():
    try:
        importlib.import_module(module_name)
        print(f"{module_name} already installed")
    except ImportError:
        print(f"Installing {pip_name}...")
        # sys.executable is the interpreter running this kernel, so pip installs
        # into the same environment the notebook imports from
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
        # The import system caches directory contents; refresh the caches so a
        # package installed after kernel start is found by the imports below
        importlib.invalidate_caches()

Installing duckdb...
Collecting duckdb
  Downloading duckdb-1.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.0 kB)
Downloading duckdb-1.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (21.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.3.2


In [17]:
import os
import sys
import json
import duckdb
import pandas as pd
from datetime import datetime

##### Set the input data and database paths, and verify the working directory and Python version

In [3]:
# Locations inside the container: the DuckDB warehouse file that is written
# to, and the ads-spend CSV that is read from
DB_PATH = "/home/jovyan/challenge/database/warehouse.db"
CSV_PATH = "/home/jovyan/challenge/data/ads_spend.csv"

# Show where the kernel is running and which interpreter it uses
print(f"Working directory: {os.getcwd()}")
print(f"Python version: {sys.version}")

Working directory: /home/jovyan/challenge
Python version: 3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:40:35) [GCC 12.3.0]


##### Explore first 5 rows within input data (ads_spend.csv)

In [None]:
# Preview the input file, provided it exists at CSV_PATH
if os.path.exists(CSV_PATH):

    # Load only the first 5 rows into pandas, just for inspection
    df_sample = pd.read_csv(CSV_PATH, nrows=5)
    display(df_sample)

    # Drop the preview frame here, inside the branch: df_sample is only
    # defined when the file was found, so deleting it unconditionally
    # would raise NameError on the "not found" path
    del df_sample

else:
    print("CSV file not found")

Unnamed: 0,date,platform,account,campaign,country,device,spend,clicks,impressions,conversions
0,2025-01-01,Meta,AcctA,Prospecting,MX,Desktop,1115.94,360,15840,29
1,2025-01-01,Google,AcctA,Brand_Search,CA,Mobile,789.43,566,22640,28
2,2025-01-01,Google,AcctA,Prospecting,BR,Desktop,381.4,133,10241,12
3,2025-01-01,Google,AcctC,Prospecting,US,Desktop,1268.34,891,49005,36
4,2025-01-01,Google,AcctA,Brand_Search,BR,Desktop,1229.7,628,21352,31


##### Overview and key features of the input data (ads_spend.csv)

In [None]:
# Summarise the whole input file, provided it exists at CSV_PATH
if os.path.exists(CSV_PATH):

    # Load the full data set and print its headline figures
    df_full = pd.read_csv(CSV_PATH)
    print(f"Dataset shape: {df_full.shape}")
    print(f"Columns: {list(df_full.columns)}")
    print(f"Spend total: ${df_full['spend'].sum():,.2f}")
    print(f"Date range: {df_full['date'].min()} a {df_full['date'].max()}")
    print(f"Platforms: {df_full['platform'].unique()}")
    print(f"Unique accounts: {df_full['account'].unique()}")

    # List the dtype pandas inferred for every column
    for col, dtype in df_full.dtypes.items():
        print(f"  {col}: {dtype}")

    # Drop the frame here, inside the branch: df_full is only defined when
    # the file was found, so deleting it unconditionally would raise
    # NameError on the "not found" path
    del df_full

else:
    print("CSV file not found")

dataset shape: (2000, 10)
Columns: ['date', 'platform', 'account', 'campaign', 'country', 'device', 'spend', 'clicks', 'impressions', 'conversions']
Spend total: $1,690,764.32
Date range: 2025-01-01 a 2025-06-30
Platforms: ['Meta' 'Google']
Unique accounts: ['AcctA' 'AcctC' 'AcctB']
  date: object
  platform: object
  account: object
  campaign: object
  country: object
  device: object
  spend: float64
  clicks: int64
  impressions: int64
  conversions: int64


### Data ingestion process

##### Function to ingest data into the database

In [15]:
def ingest_data():
    """Append the CSV at CSV_PATH to the ``ads_spend_db`` table in DuckDB.

    The CSV is validated (file exists, is not empty, contains every required
    column, and has no null or blank values in those columns), two metadata
    columns are added (``load_date`` and ``source_file_name``), and the rows
    are appended to the table stored in the database at DB_PATH. The table is
    created first if it does not exist yet. Rows are always appended, so
    calling the function again adds the same rows a second time.

    Returns:
        dict: JSON-serialisable summary. On success it holds ``status``
        ("success"), ``timestamp``, ``rows_inserted``, ``total_rows_before``,
        ``total_rows_after``, ``source_file`` and ``message``. On any failure
        it holds ``status`` ("error"), ``timestamp``, ``error_message`` and
        ``error_type``.

    Raises:
        Nothing: every exception is caught, printed, and reported through the
        returned error dict.
    """
    # Stays None until the connection is opened, so the finally clause knows
    # whether there is anything to close
    conn = None

    try:

        # Verify the csv file again
        if not os.path.exists(CSV_PATH):
            raise FileNotFoundError(f"{CSV_PATH} csv not found")

        # Read the CSV into a dataframe
        input_data_df = pd.read_csv(CSV_PATH)
        print(f"csv loaded: {input_data_df.shape[0]:,} rows, {input_data_df.shape[1]} columns")

        # Reject a CSV without data rows
        if input_data_df.empty:
            raise ValueError("csv is empty")

        # Columns the target table expects from the CSV, in table order
        required_columns = ['date', 'platform', 'account', 'campaign',
                            'country', 'device', 'spend', 'clicks',
                            'impressions', 'conversions']

        # Check for missing columns
        missing_cols = [col for col in required_columns if col not in input_data_df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")

        # Collect the required columns holding a null or a whitespace-only string
        null_empty_columns = [
            col for col in required_columns
            if input_data_df[col].isnull().any()
            or input_data_df[col].apply(lambda x: isinstance(x, str) and x.strip() == "").any()
        ]

        if null_empty_columns:
            # Report the list computed above (the previous code referenced an
            # undefined name here, which hid this error behind a NameError)
            raise ValueError(f"Columnas con valores nulos o vacíos: {null_empty_columns}")

        # Add metadata: load timestamp and the name of the file that was read
        source_file_name = os.path.basename(CSV_PATH)
        input_data_df['load_date'] = datetime.now()
        input_data_df['source_file_name'] = source_file_name

        print(f"load_date: {input_data_df['load_date'].iloc[0]}")
        print(f"source_file_name: {input_data_df['source_file_name'].iloc[0]}")

        # Connect to DuckDB
        conn = duckdb.connect(DB_PATH)

        create_table_sql = """
        CREATE TABLE IF NOT EXISTS ads_spend_db (
            date DATE,
            platform VARCHAR,
            account VARCHAR,
            campaign VARCHAR,
            country VARCHAR,
            device VARCHAR,
            spend DECIMAL(12,2),
            clicks INTEGER,
            impressions INTEGER,
            conversions INTEGER,
            -- Metadata challenge required
            load_date TIMESTAMP,
            source_file_name VARCHAR
        );
        """
        # Create the table on the first run; later runs leave it untouched
        conn.execute(create_table_sql)
        print("Table ads_spend_db verified/created")

        # Row count before the insert
        count_before = conn.execute("SELECT COUNT(*) FROM ads_spend_db").fetchone()[0]

        # Insert in append mode to demonstrate persistence. Columns are named
        # explicitly on both sides so extra or reordered CSV columns cannot
        # break or mis-map the insert; the names come from the fixed lists
        # above, never from the file contents
        insert_columns = required_columns + ['load_date', 'source_file_name']
        column_list = ", ".join(f'"{col}"' for col in insert_columns)
        conn.register('df_new', input_data_df)
        conn.execute(
            f"INSERT INTO ads_spend_db ({column_list}) "
            f"SELECT {column_list} FROM df_new"
        )

        # Row count after the insert
        count_after = conn.execute("SELECT COUNT(*) FROM ads_spend_db").fetchone()[0]

        # Result for n8n in JSON format
        result = {
            "status": "success",
            "timestamp": datetime.now().isoformat(),
            "rows_inserted": len(input_data_df),
            "total_rows_before": count_before,
            "total_rows_after": count_after,
            "source_file": source_file_name,
            "message": f"Successfully ingested {len(input_data_df):,} rows into warehouse"
        }

        print(f"Register added: {len(input_data_df):,}")
        print(f"Total regiser in DB now: {count_after:,}")
        print(f"Increment: +{count_after - count_before:,}")

        return result

    # Any failure is reported as a result dict instead of being propagated
    except Exception as error:
        error_result = {
            "status": "error",
            "timestamp": datetime.now().isoformat(),
            "error_message": str(error),
            "error_type": type(error).__name__
        }
        print(f"ERROR: {error}")
        return error_result

    finally:
        # Close the connection on success and on failure alike
        if conn is not None:
            conn.close()

##### Execute data ingestion

In [23]:
# Run the ingestion and keep its summary dict
print("Data Ingest in Execution")
result = ingest_data()

# Emit the summary as indented JSON so n8n can consume it
print("\nFinal Result: ", json.dumps(result, indent=2), sep="\n")

Data Ingest in Execution
csv loaded: 2,000 rows, 10 columns
load_date: 2025-08-28 00:47:37.086398
source_file_name: ads_spend.csv
Table ads_spend_db verified/created
Register added: 2,000
Total regiser in DB now: 4,000
Increment: +2,000

Final Result: 
{
  "status": "success",
  "timestamp": "2025-08-28T00:47:37.316933",
  "rows_inserted": 2000,
  "total_rows_before": 2000,
  "total_rows_after": 4000,
  "source_file": "ads_spend.csv",
  "message": "Successfully ingested 2,000 rows into warehouse"
}


### Verify data persistence