# Libraries

In [1]:
import pandas as pd
import numpy as np
import uuid
import random
from datetime import datetime, timedelta

# Constants

In [3]:
CITIES = [
    "Madrid",
    "Barcelona",
    "Seville",
    "Valencia",
    "Zaragoza",
    "Bilbao",
    "Malaga",
    "Granada",
    "Vigo",
    "Santander",
    "Alicante",
    "Murcia",
    "Gijón",
    "Valladolid",
    "La Coruña",
    "Córdoba",
    "Vitoria-Gasteiz",
    "Oviedo",
    "Elche",
    "Badalona",
    "Tarragona",
    "Pamplona",
    "Almería",
    "San Sebastián",
    "Burgos",
    "Salamanca",
    "Albacete",
    "Logroño",
    "Badajoz",
    "Huelva",
    "Lleida",
    "Torrejón de Ardoz",
    "Ourense",
    "Castellón de la Plana",
    "Algeciras",
    "Getafe",
    "Reus",
    "Toledo",
    "Leon",
    "Jaén",
    "Sabadell",
    "Mataró",
    "Alcorcón",
    "Fuenlabrada",
    "Leganés",
    "Santander",
    "Santa Cruz de Tenerife",
    "Las Palmas de Gran Canaria",
    "Guadalajara",
    "Cádiz",
    "Benidorm",
    "Palma de Mallorca",
    "San Fernando",
    "Torrevieja",
    "Pontevedra",
    "Ciudad Real",
    "Cuenca",
    "Cáceres",
    "Ferrol",
    "Marbella",
    "Girona",
    "Melilla",
    "Ceuta",
    "Telde",
    "Manresa",
    "Alcobendas",
    "San Sebastián de los Reyes",
    "Talavera de la Reina",
    "Santiago de Compostela",
    "Chiclana de la Frontera"
]

STREETS = [
    "Calle Alcalá",
    "Gran Vía",
    "Calle de Serrano",
    "Paseo de la Castellana",
    "Avenida Diagonal",
    "Carrer de Balmes",
    "Calle del Arenal",
    "Paseo del Prado",
    "Calle Mayor",
    "Calle de Velázquez",
    "Calle de Goya",
    "Ronda de Atocha",
    "Paseo de Recoletos",
    "Calle del Doctor Esquerdo",
    "Calle de O'Donnell",
    "Calle de Arturo Soria",
    "Avenida de América",
    "Calle de Bravo Murillo",
    "Paseo de Extremadura",
    "Calle de San Bernardo",
    "Carrer de València",
    "Carrer d'Aragó",
    "Carrer de Muntaner",
    "Carrer de Casanova",
    "Carrer del Consell de Cent",
    "Calle de Alfonso XII",
    "Calle de Ponzano",
    "Calle del General Díaz Porlier",
    "Calle de Ferraz",
    "Calle de Ibiza",
    "Calle de Jorge Juan",
    "Calle de Raimundo Fernández Villaverde",
    "Paseo de Juan XXIII",
    "Calle de Embajadores",
    "Calle de Narváez",
    "Calle de Orense",
    "Calle de Alcalde Sáinz de Baranda",
    "Calle de Vallehermoso",
    "Avenida de Valladolid",
    "Calle de Cartagena",
    "Calle del Marqués de Urquijo",
    "Avenida de la Albufera",
    "Paseo de Santa María de la Cabeza",
    "Calle de Príncipe de Vergara",
    "Calle del Conde de Peñalver",
    "Calle de Francisco Silvela",
    "Calle de José Abascal",
    "Calle de López de Hoyos",
    "Paseo de la Habana",
    "Calle de la Princesa",
    "Calle del Conde de Casal",
    "Calle de Montera",
    "Calle de Fuencarral",
    "Calle de Lavapiés",
    "Calle de Embajadores",
    "Calle de Hortaleza",
    "Calle de Argumosa",
    "Calle de Toledo",
    "Calle del Barquillo",
    "Calle de Santa Engracia",
    "Calle del Espíritu Santo"
]

PRODUCTS = {
    "fertilizers": [
        ("fert_001", "Nitrogen Boost 20-10-10"),
        ("fert_002", "Organic Compost Mix"),
        ("fert_003", "Phosphorus Growth Enhancer"),
        ("fert_004", "Liquid Seaweed Fertilizer"),
        ("fert_005", "Potash Enriched Formula")
    ],
    "seeds": [
        ("seed_001", "Sunflower Hybrid Seeds"),
        ("seed_002", "Organic Tomato Seeds"),
        ("seed_003", "Drought-Resistant Corn Seeds"),
        ("seed_004", "Wildflower Meadow Mix"),
        ("seed_005", "High-Yield Soybean Seeds")
    ],
    "pesticides": [
        ("pest_001", "Organic Neem Oil Spray"),
        ("pest_002", "Aphid and Mite Killer"),
        ("pest_003", "Fungicide for Fruit Trees"),
        ("pest_004", "Insect Barrier Dust"),
        ("pest_005", "Natural Slug Repellent")
    ],
    "equipment": [
        ("equip_001", "Heavy-Duty Tiller"),
        ("equip_002", "Precision Seed Spreader"),
        ("equip_003", "Battery-Powered Sprayer"),
        ("equip_004", "Compact Greenhouse Kit"),
        ("equip_005", "Soil pH Testing Meter")
    ],
    "supplies": [
        ("supply_001", "Garden Gloves Set"),
        ("supply_002", "Drip Irrigation Kit"),
        ("supply_003", "Plant Support Stakes"),
        ("supply_004", "Harvest Baskets"),
        ("supply_005", "Protective Row Covers")
    ],
    "soil": [
        ("soil_001", "Premium Potting Mix"),
        ("soil_002", "Cactus and Succulent Soil"),
        ("soil_003", "Raised Bed Soil Blend"),
        ("soil_004", "Organic Garden Soil"),
        ("soil_005", "Mulch and Soil Conditioner")
    ]
}


# Online Sales Dataset

In [4]:
def null_values(df: pd.DataFrame, nulls_dist: dict, n_rows: int):
    df = df.copy()
    n_rows = len(df)

    for col, null_ratio in nulls_dist.items():
        if col not in df.columns or null_ratio == 0:
            continue
        
        n_nulls = int(null_ratio * n_rows)
        if n_nulls == 0:
            continue

        null_indices = np.random.choice(df.index, size=n_nulls, replace=False)
        df.loc[null_indices, col] = np.nan

    return df

def generate_random_locations_spain(n_rows):
    lats = np.random.uniform(36.0, 43.8, n_rows)
    lons = np.random.uniform(-9.3, 3.3, n_rows)
    locations = np.column_stack((np.round(lats, 6), np.round(lons, 6)))
    return locations

def generate_address():
    street = random.choice(STREETS)
    number = random.randint(1, 200)
    
    if random.random() < 0.75:
        postal_code = random.randint(10000, 52999)
        return f"{street}, {number}, {postal_code}"
    else:
        return f"{street}, {number}"
    
def generate_city():
    city = random.choice(CITIES)
    return modify_capitalization(city)
    
def modify_capitalization(word):
    num = random.random()
    if num < 0.35:
        return word.upper()
    elif num < 0.7:
        return word.lower()
    else:
        return word
    
def generate_product(category):
    products = PRODUCTS.get(category)
    product_id, product = random.choice(products)
    num = random.randint(1, 5)
    separator = random.choice([" "*num, " - ", ", "])
    product_str = f"{product_id}{separator}{modify_capitalization(product)}"
    return product_str

def generate_online_sales_data(nulls_dist: dict, n_rows: int=100_000, n_users: int =3_000, n_orders=70_000):
    users = [f"user_{str(uuid.uuid4())}" for _ in range(n_users)]
    order_ids = [f"order_{str(uuid.uuid4())}" for _ in range(n_orders)]
    categories = ["fertilizers", "seeds", "pesticides", "equipment", "supplies", "soil"]
    random_locations = generate_random_locations_spain(n_rows)
    dates = {order_id: datetime.now() - timedelta(days=np.random.randint(0, 90)) for order_id in order_ids}

    data = {
        "order_id": np.random.choice(order_ids, n_rows),
        "transaction_id": [f"tx_{str(uuid.uuid4())}" for _ in range(n_rows)],
        "user_id": np.random.choice(users, n_rows),
        "sales_price": np.random.uniform(50, 200, n_rows).round(2),
        "currency": np.random.choice(["EUR", "USD"], n_rows),
        "payment_method": np.random.choice(["debit_card", "credit_card", "paypal"], n_rows),
        "transaction_status": np.random.choice(["completed", "failed", "processing"], n_rows),
        "coord": [tuple(x) for x in random_locations],
        "city": [generate_city() for _ in range(n_rows)],
        "address": [generate_address() for _ in range(n_rows)],
        "category": np.random.choice(categories, n_rows)
    }
    
    df = pd.DataFrame(data)
    df["product"] = df["category"].apply(lambda category: generate_product(category))
    df["timestamp"] = df["order_id"].apply(lambda order_id: dates.get(order_id))
    df["timestamp"] = df["timestamp"].dt.floor("ms")
    
    df = null_values(df, nulls_dist, n_rows)
    
    return df

# Inventory dataset

In [5]:
def generate_sales_price(purchase_price):
    margin = random.uniform(1.1, 1.5)
    sales_price = round(purchase_price * margin, 2)
    return sales_price

def generate_manufacturing_date(purchase_date: datetime):
    return purchase_date - timedelta(days=np.random.randint(0, 60))

def generate_inventory_data(nulls_dist: dict, n_rows: int=100_000, n_stores: int=50, n_vendors: int=250):
    categories = ["fertilizers", "seeds", "pesticides", "equipment", "supplies", "soil"]
    stores = [f"store_{str(uuid.uuid4())}" for _ in range(n_stores)]
    vendors = [f"vendor_{str(uuid.uuid4())}" for _ in range(n_vendors)]
    data = {
        "timestamp": [datetime.now() - timedelta(days=np.random.randint(0, 120)) for _ in range(n_rows)],
        "store_id": np.random.choice(stores, n_rows),
        "category": np.random.choice(categories, n_rows),
        "stock_quantity": np.random.uniform(100, 2000, n_rows).round(0),
        "purchase_price": np.random.uniform(50, 200, n_rows).round(2),
        "vendor_id": np.random.choice(vendors, n_rows),
    }
    
    df = pd.DataFrame(data)
    df["product"] = df["category"].apply(lambda category: generate_product(category))
    df["sales_price"] = df["purchase_price"].apply(lambda purchase: generate_sales_price(purchase))
    df["manufacturing_date"] = df["timestamp"].apply(lambda date: date.date() - timedelta(days=np.random.randint(0, 60)))
    df["expiration_date"] = df["manufacturing_date"].apply(lambda date: date + timedelta(days=np.random.randint(30, 150)))
    
    df = null_values(df, nulls_dist, n_rows)
    
    return df

# Meteorologic Dataset

In [6]:
def generate_climate_data(null_dist, n_stations, start_date, end_date):
    timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
    station_ids = [f"station_{i:03d}" for i in range(1, n_stations + 1)]
    station_coords = {
        st_id: (round(np.random.uniform(36.0, 43.8), 6), round(np.random.uniform(-9.3, 3.3), 6))
        for st_id in station_ids
    }

    data = {
        "timestamp": [],
        "station_id": [],
        "coord": [],
        "temperature_C": [],
        "precipitation_mm": [],
        "humidity_percent": [],
        "solar_radiation_Wm2": [],
        "wind_speed_mps": [],
    }

    for st_id in station_ids:
        n = len(timestamps)
        data["timestamp"] = data["timestamp"] + timestamps
        data["station_id"] = data["station_id"] + [st_id for _ in range(n)]
        data["coord"] = data["coord"] + [station_coords.get(st_id) for _ in range(n)]
        data["temperature_C"] = data["temperature_C"] + np.random.normal(15, 10, n).clip(-10, 45).round(1).tolist()
        data["precipitation_mm"] = data["precipitation_mm"] + np.random.exponential(0.5, n).round(2).tolist()
        data["humidity_percent"] = data["humidity_percent"] + np.random.uniform(20, 100, n).round(1).tolist()
        data["solar_radiation_Wm2"] = data["solar_radiation_Wm2"] + np.random.uniform(0, 1200, n).round(1).tolist()
        data["wind_speed_mps"] = data["wind_speed_mps"] + np.random.uniform(0, 20, n).round(1).tolist()
    
    df = pd.DataFrame(data)
    df = null_values(df, null_dist, len(df))
    
    return df


# Logistics Dataset

In [7]:
def generate_actual_delivery_date(scheduled_date, status):
    if status == "received":
        if random.random() < 0.80:
            n_days = 0
        else:
            n_days = random.randint(1,5)
        return scheduled_date + timedelta(days=n_days)
    else:
        return None

def generate_shipment_status():
    if random.random() < 0.80:
        return "received"
    else:
        return random.choice(["pending", "sent"])

def generate_logistics_dataset(df_online_sales, null_dist):
    orders_df = df_online_sales.groupby("order_id")[["timestamp", "coord"]].first().reset_index()
    orders_df["pickup_date"] = orders_df["timestamp"].apply(lambda date: date.date() + timedelta(days=np.random.randint(0, 5)))
    orders_df["scheduled_delivery_date"] = orders_df["pickup_date"].apply(lambda date: date + timedelta(days=np.random.randint(0, 3)))
    orders_df["shipment_status"] = [generate_shipment_status() for _ in range(len(orders_df))]
    orders_df["actual_delivery_date"] = orders_df.apply(lambda row: generate_actual_delivery_date(row["scheduled_delivery_date"], row["shipment_status"]), axis=1)
    df = null_values(orders_df, null_dist, len(orders_df))
    
    return df
    

# Clean Folders Content

In [12]:
import os
import shutil

def clear_folder_contents(folder_path):
    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except Exception as e:
            print(f"Error deleting {item_path}: {e}")

In [16]:
folder = "online_sales"
clear_folder_contents(f"./datasets/{folder}")

# Data generation

## Online sales and logistics

In [8]:
amount_files = 20
for i in range(20):
    indicator = f"{str(uuid.uuid4())}"
    nulls_dist = {
        "transaction_id": random.uniform(0, 0.1),
        "user_id": random.uniform(0, 0.1),
        "timestamp": random.uniform(0, 0.1),
        "sales_price": random.uniform(0, 0.1),
        "currency": random.uniform(0, 0.1),
        "payment_method": random.uniform(0, 0.1),
        "transaction_status": random.uniform(0, 0.1),
        "coord": random.uniform(0, 0.1),
        "city": random.uniform(0, 0.1),
        "address": random.uniform(0, 0.1),
    }
    df_online_sales = generate_online_sales_data(nulls_dist)
    df_online_sales.to_csv(f"datasets/online_sales/sales_{indicator}.csv")

    null_dist = {
        "actual_delivery_date": 0.08931
    }
    logistics_df = generate_logistics_dataset(df_online_sales, null_dist)
    logistics_df["timestamp"] = logistics_df["timestamp"].astype("datetime64[us]")
    logistics_df.to_parquet(f"datasets/logistics/logistics_{indicator}.parquet",engine="pyarrow")

## Inventory

In [9]:
amount_files = 20
for i in range(20):
    indicator = f"{str(uuid.uuid4())}"
    nulls_dist = {
        "timestamp": round(random.uniform(0, 0.1), 4),
        "product": round(random.uniform(0, 0.1), 4),
        "stock_quantity": round(random.uniform(0, 0.1), 4),
        "vendor_id": round(random.uniform(0, 0.1), 4),
        "expiration_date": round(random.uniform(0, 0.1), 4),
        "sales_price": round(random.uniform(0, 0.1), 4),
    }
    df_inventory = generate_inventory_data(nulls_dist)
    df_inventory.to_csv(f"datasets/inventory/inventory_{indicator}.csv")

## Climate

In [10]:
n_stations = 100
month = 3
starting_day = 1
ending_day = 29

for day in range(starting_day, ending_day):
    start_date = datetime(2025, month, day)
    end_date = datetime(2025, month, day+1)

    null_dist = {
        "temperature_C": random.uniform(0, 0.1),
        "humidity_percent": random.uniform(0, 0.1),
        "precipitation_mm": random.uniform(0, 0.1),
        "solar_radiation_Wm2": random.uniform(0, 0.1),
        "wind_speed_mps": random.uniform(0, 0.1),
    }

    df_climate = generate_climate_data(null_dist, n_stations, start_date, end_date)
    indicator_climate = f"{str(start_date.date())}_to_{str(end_date.date())}"
    df_climate.to_json(f"datasets/climate/climate_{indicator_climate}.json", orient="records", lines=True)

  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.date_range(start=start_date, end=end_date, freq='H', inclusive='left').tolist()
  timestamps = pd.da

# Samples

## Online sales

In [None]:
indicator = ""
pd.read_csv(f"datasets/online_sales/sales_{indicator}.csv")

Unnamed: 0.1,Unnamed: 0,order_id,transaction_id,user_id,sales_price,currency,payment_method,transaction_status,coord,city,address,category,product,timestamp
138,138,order_f46c1a42-8e60-4415-bbec-9deaa10ab972,tx_789ce18b-2f8e-485c-af72-2d38d83826c1,user_81165ece-401a-482c-a17c-e203eae1dbd9,,USD,credit_card,failed,"(np.float64(39.521806), np.float64(-1.551518))",CHICLANA DE LA FRONTERA,"Calle de Montera, 189, 36742",soil,soil_001 - premium potting mix,
141,141,order_2f2dba11-5ff3-435c-bb69-8389ce477e60,tx_89ebebef-0598-416e-b3fe-9cc4e57a367b,user_7665d83e-56c8-42f4-8705-04344a4e84fd,,EUR,debit_card,failed,"(np.float64(38.557413), np.float64(-8.631829))",granada,"Calle de Embajadores, 19, 27152",seeds,"seed_003, DROUGHT-RESISTANT CORN SEEDS",
217,217,order_8580bd5b-6512-499c-a0d5-98ad7d62702f,tx_8612cd03-57be-4d72-9de5-75af9dd29803,user_95763fbe-7e3a-461d-bac7-b404f9414719,,EUR,debit_card,processing,"(np.float64(42.356209), np.float64(-6.638135))",SANTANDER,"Calle de Narváez, 32",supplies,supply_002 - DRIP IRRIGATION KIT,2025-04-10 18:50:31.852
279,279,order_c908fa1f-b74f-4548-bb86-e2236c95ea33,tx_e2d82f07-f716-4ee3-8184-a7082f9e9148,user_5a223f11-41a2-4f46-9827-0a0d587e3561,,USD,credit_card,failed,"(np.float64(36.870457), np.float64(-0.980908))",MADRID,"Ronda de Atocha, 117, 25658",pesticides,pest_004 Insect Barrier Dust,
291,291,order_17107068-72d4-44b6-a04e-3c358151a22c,tx_a70f5862-c279-4595-8a5e-49273a79d739,user_e63adaa4-1dcc-4a65-8f0e-2c0ddd4b1a9e,,USD,credit_card,failed,"(np.float64(38.870869), np.float64(-1.862842))",burgos,,soil,"soil_004, ORGANIC GARDEN SOIL",2025-04-15 18:50:31.922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99476,99476,order_2cb019c7-b606-4c1e-916f-409e16a9d368,,,,USD,paypal,processing,"(np.float64(40.718127), np.float64(-2.704268))",Gijón,"Calle de Príncipe de Vergara, 171, 24369",equipment,"equip_003, battery-powered sprayer",2025-03-14 18:50:31.869
99728,99728,order_3f183fa7-4e41-4575-acb9-97e27bf5b710,tx_e76a9a7a-a3fb-417f-8762-705fafce5b66,user_421d462f-3c98-48ce-be7a-98b8ef0c5b45,,EUR,debit_card,completed,"(np.float64(40.697416), np.float64(-6.829516))",ferrol,"Calle de Fuencarral, 77",fertilizers,fert_004 Liquid Seaweed Fertilizer,2025-03-27 18:50:31.883
99812,99812,order_f70309a6-dd9e-4d1d-93c7-90f0e9a6cb37,tx_3dd24e18-e988-43cb-a28c-b9aa4a5c87ff,user_6313f982-93ea-4562-a6a1-c3add13e7d65,,EUR,debit_card,completed,"(np.float64(38.275708), np.float64(0.452136))",Santander,"Calle Mayor, 37, 13362",equipment,"equip_001, heavy-duty tiller",2025-04-11 18:50:31.870
99899,99899,order_0e84e1a1-e8f2-4aac-bb4d-ba210cdb1b39,tx_ff612d43-1841-4103-beae-f2d5eb3877f7,user_d1d93b7a-a57c-44db-891b-5af079bea6c0,,EUR,credit_card,completed,"(np.float64(39.236958), np.float64(-0.797379))",Cáceres,"Avenida de Valladolid, 36, 47035",pesticides,pest_003 - fungicide for fruit trees,2025-03-14 18:50:31.888


In [None]:
indicator = ""
pd.read_csv(f"datasets/online_sales/sales_{indicator}.csv")

Unnamed: 0.1,Unnamed: 0,order_id,transaction_id,user_id,sales_price,currency,payment_method,transaction_status,coord,city,address,category,product,timestamp
0,0,order_b7f08af7-38b2-46c6-9b0f-3c71625b172f,tx_c32c2f07-618a-4fc8-baf6-7dbea5570b37,user_99a66a91-996b-463c-8e3e-01daa94d4eae,144.56,USD,credit_card,completed,"(np.float64(38.894713), np.float64(2.314092))",leon,"Paseo de la Habana, 114, 29814",fertilizers,fert_003 phosphorus growth enhancer,2025-03-10 12:57:26.537564
1,1,order_1e3deeb4-16d7-4bb3-9f16-1a7218e52b4e,tx_91a2a63a-3241-45c6-8931-eff90b47a616,user_b2263a80-8d58-4c54-a7b1-1f3d79beec55,53.64,EUR,paypal,failed,"(np.float64(39.936523), np.float64(-5.687339))",san fernando,"Paseo de Extremadura, 104, 41732",soil,soil_005 mulch and soil conditioner,2025-04-09 12:57:26.479851
2,2,order_c681015c-cbe5-48a9-ac62-26ae8b6b76c6,tx_a9ecce0f-d275-4e35-8df6-ed0a9aeb82d6,user_c096e83f-a441-4388-a3f0-c826ebed39a6,131.33,USD,credit_card,completed,"(np.float64(37.514645), np.float64(1.018441))",Barcelona,"Paseo de Extremadura, 62",pesticides,pest_001 - ORGANIC NEEM OIL SPRAY,2025-03-17 12:57:26.535920
3,3,order_f4e7f4c8-5b8b-44a5-b63a-950e0980e54f,tx_ec2aca77-b676-411c-a1a8-997b08c18388,user_86085218-cfc0-4b58-bc99-45af56749310,110.28,USD,credit_card,processing,"(np.float64(41.032766), np.float64(-0.369419))",murcia,"Calle de José Abascal, 59, 24858",fertilizers,"fert_005, Potash Enriched Formula",2025-03-10 12:57:26.527237
4,4,order_6c9f644b-fe34-49d1-9b79-dc9edcfe19f7,tx_5a4924c2-a40b-4607-8abe-4447692fbc66,user_fcdc5be6-5376-4678-8f1b-ba79c7b40806,157.02,EUR,debit_card,failed,"(np.float64(39.081563), np.float64(0.770435))",Albacete,"Calle Alcalá, 193, 25335",soil,soil_005 - Mulch and Soil Conditioner,2025-04-20 12:57:26.503599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,order_6512538e-69d9-4738-b51a-f3b5610550d8,tx_3613f5a7-a099-490a-8f68-40c6d5a102a4,user_000fc8c6-132d-45b0-80ed-135cbe008bad,188.94,EUR,paypal,completed,"(np.float64(42.466082), np.float64(-1.58248))",CÓRDOBA,"Carrer de València, 111, 49929",fertilizers,fert_002 - Organic Compost Mix,2025-04-10 12:57:26.525683
99996,99996,order_2039545d-d440-4426-81cf-c31f01a3b71a,tx_5c3f1956-0725-46bb-b29b-bcead53a42e6,user_9f67b720-7c35-41e5-8412-462002eb8ae4,171.89,USD,credit_card,failed,"(np.float64(42.254193), np.float64(-6.603657))",girona,"Calle de Vallehermoso, 76",supplies,supply_004 - HARVEST BASKETS,2025-03-08 12:57:26.436602
99997,99997,order_a8ed3fe3-fe6e-4da4-afef-3cf019f7b060,tx_53f5d2f0-8937-4417-a5f2-84309c0af85b,user_2a4d32c4-6738-41f2-845b-e0f7d7ddb812,86.81,EUR,credit_card,completed,"(np.float64(39.812957), np.float64(2.971283))",TALAVERA DE LA REINA,"Carrer de Balmes, 16",pesticides,pest_003 Fungicide for Fruit Trees,2025-02-07 12:57:26.443658
99998,99998,order_76d0632c-3d88-4ea0-805d-b2f1ead249f7,tx_f52ad8c3-14a0-47d8-aba4-431eca853c5b,user_809d2c12-8bcb-4ce2-aaa5-43fa07e412d1,158.04,EUR,credit_card,failed,"(np.float64(37.805768), np.float64(-2.741933))",lleida,"Calle de Embajadores, 89",soil,soil_001 - Premium Potting Mix,2025-04-18 12:57:26.521687


## Logistics

In [None]:
indicator = ""
pd.read_parquet(f"datasets/logistics/logistics_{indicator}.parquet")

Unnamed: 0,order_id,timestamp,coord,pickup_date,scheduled_delivery_date,shipment_status,actual_delivery_date
30,order_001de955-7e57-4547-a878-6130e47c8951,NaT,"[37.580226, 1.787842]",,,received,
33,order_001ec053-8871-4b7d-96f7-296650b6e177,NaT,"[39.437611, -0.206401]",,,sent,
49,order_0033f305-6ef1-410f-9cfc-bbbc5c3fc7b2,NaT,"[42.138466, 2.936842]",,,received,
68,order_004fcd72-7bd3-4cf8-933f-1f5c088f922f,NaT,"[43.107976, -6.05112]",,,received,
73,order_00562ebd-dcdb-4a1b-8e65-4f9894d6afdf,NaT,"[40.804068, -1.027625]",,,received,
...,...,...,...,...,...,...,...
53043,order_ff7b33a8-c626-459e-a9b2-963cc6806988,NaT,"[37.835529, -0.096186]",,,received,
53066,order_ff901f43-2bb6-40bd-9a0f-10b1e3436d54,NaT,"[36.239071, -6.401546]",,,sent,
53071,order_ff973de2-61f7-4d80-899f-79b151d56440,NaT,"[37.556962, 1.649152]",,,pending,
53092,order_ffb8adcb-b0ff-4f60-8a01-3825499f9f85,NaT,"[42.792665, 0.527017]",,,received,


In [None]:
import pyarrow.parquet as pq

indicator = ""
table = pq.read_table("datasets/logistics/logistics_{indicator}.parquet")
print(table.schema)


order_id: string
timestamp: timestamp[us]
coord: list<element: double>
  child 0, element: double
pickup_date: date32[day]
scheduled_delivery_date: date32[day]
shipment_status: string
actual_delivery_date: date32[day]
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1151


## Inventory

In [None]:
indicator = ""
pd.read_csv(f"datasets/inventory/inventory_{indicator}.csv")

Unnamed: 0.1,Unnamed: 0,timestamp,store_id,category,stock_quantity,purchase_price,vendor_id,product,sales_price,manufacturing_date,expiration_date
5,5,2025-03-26 19:51:31.407161,store_7b8878bd-172f-4cc0-8f18-92eb9c4e46e0,seeds,775.0,177.20,vendor_0a73ba81-d530-4ea3-8900-79efb874a343,seed_002 ORGANIC TOMATO SEEDS,,2025-02-28,2025-04-16
40,40,2025-04-25 19:51:31.407213,store_7a346fd8-a5b4-4db2-9b43-432620eac62c,soil,1563.0,80.05,vendor_f05f5318-bf6e-48b8-b293-98b6d6825bd8,"soil_001, premium potting mix",,2025-03-05,2025-06-01
67,67,2025-04-23 19:51:31.407252,store_2010d7a7-0ba5-4b62-8634-576b6670dfc8,soil,960.0,66.68,vendor_fd2b33f6-ec9f-45aa-abaf-9b65d3d008f7,soil_004 ORGANIC GARDEN SOIL,,2025-04-16,2025-07-21
72,72,2025-02-07 19:51:31.407259,store_39a6eea4-a5d8-47ac-890c-d07f933b4195,fertilizers,1930.0,80.98,vendor_d6ba0577-5c90-4333-a8f2-f013464b4db7,fert_003 PHOSPHORUS GROWTH ENHANCER,,2025-01-31,2025-03-18
74,74,2025-05-01 19:51:31.407262,store_7d0fcfa0-2526-4bc3-b7c6-e25dcd5b6480,supplies,178.0,85.09,vendor_4db9e28b-a12e-492e-b9ca-004ec1a51a7b,,,2025-04-15,2025-07-05
...,...,...,...,...,...,...,...,...,...,...,...
99930,99930,2025-03-26 19:51:31.550039,store_11023503-a750-4647-9a53-45560f437b0f,pesticides,167.0,81.93,vendor_b850f65e-c958-4885-b853-4d75134626d0,pest_005 natural slug repellent,,2025-02-27,2025-06-17
99955,99955,2025-01-24 19:51:31.550074,store_bf5e3aef-7c2f-4375-99c4-7e2efb80d17a,supplies,451.0,181.56,vendor_a1d87841-a0f3-4bed-8374-9c237e1a487a,supply_004 - Harvest Baskets,,2024-12-10,2025-04-03
99971,99971,2025-04-06 19:51:31.550097,store_84d39ccf-c888-4e15-9d74-104782db0f92,soil,1968.0,66.28,vendor_dd85c1fa-1790-49f7-957f-6f5ae537f7c5,soil_002 CACTUS AND SUCCULENT SOIL,,2025-04-01,2025-05-26
99973,99973,2025-03-18 19:51:31.550099,store_6fb08e58-1544-4eeb-bd8d-a10f58ff8fc7,equipment,1524.0,141.49,vendor_cf15c1cb-cbdc-47cd-a94b-bf714613d88e,equip_002 Precision Seed Spreader,,2025-02-24,2025-07-21


## Climate

In [105]:
pd.read_json("datasets/climate/climate_2025-03-01_to_2025-03-02.json", lines=True)

Unnamed: 0,timestamp,station_id,coord,temperature_C,precipitation_mm,humidity_percent,solar_radiation_Wm2,wind_speed_mps
0,2025-03-01 00:00:00,station_001,"[36.605885, -0.5795819999999999]",21.4,0.27,39.1,533.3,
1,2025-03-01 01:00:00,station_001,"[36.605885, -0.5795819999999999]",16.3,0.96,96.6,469.7,6.2
2,2025-03-01 02:00:00,station_001,"[36.605885, -0.5795819999999999]",23.8,1.72,99.3,374.7,14.0
3,2025-03-01 03:00:00,station_001,"[36.605885, -0.5795819999999999]",7.8,1.44,67.7,1102.2,13.5
4,2025-03-01 04:00:00,station_001,"[36.605885, -0.5795819999999999]",17.4,0.18,34.6,387.5,13.7
...,...,...,...,...,...,...,...,...
2395,2025-03-01 19:00:00,station_100,"[38.361384, 0.7997000000000001]",6.8,0.19,62.1,423.5,6.6
2396,2025-03-01 20:00:00,station_100,"[38.361384, 0.7997000000000001]",10.1,,83.5,319.8,10.5
2397,2025-03-01 21:00:00,station_100,"[38.361384, 0.7997000000000001]",22.5,0.90,,721.0,5.4
2398,2025-03-01 22:00:00,station_100,"[38.361384, 0.7997000000000001]",14.8,0.15,47.4,737.7,9.0
