In [1]:
# Package imports
import pandas as pd
import numpy as np

In [2]:
# Dispatch data simulation

# Set random seed for reproducibility
np.random.seed(42)

# Define parameters for simulation
n_rows = 500
job_types = ["towing", "fuel_delivery", "battery_jump", "lockout", "tire_change"]
location_ids = range(101, 151)  # Simulating 50 unique locations
start_time = pd.Timestamp("2024-01-01 00:00:00")
end_time = pd.Timestamp("2024-01-03 23:59:59")

# Generate simulated data
dispatch_ids = np.arange(1, n_rows + 1)
location_id_samples = np.random.choice(location_ids, n_rows)
job_type_samples = np.random.choice(job_types, n_rows)
dispatch_time_samples = pd.to_datetime(np.random.uniform(start_time.timestamp(), end_time.timestamp(), n_rows), unit="s")

# Create the DataFrame
dispatch_data_simulated = pd.DataFrame({
    "dispatch_id": dispatch_ids,
    "location_id": location_id_samples,
    "job_type": job_type_samples,
    "dispatch_time": dispatch_time_samples
})

# Save to CSV
dispatch_data_simulated.to_csv("data/dispatch_data_simulated.csv", index=False)

# Preview the dataset
print(dispatch_data_simulated)


# Introduce missing and junk values into dispatch_data
dispatch_data_simulated.loc[10:20, "job_type"] = None  # Introduce missing job types
dispatch_data_simulated.loc[30, "location_id"] = 9999  # Invalid location_id
dispatch_data_simulated.loc[40, "dispatch_time"] = "InvalidTime"  # Corrupted timestamp

# Save the updated dataset
dispatch_data_simulated.to_csv("data/dispatch_data_with_issues.csv", index=False)

     dispatch_id  location_id       job_type                 dispatch_time
0              1          139    tire_change 2024-01-01 04:47:15.766005248
1              2          129        lockout 2024-01-03 05:21:37.731175424
2              3          115         towing 2024-01-02 17:21:42.856463104
3              4          143    tire_change 2024-01-03 12:36:41.177106944
4              5          108  fuel_delivery 2024-01-01 10:03:48.860249088
..           ...          ...            ...                           ...
495          496          105   battery_jump 2024-01-03 00:59:44.738598400
496          497          112         towing 2024-01-02 23:07:28.652230400
497          498          116  fuel_delivery 2024-01-01 19:40:28.596138496
498          499          126        lockout 2024-01-03 20:27:42.884471552
499          500          126   battery_jump 2024-01-01 10:52:34.054491648

[500 rows x 4 columns]


In [3]:
# Traffic data simulation

# Set random seed for reproducibility
np.random.seed(42)

# Define parameters for simulation
n_rows = 700
location_ids = range(101, 151)  # Simulating 50 unique locations
traffic_levels = ["low", "moderate", "high", "severe"]
start_date = pd.Timestamp("2024-01-01")
end_date = pd.Timestamp("2024-01-03")

# Generate simulated data
location_id_samples = np.random.choice(location_ids, n_rows)
date_samples = pd.to_datetime(
    np.random.uniform(start_date.timestamp(), end_date.timestamp(), n_rows), unit="s"
).normalize()  # Normalize to ensure dates (no time component)
traffic_level_samples = np.random.choice(traffic_levels, n_rows)

# Create the DataFrame
traffic_data_simulated = pd.DataFrame({
    "location_id": location_id_samples,
    "date": date_samples,
    "traffic_level": traffic_level_samples
})

# Save to CSV
traffic_data_simulated.to_csv("data/traffic_data_simulated.csv", index=False)

# Preview the dataset
print(traffic_data_simulated.head())

# Introduce missing and junk values into traffic_data
traffic_data_simulated.loc[50:60, "traffic_level"] = None  # Missing traffic levels
traffic_data_simulated.loc[100, "location_id"] = 123456  # Invalid location_id
traffic_data_simulated.loc[150, "date"] = "2025-01-99"  # Invalid date

# Save the updated dataset
traffic_data_simulated.to_csv("data/traffic_data_with_issues.csv", index=False)

   location_id       date traffic_level
0          139 2024-01-01          high
1          129 2024-01-02      moderate
2          115 2024-01-01        severe
3          143 2024-01-02           low
4          108 2024-01-01      moderate


In [4]:
# Weather data simulation

# Set random seed for reproducibility
np.random.seed(456)

# Define parameters for simulation
n_rows = 200
location_ids = range(101, 151)  # Simulating 50 unique locations
start_date = pd.Timestamp("2024-01-01")
end_date = pd.Timestamp("2024-01-03")
temperature_range = (60, 100)  # Simulated temperature range (in Fahrenheit)
precipitation_range = (0, 3)  # Simulated precipitation (in inches)

# Generate simulated data
location_id_samples = np.random.choice(location_ids, n_rows)
date_samples = pd.to_datetime(
    np.random.uniform(start_date.timestamp(), end_date.timestamp(), n_rows), unit="s"
).normalize()  # Normalize to ensure only the date
temperature_samples = np.random.uniform(temperature_range[0], temperature_range[1], n_rows).round(1)
dof = 2  # Degrees of freedom for chi square
precipitation_samples = np.random.chisquare(dof, n_rows).round(2)

# Create the DataFrame
weather_data_simulated = pd.DataFrame({
    "location_id": location_id_samples,
    "date": date_samples,
    "temperature": temperature_samples,
    "precipitation": precipitation_samples
})

# Save to CSV
weather_data_simulated.to_csv("data/weather_data_simulated.csv", index=False)

# Preview the dataset
print(weather_data_simulated.head())


# Introduce missing and junk values into weather_data
weather_data_simulated.loc[20:30, "temperature"] = None  # Missing temperatures
weather_data_simulated.loc[50, "precipitation"] = -1  # Invalid precipitation
weather_data_simulated.loc[70, "date"] = "InvalidDate"  # Corrupted date

# Save the updated dataset
weather_data_simulated.to_csv("data/weather_data_with_issues.csv", index=False)

   location_id       date  temperature  precipitation
0          128 2024-01-02         91.5           2.51
1          138 2024-01-01         73.3           2.46
2          144 2024-01-02         64.5           1.06
3          126 2024-01-02         64.1           0.48
4          143 2024-01-01         75.8           0.35
