In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


In [1]:
# Package imports
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Dispatch data simulation
#
# Builds a synthetic dispatch table, writes the clean version to CSV, then
# injects a few deliberate data-quality problems and writes that version to a
# second CSV.

# Set random seed for reproducibility
np.random.seed(142)

# Define parameters for simulation
n_rows = 500
job_types = ["towing", "fuel_delivery", "battery_jump", "lockout", "tire_change"]
location_ids = range(101, 116)  # ids 101..115 (the upper bound is exclusive)
start_time = pd.Timestamp("2024-01-01 00:00:00")
end_time = pd.Timestamp("2024-01-03 23:59:59")

# Generate simulated data.
# NOTE: with a fixed seed, the generated values depend on the order of the
# np.random calls below, so keep that order stable.
dispatch_ids = np.arange(1, n_rows + 1)
location_id_samples = np.random.choice(location_ids, n_rows)
job_type_samples = np.random.choice(job_types, n_rows)
# Draw epoch seconds uniformly between start_time and end_time, then convert
# them to timestamps.
dispatch_time_samples = pd.to_datetime(
    np.random.uniform(start_time.timestamp(), end_time.timestamp(), n_rows), unit="s"
)

# Create the DataFrame
dispatch_data_simulated = pd.DataFrame({
    "dispatch_id": dispatch_ids,
    "location_id": location_id_samples,
    "job_type": job_type_samples,
    "dispatch_time": dispatch_time_samples
})

# Make sure the output directory exists so the CSV writes below do not fail
# when the notebook is run from a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)

# Save to CSV
dispatch_data_simulated.to_csv("data/dispatch_data_simulated.csv", index=False)

# Preview the dataset
print(dispatch_data_simulated)


# Introduce missing and junk values into dispatch_data.
# NOTE: this modifies dispatch_data_simulated in place, so from this point on
# the variable holds the version with issues, not the clean data saved above.
# .loc slices are label-based and inclusive: this blanks rows 10 through 20.
dispatch_data_simulated.loc[10:20, "job_type"] = None  # Introduce missing job types
dispatch_data_simulated.loc[30, "location_id"] = 9999  # Invalid location_id (outside 101..115)
# A plain string cannot be stored in a datetime64 column. Cast the column to
# object explicitly instead of relying on implicit upcasting during assignment,
# which recent pandas versions deprecate.
dispatch_data_simulated["dispatch_time"] = dispatch_data_simulated["dispatch_time"].astype(object)
dispatch_data_simulated.loc[40, "dispatch_time"] = "InvalidTime"  # Corrupted timestamp

# Save the updated dataset
dispatch_data_simulated.to_csv("data/dispatch_data_with_issues.csv", index=False)

     dispatch_id  location_id       job_type                 dispatch_time
0              1          106  fuel_delivery 2024-01-02 08:47:37.867155968
1              2          106  fuel_delivery 2024-01-03 23:29:39.787000320
2              3          112    tire_change 2024-01-03 11:09:58.744973312
3              4          113   battery_jump 2024-01-01 12:23:25.783618048
4              5          111        lockout 2024-01-01 15:50:39.269447424
..           ...          ...            ...                           ...
495          496          112    tire_change 2024-01-02 05:29:15.308436992
496          497          107  fuel_delivery 2024-01-01 07:03:13.519422720
497          498          102  fuel_delivery 2024-01-02 13:41:25.300250624
498          499          106         towing 2024-01-02 20:11:06.366472960
499          500          103  fuel_delivery 2024-01-02 23:10:55.364893440

[500 rows x 4 columns]


In [3]:
# Tally how many dispatches fall under each job type.
# value_counts() leaves out missing entries by default, so rows whose job_type
# is missing are not part of these totals.
dispatch_data_simulated.loc[:, "job_type"].value_counts()

job_type
lockout          107
battery_jump     101
fuel_delivery     99
tire_change       93
towing            89
Name: count, dtype: int64

In [4]:
# Traffic data simulation
#
# Builds a synthetic per-location, per-date traffic table, writes the clean
# version to CSV, then injects a few deliberate data-quality problems and
# writes that version to a second CSV.

# Set random seed for reproducibility
np.random.seed(987)

# Define parameters for simulation
n_rows = 400
location_ids = range(101, 116)  # ids 101..115 (the upper bound is exclusive)
traffic_levels = ["low", "moderate", "high", "severe"]
start_date = pd.Timestamp("2024-01-01")
end_date = pd.Timestamp("2024-01-05")

# Generate simulated data.
# NOTE: with a fixed seed, the generated values depend on the order of the
# np.random calls below, so keep that order stable.
location_id_samples = np.random.choice(location_ids, n_rows)
date_samples = pd.to_datetime(
    np.random.uniform(start_date.timestamp(), end_date.timestamp(), n_rows), unit="s"
).normalize()  # Normalize to ensure dates (no time component)
traffic_level_samples = np.random.choice(traffic_levels, n_rows)

# Create the DataFrame
traffic_data_simulated = pd.DataFrame({
    "location_id": location_id_samples,
    "date": date_samples,
    "traffic_level": traffic_level_samples
})

# Make sure the output directory exists so the CSV writes below do not fail
# when the notebook is run from a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)

# Save to CSV
traffic_data_simulated.to_csv("data/traffic_data_simulated.csv", index=False)

# Preview the dataset
print(traffic_data_simulated.head())

# Introduce missing and junk values into traffic_data.
# NOTE: this modifies traffic_data_simulated in place, so from this point on
# the variable holds the version with issues, not the clean data saved above.
# .loc slices are label-based and inclusive: this blanks rows 50 through 60.
traffic_data_simulated.loc[50:60, "traffic_level"] = None  # Missing traffic levels
traffic_data_simulated.loc[100, "location_id"] = 123456  # Invalid location_id (outside 101..115)
# A plain string cannot be stored in a datetime64 column. Cast the column to
# object explicitly instead of relying on implicit upcasting during assignment,
# which recent pandas versions deprecate.
traffic_data_simulated["date"] = traffic_data_simulated["date"].astype(object)
traffic_data_simulated.loc[150, "date"] = "2024-01-99"  # Invalid date

# Save the updated dataset
traffic_data_simulated.to_csv("data/traffic_data_with_issues.csv", index=False)

   location_id       date traffic_level
0          104 2024-01-01          high
1          108 2024-01-04        severe
2          110 2024-01-02      moderate
3          107 2024-01-04      moderate
4          104 2024-01-01        severe


In [5]:
# Weather data simulation
#
# Builds a synthetic per-location, per-date weather table, writes the clean
# version to CSV, then injects a few deliberate data-quality problems and
# writes that version to a second CSV.

# Set random seed for reproducibility
np.random.seed(456)

# Define parameters for simulation
n_rows = 450
location_ids = range(101, 125)  # ids 101..124 (the upper bound is exclusive)
start_date = pd.Timestamp("2024-01-01")
end_date = pd.Timestamp("2024-01-04")
temperature_range = (60, 100)  # Simulated temperature range (in Fahrenheit)
# NOTE: precipitation_range is declared but not applied below. Precipitation is
# drawn from a chi-square distribution and is not limited to this range.
precipitation_range = (0, 3)  # Simulated precipitation (in inches)

# Generate simulated data.
# NOTE: with a fixed seed, the generated values depend on the order of the
# np.random calls below, so keep that order stable.
location_id_samples = np.random.choice(location_ids, n_rows)
date_samples = pd.to_datetime(
    np.random.uniform(start_date.timestamp(), end_date.timestamp(), n_rows), unit="s"
).normalize()  # Normalize to ensure only the date
temperature_samples = np.random.uniform(temperature_range[0], temperature_range[1], n_rows).round(1)
dof = 2  # Degrees of freedom for chi square
precipitation_samples = np.random.chisquare(dof, n_rows).round(2)

# Create the DataFrame
weather_data_simulated = pd.DataFrame({
    "location_id": location_id_samples,
    "date": date_samples,
    "temperature": temperature_samples,
    "precipitation": precipitation_samples
})

# Make sure the output directory exists so the CSV writes below do not fail
# when the notebook is run from a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)

# Save to CSV
weather_data_simulated.to_csv("data/weather_data_simulated.csv", index=False)

# Preview the dataset
print(weather_data_simulated.head())


# Introduce missing and junk values into weather_data.
# NOTE: this modifies weather_data_simulated in place, so from this point on
# the variable holds the version with issues, not the clean data saved above.
# .loc slices are label-based and inclusive: this blanks rows 20 through 30.
weather_data_simulated.loc[20:30, "temperature"] = None  # Missing temperatures
weather_data_simulated.loc[50, "precipitation"] = -1  # Invalid precipitation (negative amount)
# A plain string cannot be stored in a datetime64 column. Cast the column to
# object explicitly instead of relying on implicit upcasting during assignment,
# which recent pandas versions deprecate.
weather_data_simulated["date"] = weather_data_simulated["date"].astype(object)
weather_data_simulated.loc[70, "date"] = "InvalidDate"  # Corrupted date

# Save the updated dataset
weather_data_simulated.to_csv("data/weather_data_with_issues.csv", index=False)

   location_id       date  temperature  precipitation
0          106 2024-01-01         72.3           1.85
1          112 2024-01-02         90.3           0.96
2          111 2024-01-03         72.2           4.50
3          116 2024-01-03         71.4           3.21
4          115 2024-01-01         74.7           0.00
