#### 1: Imports and Setup

In [1]:
import os

import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG once, up front, so every np.random.* draw
# made by later cells is repeatable on a fresh kernel.
SEED = 42
np.random.seed(SEED)

print("Libraries imported and random seed set for reproducibility.")

Libraries imported and random seed set for reproducibility.


#### 2: Generate Clean Base Dataset

In [2]:
# --- Simulation parameters -------------------------------------------------
num_ticks = 20000  # number of ticks to simulate (~1 trading day at high frequency)

base_price = 150.0   # level the additive price walk is offset by
volatility = 0.001   # scale applied to each standard-normal tick increment
drift = 0.00001      # constant added to every tick increment (slight upward bias)

# Evenly spaced tick times spanning 09:30:00-16:00:00 (both endpoints included).
# NOTE(review): 2025-06-01 is a Sunday; confirm the date is intentional if any
# downstream step applies a trading calendar or weekday filter.
timestamps = pd.date_range(
    start='2025-06-01 09:30:00',
    end='2025-06-01 16:00:00',
    periods=num_ticks,
)

# Additive random walk: Gaussian increments plus drift, accumulated and
# offset by base_price, then floored at 0.01 so no price is zero or negative.
price_changes = volatility * np.random.standard_normal(num_ticks) + drift
prices = np.cumsum(price_changes) + base_price
prices = np.clip(prices, 0.01, None)

# Trade sizes: uniform integers from 1 to 499 (the upper bound is exclusive).
volumes = np.random.randint(low=1, high=500, size=num_ticks)

# Venue label per tick, drawn with fixed probabilities (40/30/20/10 %).
exchanges = np.random.choice(
    ['NYSE', 'NASDAQ', 'BATS', 'ARCA'],
    size=num_ticks,
    p=[0.4, 0.3, 0.2, 0.1],
)

# Assemble the frame, order it by time and renumber rows from 0.
df = (
    pd.DataFrame({
        'timestamp': timestamps,
        'exchange': exchanges,
        'price': prices,
        'volume': volumes,
    })
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Quick sanity summary of the clean data.
print("Clean synthetic dataset generated.")
print(df.head(10))
print(f"\nShape: {df.shape}")
print(f"Price range: {df['price'].min():.4f} - {df['price'].max():.4f}")
print(f"Total volume: {df['volume'].sum():,}")

Clean synthetic dataset generated.
                      timestamp exchange       price  volume
0 2025-06-01 09:30:00.000000000     BATS  150.000507     288
1 2025-06-01 09:30:01.170058502   NASDAQ  150.000378      81
2 2025-06-01 09:30:02.340117005   NASDAQ  150.001036      18
3 2025-06-01 09:30:03.510175508   NASDAQ  150.002569     196
4 2025-06-01 09:30:04.680234011     NYSE  150.002345     291
5 2025-06-01 09:30:05.850292514     NYSE  150.002121     360
6 2025-06-01 09:30:07.020351017   NASDAQ  150.003710     356
7 2025-06-01 09:30:08.190409520     NYSE  150.004488     279
8 2025-06-01 09:30:09.360468023     NYSE  150.004028     463
9 2025-06-01 09:30:10.530526526   NASDAQ  150.004581     397

Shape: (20000, 4)
Price range: 149.9875 - 150.3268
Total volume: 4,981,785


#### 3: Inject Realistic Errors (Noisy Raw Data Simulation)

In [3]:
# Corrupt the clean frame `df` with several kinds of synthetic feed errors.
# The cell mutates/rebinds `df` and draws from the global NumPy RNG, so the
# result depends on statement order; re-running the cell without regenerating
# `df` applies a further round of errors on top of the existing ones.
# All counts are fractions of `num_ticks` (the original, pre-duplication size).
# NOTE(review): each np.random.choice below samples df.index independently, so
# the same row can be selected for more than one error type.

# 1) Missing prices: set the price to NaN on a random sample of rows.
n_missing = int(0.03 * num_ticks)           # ~3% missing prices (feed drops)
missing_idx = np.random.choice(df.index, n_missing, replace=False)
df.loc[missing_idx, 'price'] = np.nan

# 2) Outliers: multiply the price by a random factor drawn from [5, 20).
# NOTE(review): a row already set to NaN above stays NaN after multiplication,
# so the number of visible outliers can be smaller than n_outliers.
n_outliers = int(0.005 * num_ticks)         # ~0.5% extreme outliers (glitches)
outlier_idx = np.random.choice(df.index, n_outliers, replace=False)
df.loc[outlier_idx, 'price'] = df.loc[outlier_idx, 'price'] * np.random.uniform(5, 20, n_outliers)

# 3) Negative prices: force the sign negative (-abs) on a random sample of rows.
n_negative_price = int(0.001 * num_ticks)   # Rare negative prices (encoding bug)
neg_idx = np.random.choice(df.index, n_negative_price, replace=False)
df.loc[neg_idx, 'price'] = -df.loc[neg_idx, 'price'].abs()

# 4) Zero volumes: overwrite the volume with 0 on a random sample of rows.
n_zero_volume = int(0.002 * num_ticks)      # Zero volume trades (invalid)
zero_vol_idx = np.random.choice(df.index, n_zero_volume, replace=False)
df.loc[zero_vol_idx, 'volume'] = 0

# 5) Duplicates: append exact copies of sampled rows (including any errors
# injected above), then re-sort by timestamp and renumber the index from 0.
n_duplicates = int(0.01 * num_ticks)        # 1% duplicates (retransmission)
dup_idx = np.random.choice(df.index, n_duplicates, replace=False)
df = pd.concat([df, df.loc[dup_idx]]).sort_values('timestamp').reset_index(drop=True)

# 6) Timestamp gaps: pick positions away from the first/last 1000 rows and
# push every later row forward by 60-299 seconds (randint's upper bound is
# exclusive). The label slice idx+1: relies on the 0..n-1 index set just above.
# NOTE(review): if a gap lands between a row and its appended copy, only the
# copy is shifted and the pair no longer matches in df.duplicated().
# Timestamp gaps (network delays)
n_timestamp_gaps = 5
gap_idx = np.random.choice(df.index[1000:-1000], n_timestamp_gaps, replace=False)
for idx in gap_idx:
    gap_size = pd.Timedelta(seconds=np.random.randint(60, 300))
    df.loc[idx+1:, 'timestamp'] += gap_size

# Summary: the first four counts are measured on the final frame (so they
# include duplicated rows); the gap count is the configured n_timestamp_gaps,
# not a measurement.
print("Realistic errors injected for quant trading simulation.")
print("Error summary:")
print(f"- Missing prices: {df['price'].isna().sum()}")
print(f"- Negative prices: {(df['price'] < 0).sum()}")
print(f"- Zero volumes: {(df['volume'] == 0).sum()}")
print(f"- Duplicates: {df.duplicated().sum()}")
print(f"- Timestamp gaps created: {n_timestamp_gaps}")
print("\nFirst 10 rows after errors:")
print(df.head(10))
print(f"\nFinal shape: {df.shape}")

Realistic errors injected for quant trading simulation.
Error summary:
- Missing prices: 608
- Negative prices: 20
- Zero volumes: 43
- Duplicates: 200
- Timestamp gaps created: 5

First 10 rows after errors:
                      timestamp exchange       price  volume
0 2025-06-01 09:30:00.000000000     BATS  150.000507     288
1 2025-06-01 09:30:01.170058502   NASDAQ  150.000378      81
2 2025-06-01 09:30:02.340117005   NASDAQ  150.001036      18
3 2025-06-01 09:30:03.510175508   NASDAQ  150.002569     196
4 2025-06-01 09:30:04.680234011     NYSE  150.002345     291
5 2025-06-01 09:30:05.850292514     NYSE  150.002121     360
6 2025-06-01 09:30:07.020351017   NASDAQ  150.003710     356
7 2025-06-01 09:30:08.190409520     NYSE  150.004488     279
8 2025-06-01 09:30:09.360468023     NYSE  150.004028     463
9 2025-06-01 09:30:10.530526526   NASDAQ  150.004581     397

Final shape: (20200, 4)


#### 4: Save Raw Noisy Dataset

In [4]:
# Persist the noisy frame as CSV; index=False keeps the integer row index
# out of the file so only the four data columns are written.
output_path = 'raw_market_data_with_errors.csv'
df.to_csv(output_path, index=False)

# Report the size of the file actually written rather than a fixed guess,
# so the message stays correct if the row count or columns change.
size_mb = os.path.getsize(output_path) / 1_000_000

print(f"Raw noisy dataset saved to '{output_path}'")
print(f"Rows: {len(df):,}, Size on disk: {size_mb:.2f} MB")

Raw noisy dataset saved to 'raw_market_data_with_errors.csv'
Rows: 20,200, Estimated size: ~2 MB
