<a href="https://colab.research.google.com/github/cbonnin88/RailFlow/blob/main/Data_Generation_RailFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.1.2


In [21]:
import pandas as pd
import numpy as np
import random
import uuid
from faker import Faker
from datetime import datetime, timedelta

In [22]:
# Initialize Faker (French Locale for realism)
fake = Faker('fr_FR')
Faker.seed(42)
np.random.seed(42)

In [23]:
# Configuration
NUM_USERS = 5000
NUM_SEARCHES = 15000
NUM_BOOKINGS = 4000 # Not all searches result in bookings

print('Generating RailFlow Dirty Databases')

Generating RailFlow Dirty Databases


# **Generate users (users.csv)**

In [24]:
print(f'Generating {NUM_USERS} users...')

user_ids = [str(uuid.uuid4()) for _ in range(NUM_USERS)]
sub_types = ['Free','Max Jeune','Max Senior', None] # None = Dirty Data

users_data = {
    'user_id': user_ids,
    'signup_date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(NUM_USERS)],
    'subscription_type': [np.random.choice(sub_types, p=[0.6,0.2,0.1,0.1]) for _ in range(NUM_USERS)],
    'age': [fake.random_int(min=18,max=90) for _ in range(NUM_USERS)],
    'device_os': [np.random.choice(['iOS','Android','Web','Unknown']) for _ in range(NUM_USERS)]
}

df_users = pd.DataFrame(users_data)

Generating 5000 users...


# **Injecting Dirt (Users)**

1. Duplicates: Duplicate 5% of users

In [25]:
duplicates = df_users.sample(frac=0.05)
df_users = pd.concat([df_users, duplicates])

In [26]:
# Dirty Ages: Converting some ages to weird strings or negative numbers
def dirty_age(age):
  choice = np.random.rand()
  if choice < 0.02: return f'{age}yo', # String with text
  if choice < 0.04: return -age, # Negative logic error
  if choice < 0.05: return 'unknown' # String text
  return age

df_users['age'] = df_users['age'].apply(dirty_age)

In [27]:
df_users.to_csv('users.csv', index=False)
print('users.csv created')

users.csv created


In [28]:
df_users.head()

Unnamed: 0,user_id,signup_date,subscription_type,age,device_os
0,c368c747-5eac-4b48-bd5a-80f14229d70b,2025-04-25,Free,34,Web
1,0fd64e81-710d-4dd6-9380-2f2b8b18af77,2024-02-01,,90,Unknown
2,50f44434-2231-47a6-af09-d84d2ef07082,2024-08-02,Max Jeune,81,Web
3,087fe693-1095-4915-82bb-27a841d88087,2024-06-25,Free,84,Unknown
4,1fc7b57c-d5c9-4276-9668-de30b8e912d5,2025-07-05,Free,81,Unknown


# **Generate Searches (searches.csv)**

In [29]:
print(f'Generating {NUM_SEARCHES} searches')

stations = ['Paris Gare de Lyon', 'Lyon Part-Dieu', 'Marseille St-Charles', 'Bordeaux St-Jean', 'Lille Europe', 'Strasbourg', 'Nantes']

search_ids = [str(uuid.uuid4()) for _ in range(NUM_SEARCHES)]
# Pick random users for searches
search_user_ids = [np.random.choice(user_ids) for _ in range(NUM_SEARCHES)]

searches_data = {
    'search_id': search_ids,
    'user_id': search_user_ids,
    'timestamp': [fake.date_time_between(start_date='-1y',end_date='now') for _ in range(NUM_SEARCHES)],
    'origin_station':[np.random.choice(stations) for _ in range(NUM_SEARCHES)],
    'destination_station': [np.random.choice(stations) for _ in range(NUM_SEARCHES)],
    'passenger_count': [np.random.choice([1,2,3,4], p=[0.7,0.2,0.05,0.05]) for _ in range(NUM_SEARCHES)]
}

df_searches = pd.DataFrame(searches_data)

Generating 15000 searches


# **Injecting Dirty Data**

In [30]:
df_searches['departure_date'] = df_searches['timestamp'] + pd.to_timedelta(np.random.randint(0,30,size=NUM_SEARCHES), unit='D')

In [31]:
# 1. Casing issues: 'Paris' vs 'PARIS' vs 'paris'

def dirty_stations(station):
  choice = np.random.rand()
  if choice < 0.10: return station.upper()
  if choice < 0.20: return station.lower()
  if choice < 0.25: return f'{station}' # Trailing whitespace
  return station

df_searches['destination_station'] = df_searches['destination_station'].apply(dirty_stations)

In [32]:
# 2. Time Travel (Logical Error): Departure date BEFORE search date

mask = df_searches.index > (NUM_SEARCHES * 0.98)
df_searches.loc[mask, 'depature_date'] = df_searches.loc[mask,'timestamp'] - pd.to_timedelta(5,unit='D')

In [33]:
# 3. Mixed Timestamp Formats (ISO String vs Datetime Obj)

def dirty_time(ts):
  if np.random.rand() < 0.15:
    return ts.strftime('%Y/%m/%d %H:%M:%S') # wrong separator
  return ts.isoformat()

df_searches['timestamp'] = df_searches['timestamp'].apply(dirty_time)

In [34]:
df_searches.to_csv('searches.csv',index=False)
print('searches.csv created')

searches.csv created


In [35]:
df_searches.head()

Unnamed: 0,search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,depature_date
0,fba4f473-c45c-44cb-a180-f1f0c573ec28,d09d81e5-e17a-416c-b37a-a69be092bd2e,2025/05/10 11:53:09,Nantes,Strasbourg,1,2025-05-30 11:53:09.419189,NaT
1,e979d7ef-1c14-4bd7-b19e-fbf76bb5455b,bfcade59-80f8-4618-9ccc-de44a74999f6,2025-01-18T02:45:56.746525,Marseille St-Charles,Nantes,1,2025-01-23 02:45:56.746525,NaT
2,529b14cb-0ed5-4ee0-9228-e4a84d40d61a,0f2b800f-9f10-433e-9998-06d95e07dcc1,2025-05-14T06:00:00.490315,Lyon Part-Dieu,Nantes,2,2025-05-21 06:00:00.490315,NaT
3,1b905016-8e6c-4ea4-b96c-b4b9e8b0710c,5eab057d-28fa-4bcb-b1ab-a08a2c69d0fe,2025-07-03T02:35:23.365211,Bordeaux St-Jean,Bordeaux St-Jean,3,2025-08-01 02:35:23.365211,NaT
4,10bbab91-5dde-45a9-8717-df2cf2985ab0,64f51332-15dd-467b-9ef1-f4a32ba13673,2025-10-31T14:05:16.469735,Lille Europe,Paris Gare de Lyon,1,2025-11-05 14:05:16.469735,NaT


# **Generate Bookings**

In [36]:
print(f'Generating {NUM_BOOKINGS} bookings')

# Randomly select searches that converted into bookings
converted_searches = np.random.choice(search_ids, size=NUM_BOOKINGS, replace=False)

bookings_data = {
    'booking_id': [str(uuid.uuid4()) for _ in range(NUM_BOOKINGS)],
    'search_id': converted_searches,
    'payment_status': [np.random.choice(['Success', 'Failed', 'Pending'], p=[0.85, 0.1, 0.05]) for _ in range(NUM_BOOKINGS)],
    'ticket_class': [np.random.choice(['1st Class', '2nd Class'], p=[0.2, 0.8]) for _ in range(NUM_BOOKINGS)],
    'amount_eur': np.round(np.random.uniform(25, 150, size=NUM_BOOKINGS), 2)
}

df_bookings = pd.DataFrame(bookings_data)

Generating 4000 bookings


# **Injecting Dirt (Bookings)**

In [37]:
# 1. Currency Symbols (Type Mismatch): Mix floats and strings like "€45.50"
def dirty_price(price):
    if np.random.rand() < 0.2:
        return f"€{price}"
    return price

df_bookings['amount_eur'] = df_bookings['amount_eur'].apply(dirty_price)

In [38]:
# 2. Orphan Bookings: Create bookings with a search_id that DOES NOT exist in searches.csv
# We generate 50 random orphan bookings
orphans = pd.DataFrame({
    'booking_id': [str(uuid.uuid4()) for _ in range(50)],
    'search_id': [str(uuid.uuid4()) for _ in range(50)], # These UUIDs won't match searches
    'payment_status': 'Success',
    'ticket_class': '2nd Class',
    'amount_eur': 50.0
})

df_bookings = pd.concat([df_bookings, orphans])

In [39]:
df_bookings.to_csv('bookings.csv', index=False)
print('bookings.csv created')

bookings.csv created


In [40]:
df_bookings.head()

Unnamed: 0,booking_id,search_id,payment_status,ticket_class,amount_eur
0,e80dc352-ee1b-4c10-b3eb-ce93bd1c7f23,2fc1ab31-20f9-4577-8bed-54479754f29c,Success,2nd Class,71.81
1,77b36a1f-fe38-4a8c-9af9-e7cff1b893ce,de0dbd1d-85f8-436b-b95f-ce4c0f826158,Success,2nd Class,140.79
2,c21dffac-e07b-4338-92df-95cff24d8426,2ab924c3-e678-45f6-b838-9248df70e1e1,Success,2nd Class,32.69
3,ccb7e082-734b-4644-9997-7bd3e48409d0,f4bb9c14-7546-4df8-87c1-91765e759d31,Success,2nd Class,60.48
4,06d439ac-a86b-46ec-aaaf-159bd69e9fc6,eb188485-97a9-4903-9521-b6e5d8c6ad72,Success,2nd Class,132.07
