In [1]:
import numpy as np
import csv

# Reference values sampled for the categorical Stop/Station and Feedback columns.
stops = [
    'Kuala Lumpur', 'Penang', 'Johor Bahru', 'Malacca', 'Langkawi', 'Ipoh',
    'Kota Kinabalu', 'Kuching', 'Georgetown', 'Cameron Highlands', 'Port Dickson',
]
feedbacks = [
    'Service was excellent', 'Seats were uncomfortable', 'Delay in departure',
    'Cleanliness was lacking', 'Driver was friendly', 'Bus arrived on time',
    'Train was overcrowded', 'Station was well-maintained', 'Route was confusing',
    'Poor signage at the station',
]

# Seed NumPy's global RNG so that re-running this cell regenerates the same rows.
np.random.seed(0)

# Number of data rows written to the CSV (the header row is extra).
num_rows = 1000

# --- Date and Time ---------------------------------------------------------
# np.arange excludes its stop value, so the stop is placed one day past the
# intended last date; otherwise 2023-12-31 could never be drawn.
date_pool = np.arange(
    np.datetime64('2022-01-01'),
    np.datetime64('2023-12-31') + np.timedelta64(1, 'D'),
)
dates = np.random.choice(date_pool, num_rows)
# "HH:MM" strings; hour and minute are drawn independently for every row.
times = np.array([
    f"{np.random.randint(0, 24):02d}:{np.random.randint(0, 60):02d}"
    for _ in range(num_rows)
])


def _random_vehicle_id():
    """Return a random vehicle identifier: 'BUS001'-'BUS999' or 'TRAIN01'-'TRAIN99'.

    A uniform draw picks the vehicle kind (about half buses, half trains);
    the number is then zero-padded to three digits (bus) or two (train).
    """
    if np.random.rand() < 0.5:
        return f"BUS{np.random.randint(1, 1000):03d}"
    return f"TRAIN{np.random.randint(1, 100):02d}"


# --- Stop/Station, Passenger_Count and Vehicle_ID --------------------------
stop_station = np.random.choice(stops, num_rows)
passenger_count = np.random.randint(10, 100, size=num_rows)  # 10..99 inclusive
vehicle_id = [_random_vehicle_id() for _ in range(num_rows)]

# --- Latitude and Longitude -------------------------------------------------
# Drawn uniformly and independently of the stop name, so a row's coordinates
# are not tied to its Stop/Station value.
latitude = np.random.uniform(1, 7, size=num_rows)
longitude = np.random.uniform(100, 120, size=num_rows)

# --- Temperature, Precipitation and Humidity --------------------------------
# NOTE(review): -10..39 degrees C is a very wide range for the stops listed
# above -- confirm that sub-zero temperatures are intended.
temperature = np.random.randint(-10, 40, size=num_rows)
precipitation = np.random.randint(0, 10, size=num_rows)  # 0..9
humidity = np.random.randint(50, 100, size=num_rows)     # 50..99

# --- Age_Group, Gender and Feedback -----------------------------------------
age_group = np.random.choice(['18-24', '25-40', '40-60'], num_rows)
gender = np.random.choice(['Male', 'Female'], num_rows)
feedback = np.random.choice(feedbacks, num_rows)

# --- Save to CSV --------------------------------------------------------------
header = ['Date', 'Time', 'Stop/Station', 'Passenger_Count', 'Vehicle_ID', 'Latitude', 'Longitude', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Age_Group', 'Gender', 'Feedback']
# Column sequences in the same order as `header`; zip(*columns) yields one row at a time.
columns = (dates, times, stop_station, passenger_count, vehicle_id, latitude,
           longitude, temperature, precipitation, humidity, age_group, gender,
           feedback)

# The header contains a non-ASCII character ('°'), so the encoding is pinned to
# UTF-8 instead of relying on the platform's locale default.
# newline='' lets the csv module control line endings itself.
with open('synthetic_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(zip(*columns))

print("Data saved to synthetic_data.csv")

Data saved to synthetic_data.csv


In [6]:
import numpy as np
import csv
import random

# Reference values sampled for the categorical Stop/Station and Feedback columns.
stops = [
    'Kuala Lumpur', 'Penang', 'Johor Bahru', 'Malacca', 'Langkawi', 'Ipoh',
    'Kota Kinabalu', 'Kuching', 'Georgetown', 'Cameron Highlands', 'Port Dickson',
]
# NOTE(review): 'Missing value', 'Duplicate' and 'Outlier' are sampled below like
# any other feedback text, so they appear as ordinary Feedback values in the
# output -- confirm that this is intended.
feedbacks = [
    'Service was excellent', 'Seats were uncomfortable', 'Delay in departure',
    'Cleanliness was lacking', 'Driver was friendly', 'Bus arrived on time',
    'Train was overcrowded', 'Station was well-maintained', 'Route was confusing',
    'Poor signage at the station', 'Missing value', 'Duplicate', 'Outlier',
]

# Seed both RNGs used in this cell (NumPy's global RNG and the stdlib `random`
# module) so that re-running the cell regenerates the same rows.
np.random.seed(0)
random.seed(0)

# Number of data rows written to the CSV (the header row is extra).
num_rows = 1000

# How many rows each kind of injected data-quality problem affects.
num_duplicates = 20
num_missing = 20
num_outliers = 10

# --- Date and Time ---------------------------------------------------------
# np.arange excludes its stop value, so the stop is placed one day past the
# intended last date; otherwise 2023-12-31 could never be drawn.
date_pool = np.arange(
    np.datetime64('2022-01-01'),
    np.datetime64('2023-12-31') + np.timedelta64(1, 'D'),
)
dates = np.random.choice(date_pool, num_rows)
# "HH:MM" strings; hour and minute are drawn independently for every row.
times = np.array([
    f"{np.random.randint(0, 24):02d}:{np.random.randint(0, 60):02d}"
    for _ in range(num_rows)
])


def _random_vehicle_id():
    """Return a random vehicle identifier: 'BUS001'-'BUS999' or 'TRAIN01'-'TRAIN99'.

    A uniform draw picks the vehicle kind (about half buses, half trains);
    the number is then zero-padded to three digits (bus) or two (train).
    """
    if np.random.rand() < 0.5:
        return f"BUS{np.random.randint(1, 1000):03d}"
    return f"TRAIN{np.random.randint(1, 100):02d}"


# --- Stop/Station, Passenger_Count and Vehicle_ID --------------------------
stop_station = np.random.choice(stops, num_rows)
passenger_count = np.random.randint(10, 150, size=num_rows)  # 10..149 before outliers
vehicle_id = [_random_vehicle_id() for _ in range(num_rows)]

# --- Latitude and Longitude -------------------------------------------------
# Drawn uniformly and independently of the stop name, so a row's coordinates
# are not tied to its Stop/Station value.
latitude = np.random.uniform(1, 7, size=num_rows)
longitude = np.random.uniform(100, 120, size=num_rows)

# --- Temperature, Precipitation and Humidity --------------------------------
temperature_base = np.random.randint(15, 30, size=num_rows)  # 15..29
precipitation = np.random.randint(0, 5, size=num_rows)       # 0..4
humidity = np.random.randint(50, 100, size=num_rows)         # 50..99

# Rows with any precipitation are made 5 degrees cooler than their base value.
temperature = np.where(precipitation > 0, temperature_base - 5, temperature_base)

# --- Age_Group, Gender and Feedback -----------------------------------------
age_group = np.random.choice(['18-24', '25-40', '40-60'], num_rows)
gender = np.random.choice(['Male', 'Female'], num_rows)
feedback = np.random.choice(feedbacks, num_rows)

# Column sequences in CSV order; used for row duplication and for writing the file.
columns = [dates, times, stop_station, passenger_count, vehicle_id, latitude,
           longitude, temperature, precipitation, humidity, age_group, gender,
           feedback]

# --- Introduce duplicates, missing values and outliers ----------------------
# A single draw of distinct row indices is split into four disjoint groups, so
# the injected problems can never overwrite one another.
anomaly_rows = random.sample(
    range(num_rows), 2 * num_duplicates + num_missing + num_outliers
)
duplicate_sources = anomaly_rows[:num_duplicates]
duplicate_indices = anomaly_rows[num_duplicates:2 * num_duplicates]
missing_value_indices = anomaly_rows[2 * num_duplicates:2 * num_duplicates + num_missing]
outlier_indices = anomaly_rows[2 * num_duplicates + num_missing:]

# Duplicates: overwrite each target row with a copy of its source row in every
# column. Re-drawing only the station name (as was done before) cannot produce a
# duplicate record, because the remaining columns of the row still differ.
for source_row, target_row in zip(duplicate_sources, duplicate_indices):
    for column in columns:
        column[target_row] = column[source_row]

# Missing values: flagged with a placeholder string in the Stop/Station column.
# stop_station is a fixed-width NumPy string array; the placeholder is shorter
# than the longest stop name, so it is stored without truncation.
stop_station[missing_value_indices] = 'Missing value'

# Outliers: passenger counts of 200..499, well above the regular 10..149 range.
passenger_count[outlier_indices] = np.random.randint(200, 500, size=num_outliers)

# --- Save to CSV --------------------------------------------------------------
header = ['Date', 'Time', 'Stop/Station', 'Passenger_Count', 'Vehicle_ID', 'Latitude', 'Longitude', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Age_Group', 'Gender', 'Feedback']

# The header contains a non-ASCII character ('°'), so the encoding is pinned to
# UTF-8 instead of relying on the platform's locale default.
# newline='' lets the csv module control line endings itself.
with open('synthetic_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(zip(*columns))

print("Data saved to synthetic_data.csv")


Data saved to synthetic_data.csv
