In [11]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Generic car plates
car_plates = ["YEM-076-B","BYD-065-A","AMG-003-A","YEM-065-A","YUP-765-A","FES-090-B","ZUX-230-F","PUR-203-C","LET-098-D","CUZ-304-F",
              "SOS-900-L","YEN-007-B","ISI-999-O","YET-404-L","LIL-065-F","AND-999-X","YES-545-Y","FER-543-C","TET-000-A","XES-511-A"]

# Generate random confidence scores with slight correlation
def generate_confidence_scores():
    pgie_conf = round(random.uniform(0.75, 0.98), 3)
    sgie_conf = round(min(max(pgie_conf + random.uniform(-0.05, 0.05), 0), 1), 3)
    return pgie_conf, sgie_conf

# Adjusted function to enforce IN-OUT sequencing
def generate_day_events(day, car_plate_status):
    num_events = random.randint(3, 5)  # Daily events for around 100/month
    events = []
    current_time = datetime.combine(day, datetime.min.time())

    for _ in range(num_events):
        # Randomize event timing (around the clock)
        event_time = current_time + timedelta(
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59)
        )

        # Pick a Car Plate and determine direction based on last status
        car_plate = random.choice(car_plates)
        direction = 'IN' if car_plate_status[car_plate] == 'OUT' else 'OUT'
        car_plate_status[car_plate] = direction  # Update car plate status

        # Generate confidence scores
        pgie_conf, sgie_conf = generate_confidence_scores()

        # Append event to list
        events.append({
            "timestamp": event_time,
            "cleaned_label": car_plate,
            "direction": direction,
            "avg_pgie_confidence": pgie_conf,
            "avg_sgie_confidence": sgie_conf
        })

    return events

# Generate car movement data over multiple days
def generate_random_data(num_days):
    start_date = datetime.now() - timedelta(days=num_days)
    all_events = []

    # Track the status of each railcar (initially OUT)
    car_plate_status = {car_plate: 'OUT' for car_plate in car_plates}

    # Generate events for each day
    for day_offset in range(num_days):
        current_day = start_date + timedelta(days=day_offset)
        day_events = generate_day_events(current_day.date(), car_plate_status)
        all_events.extend(day_events)

    # Create DataFrame and sort by timestamp
    df = pd.DataFrame(all_events)
    df = df.sort_values(by="timestamp").reset_index(drop=True)

    return df

# Generate random data for 30 days
data_large = generate_random_data(30)
data_large.head(), data_large.shape  # Show sample data and the total number of records


(            timestamp cleaned_label direction  avg_pgie_confidence  \
 0 2024-11-09 11:35:51     AMG-003-A        IN                0.892   
 1 2024-11-09 15:27:53     YEN-007-B        IN                0.827   
 2 2024-11-09 19:29:15     YEM-065-A        IN                0.783   
 3 2024-11-09 21:47:00     YES-545-Y        IN                0.771   
 4 2024-11-10 01:42:43     YEM-076-B        IN                0.935   
 
    avg_sgie_confidence  
 0                0.859  
 1                0.799  
 2                0.801  
 3                0.783  
 4                0.897  ,
 (112, 5))

In [12]:
import os

def generate_and_save_data(num_days=30, filename='car_plate_logs_3.csv'):
    """
    Generate random car movement data and save to CSV

    Parameters:
    num_days (int): Number of days to generate data for
    filename (str): Name of the output CSV file

    Returns:
    tuple: (DataFrame, str) - The generated DataFrame and the full path to the saved CSV file
    """
    # Generate the data
    df = generate_random_data(num_days=num_days)

    # Format timestamp for CSV
    df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # Get current directory
    current_dir = os.getcwd()

    # Create full file path
    file_path = os.path.join(current_dir, filename)

    # Save to CSV
    df.to_csv(file_path, index=False)

    return df, file_path

# Example usage:
df, saved_path = generate_and_save_data(num_days=30, filename='car_plate_logs_3.csv')
print(f"Data has been saved to: {saved_path}")

#Quick verification of the saved data:
df_verification = pd.read_csv(saved_path)
print("\nFirst few rows of the saved data:")
print(df_verification.head())

Data has been saved to: /content/car_plate_logs_3.csv

First few rows of the saved data:
             timestamp cleaned_label direction  avg_pgie_confidence  \
0  2024-11-09 19:23:47     YEM-065-A        IN                0.786   
1  2024-11-09 20:44:19     AND-999-X       OUT                0.930   
2  2024-11-09 21:04:49     AND-999-X        IN                0.922   
3  2024-11-10 05:38:49     YEN-007-B        IN                0.972   
4  2024-11-10 07:18:21     YUP-765-A        IN                0.969   

   avg_sgie_confidence  
0                0.831  
1                0.903  
2                0.932  
3                0.979  
4                0.936  
