In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import faker
import random

# Set a seed for reproducibility: the same value is fed to Python's `random`,
# NumPy's global RNG and Faker so repeated runs generate the same table.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)

# Initialize Faker generator with seed
# NOTE(review): `fake` is created before `Faker.seed` is called; presumably the
# class-level seed also applies to already-created instances -- confirm
# against the Faker documentation.
fake = faker.Faker()
faker.Faker.seed(seed_value)

# Load race tracks data. The generator function below reads the `location`
# and `track` columns of this table.
# NOTE(review): absolute path; looks like a Colab working directory -- adjust
# when running elsewhere.
race_tracks = pd.read_csv('/content/circuits.csv')

# Function to generate random data with systematic positions and dates
# Format a duration in seconds as 'MM:SS.cc', rounded to whole centiseconds.
def _format_lap_time(seconds):
    # Work in integer centiseconds: truncating `(x % 1) * 100` loses a
    # centisecond to float error (e.g. 60.29 -> 28.999... -> 28).
    centis = int(round(seconds * 100))
    minutes, remainder = divmod(centis, 6000)
    secs, hundredths = divmod(remainder, 100)
    return f'{minutes:02d}:{secs:02d}.{hundredths:02d}'


def generate_data_with_positions_and_dates(num_rows, race_tracks, fake_source=None):
    """Generate a synthetic race-results table with positions and dates.

    Rows are distributed round-robin over the circuits in ``race_tracks``,
    sorted by circuit name and lap time, and then given a per-circuit date
    and a finishing position.

    Args:
        num_rows: Number of result rows to generate.
        race_tracks: DataFrame with a ``location`` and a ``track`` column.
        fake_source: Optional object providing ``first_name()``,
            ``last_name()`` and ``company()``. Defaults to the module-level
            ``fake`` generator.

    Returns:
        DataFrame with the columns ``Racer ID``, ``FirstName``, ``LastName``,
        ``Location``, ``CircuitName``, ``TeamName``, ``Sponsor``, ``LapTime``
        (``MM:SS.cc`` string), ``Date`` and ``Position``, sorted by circuit
        name and lap time. The original row index is kept.

    Raises:
        ValueError: If rows are requested but ``race_tracks`` is empty.
    """
    num_tracks = len(race_tracks)
    if num_rows > 0 and num_tracks == 0:
        raise ValueError('race_tracks must contain at least one circuit')

    names = fake if fake_source is None else fake_source

    # Pull the lookup columns out once instead of calling .iloc per row.
    locations = race_tracks['location'].tolist()
    circuits = race_tracks['track'].tolist()

    # The order of the random draws below is kept stable so that seeded runs
    # keep producing the same values.
    # NOTE: racer IDs are independent draws and are not guaranteed unique.
    data = {
        'Racer ID': np.random.randint(200000, 299999, size=num_rows),
        'FirstName': [names.first_name() for _ in range(num_rows)],
        'LastName': [names.last_name() for _ in range(num_rows)],
        'Location': [locations[i % num_tracks] for i in range(num_rows)],
        'CircuitName': [circuits[i % num_tracks] for i in range(num_rows)],
        'TeamName': [names.company() for _ in range(num_rows)],
        'Sponsor': [names.company() for _ in range(num_rows)],
        'LapTime': np.round(np.random.uniform(60, 600, size=num_rows), 2)
    }

    # Create a DataFrame from the generated data
    df = pd.DataFrame(data)

    # Sort the DataFrame by 'CircuitName', 'LapTime'
    df = df.sort_values(by=['CircuitName', 'LapTime'], ascending=[True, True])

    # One date per circuit: the k-th circuit (in sorted name order) gets
    # 2013-01-01 + k days. Deriving the date from the group number keeps it
    # correct when circuits have different row counts, i.e. when num_rows is
    # not a multiple of the number of circuits.
    circuit_number = df.groupby('CircuitName').ngroup()
    df['Date'] = pd.to_timedelta(circuit_number, unit='D') + pd.Timestamp(datetime(2013, 1, 1))

    # Assign positions within each circuit group based on LapTime (rows are
    # already sorted by lap time). NOTE: positions wrap back to 1 after 10,
    # so circuits with more than ten rows contain repeated positions.
    df['Position'] = df.groupby('CircuitName').cumcount() % 10 + 1

    # Convert 'LapTime' to string format
    df['LapTime'] = [_format_lap_time(x) for x in df['LapTime']]

    return df

# Build the synthetic results table (770 rows spread over the loaded circuits)
num_rows = 770
data_with_positions_and_dates = generate_data_with_positions_and_dates(
    num_rows=num_rows,
    race_tracks=race_tracks,
)

# Show the generated table
print(data_with_positions_and_dates)

# Write the table to disk as UTF-8 CSV, overwriting any existing file and
# leaving out the DataFrame index
csv_file_path = '/content/UpdatedDataBase.csv'
data_with_positions_and_dates.to_csv(
    csv_file_path,
    mode='w',
    encoding='utf-8',
    header=True,
    index=False,
)

# Report where the file was written
print(f'CSV file saved at: {csv_file_path}')


     Racer ID FirstName LastName        Location CircuitName  \
368    200301     Aaron  Schultz          Berlin        AVUS   
599    244811     Stacy    Potts          Berlin        AVUS   
522    209823     Debra    Ponce          Berlin        AVUS   
137    282948  Kathleen   Garner          Berlin        AVUS   
60     289475  Veronica   Sawyer          Berlin        AVUS   
..        ...       ...      ...             ...         ...   
193    298098    Sandra  Bentley  Heusden-Zolder      Zolder   
501    295981   Anthony    Mills  Heusden-Zolder      Zolder   
347    205600    Joshua   Farmer  Heusden-Zolder      Zolder   
578    233496      Ryan   Murray  Heusden-Zolder      Zolder   
732    271726   Sabrina   Thomas  Heusden-Zolder      Zolder   

                         TeamName                    Sponsor   LapTime  \
368  Hernandez, Jenkins and Parks                Jones Group  01:13.70   
599      Reeves, Parks and Little              Fernandez Inc  02:33.90   
522      