# In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Models offered by each brand. Insertion order of the keys is significant:
# it determines the order of `vehicle_brands` below.
vehicle_models = dict(
    [
        ('Toyota', ['Camry', 'Corolla', 'Rav4', 'Prius']),
        ('Honda', ['Accord', 'Civic', 'CR-V', 'Pilot']),
        ('Ford', ['F-150', 'Escape', 'Focus', 'Explorer']),
        ('Chevrolet', ['Silverado', 'Equinox', 'Malibu', 'Traverse']),
        ('Volkswagen', ['Jetta', 'Passat', 'Tiguan', 'Atlas']),
        ('BMW', ['3 Series', '5 Series', 'X3', 'X5']),
        ('Mercedes-Benz', ['C-Class', 'E-Class', 'GLC', 'GLE']),
        ('Audi', ['A4', 'A6', 'Q5', 'Q7']),
        ('Nissan', ['Altima', 'Rogue', 'Sentra', 'Pathfinder']),
        ('Hyundai', ['Elantra', 'Tucson', 'Santa Fe', 'Sonata']),
    ]
)

# Brand names, in the same order as the keys of `vehicle_models`.
vehicle_brands = list(vehicle_models)

# Map each service/repair type to its reliability-impact label.
# The table is written grouped by impact level; the resulting dict keeps the
# services in the order listed here (High, then Moderate, then Low).
service_types = {
    service: impact
    for impact, services in (
        ('High', (
            'Transmission Diagnostics/Repair/Replacement',
            'Engine Diagnostics/Repair/Replacement',
            'Multiple Unscheduled Repairs',
            'Electrical System Repairs',
        )),
        ('Moderate', (
            'Suspension System Repairs',
            'Brake System Repairs',
            'Cooling System Repairs',
        )),
        ('Low', (
            'Routine Maintenance Services',
            'Wear-and-Tear Replacements',
        )),
    )
    for service in services
}

def generate_synthetic_data(num_records, *, seed=None, miles_per_year=15000):
    """Generate a DataFrame of random vehicle service records.

    Every record draws, in this order: a brand from ``vehicle_brands``, a
    model from ``vehicle_models[brand]``, a manufacture year in 2010-2020
    (inclusive), a mileage scaled by the vehicle's age, a service type from
    the keys of ``service_types``, a service date, and two independent random
    0/1 flags (``reliability`` and ``scheduled``).

    Args:
        num_records: Number of rows to generate.
        seed: Optional seed. When given, a private ``np.random.RandomState``
            is used so the output is reproducible for a fixed current year.
            When ``None`` the global NumPy random state is used.
        miles_per_year: Assumed average mileage per year of vehicle age.
            Mileage is drawn from ``[age * miles_per_year,
            (age + 1) * miles_per_year)``. Must be positive.

    Returns:
        pandas.DataFrame with columns ``Vehicle_Brand``, ``Vehicle_Model``,
        ``Year_of_Manufacture``, ``Mileage``, ``Type_of_Service``,
        ``Service_Date``, ``scheduled`` and ``reliability``.

    Raises:
        ValueError: If ``miles_per_year`` is not positive.
    """
    if miles_per_year <= 0:
        raise ValueError("miles_per_year must be positive")

    # The np.random module and RandomState expose the same randint/choice API,
    # so either can serve as the random source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Loop invariants: the current year and the list of service names.
    current_year = datetime.now().year
    service_names = list(service_types)

    data = []
    for _ in range(num_records):
        # Randomly select vehicle brand and model
        brand = rng.choice(vehicle_brands)
        model = rng.choice(vehicle_models[brand])

        # Randomly select year of manufacture between 2010 and 2020
        year_of_manufacture = rng.randint(2010, 2021)

        # Age in whole years; clamped so a clock set before the manufacture
        # year cannot yield negative mileage or an empty randint range.
        age = max(current_year - year_of_manufacture, 0)

        # Mileage grows with age at `miles_per_year` miles per year.
        mileage = rng.randint(low=age * miles_per_year,
                              high=(age + 1) * miles_per_year)

        # Randomly select type of service/repair
        service_type = rng.choice(service_names)

        # Service date: 1 January of the manufacture year plus a random
        # offset below 365 * age days. The upper bound is kept at >= 1
        # because randint requires low < high (age == 0 would otherwise fail).
        offset_days = int(rng.randint(0, max(365 * age, 1)))
        service_date = datetime(year_of_manufacture, 1, 1) + timedelta(days=offset_days)

        reliability = rng.randint(0, 2)

        scheduled_or_unscheduled = rng.randint(0, 2)

        data.append([brand, model, year_of_manufacture, mileage, service_type,
                     service_date, scheduled_or_unscheduled, reliability])

    df = pd.DataFrame(data, columns=['Vehicle_Brand', 'Vehicle_Model', 'Year_of_Manufacture', 'Mileage',
                                     'Type_of_Service', 'Service_Date', 'scheduled', 'reliability'])
    return df

# Build a 1000-row synthetic dataset and keep it in `synthetic_data`.
synthetic_data = generate_synthetic_data(1000)

# Write the dataset to CSV in the working directory, leaving out the index.
synthetic_data.to_csv(
    path_or_buf='synthetic_vehicle_reliability_data.csv',
    index=False,
)

# Report on stdout once generation and export have finished.
print("Synthetic data generated and exported successfully.")


# Output: Synthetic data generated and exported successfully.
