In [1]:
import pandas as pd
import numpy as np

# --- Reproducibility: pin NumPy's global RNG so each run draws the same data ---
np.random.seed(42)

# --- Class sizes (failed / healthy axles); the total is their sum ---
n_fail = 1200
n_not_fail = 1800
n_total = n_fail + n_not_fail

# --- Function to generate realistic data ---
def generate_group(size, fail_flag, id_start=1000):
    """Build one synthetic group of rear-axle records (all failed or all healthy).

    Every numeric feature is an independent draw from a normal distribution
    whose mean depends on ``fail_flag``; categorical features are sampled with
    class-specific probabilities. Draws are not clipped, so values far from
    the mean (e.g. a negative ``Runout_mm``) can occur.

    All randomness comes from NumPy's global RNG (``np.random``), so seed it
    before calling if reproducible output is needed.

    Args:
        size: Number of rows to generate.
        fail_flag: Truthy for the failed group (``Failure`` = 1), falsy for
            the healthy group (``Failure`` = 0).
        id_start: Number used for the first ``Vehicle_ID`` (``"V<id_start>"``);
            following rows count up from it. Pass a different value per group
            when groups will be combined, otherwise their IDs overlap.

    Returns:
        A ``pandas.DataFrame`` with ``size`` rows and 42 columns.
    """
    levels = ["Low", "Medium", "High"]

    def gauss(fail_mean, ok_mean, std):
        # Normal samples centred on the mean of the requested group.
        return np.random.normal(fail_mean if fail_flag else ok_mean, std, size)

    def pick(options, fail_probs, ok_probs):
        # Categorical samples using the probabilities of the requested group.
        return np.random.choice(options, size, p=fail_probs if fail_flag else ok_probs)

    # NOTE: the dict values are evaluated top to bottom, and each one consumes
    # the global RNG. Reordering the entries changes the data produced for a
    # given seed.
    return pd.DataFrame({
        "Vehicle_ID": [f"V{id_start + i}" for i in range(size)],
        "Load_Torque(Nm)": gauss(950, 650, 100),
        "Speed(RPM)": gauss(1800, 1400, 200),
        "Lubricant_Temp(°C)": gauss(110, 85, 10),
        "Oil_Viscosity(cSt)": gauss(60, 85, 10),
        "Axle_Age(km)": gauss(180000, 90000, 20000),
        "Road_Type": pick(["Highway", "Rough", "Off-road"],
                          [0.3, 0.3, 0.4], [0.5, 0.3, 0.2]),
        "Shock_Load": pick(levels, [0.2, 0.3, 0.5], [0.5, 0.3, 0.2]),
        "Manufacturing_Batch": np.random.randint(100, 200, size),
        "Maintenance_Gap(km)": gauss(25000, 12000, 3000),
        "Noise_Level(dB)": gauss(92, 72, 5),
        "Vibration(mm/s)": gauss(12, 5, 2),
        "Temperature_Rise(°C)": gauss(45, 25, 5),
        "Torque_Speed_Ratio": gauss(0.7, 0.5, 0.05),
        "Aging_Factor": gauss(0.8, 0.4, 0.1),
        "Thermal_Stress_Index": gauss(0.85, 0.45, 0.1),
        "Ambient_Temp(°C)": gauss(38, 28, 5),
        "Humidity(%)": gauss(75, 55, 10),
        "Dust_Level": pick(levels, [0.2, 0.3, 0.5], [0.5, 0.3, 0.2]),
        "Altitude(m)": gauss(200, 500, 100),
        "Rain_Exposure": pick(levels, [0.2, 0.3, 0.5], [0.5, 0.3, 0.2]),
        "Road_Roughness(m/s²)": gauss(2.2, 1.1, 0.3),
        "Vehicle_Load(kg)": gauss(15000, 9500, 1500),
        "Brake_Usage_Freq(/100km)": gauss(40, 20, 5),
        "Oil_Contamination_Index": gauss(0.8, 0.3, 0.1),
        "Lubrication_Efficiency": gauss(0.4, 0.8, 0.1),
        "Shock_Severity_Index": gauss(0.75, 0.4, 0.1),
        "Cumulative_Damage_Index": gauss(0.8, 0.3, 0.1),
        "Load_Factor": pick(levels, [0.1, 0.3, 0.6], [0.5, 0.3, 0.2]),
        "Module_mm": gauss(7.5, 9.0, 0.5),
        "Pinion_Teeth": np.random.randint(8, 11, size),
        "Face_Width_mm": gauss(60, 80, 5),
        "Pressure_Angle_deg": gauss(23, 20, 1.5),
        "Profile_Shift": gauss(0.05, 0.2, 0.05),
        "Contact_Ratio": gauss(1.2, 1.6, 0.1),
        "Backlash_mm": gauss(0.35, 0.20, 0.05),
        "Material_Grade": pick(levels, [0.5, 0.3, 0.2], [0.2, 0.3, 0.5]),
        "Case_Depth_mm": gauss(0.9, 1.4, 0.2),
        "Surface_Hardness_HRC": gauss(45, 58, 3),
        "Runout_mm": gauss(0.08, 0.03, 0.02),
        "Housing_Stiffness_index": gauss(0.5, 0.8, 0.1),
        "Failure": np.full(size, 1 if fail_flag else 0),
    })

# --- Create failed and not-failed groups ---
df_fail = generate_group(n_fail, True)
df_not_fail = generate_group(n_not_fail, False)

# Both calls above number their rows from V1000, so the two groups would share
# their first n_fail IDs. Renumber the healthy group to continue right after
# the failed one, keeping Vehicle_ID unique across the combined dataset.
# This consumes no random numbers, so every other column is unaffected.
df_not_fail["Vehicle_ID"] = [
    f"V{1000 + n_fail + i}" for i in range(len(df_not_fail))
]

# --- Combine & shuffle ---
# Shuffle all rows so the two classes are interleaved, then rebuild a clean
# 0..N-1 index.
df_final = pd.concat([df_fail, df_not_fail]).sample(frac=1).reset_index(drop=True)

# --- Save to CSV ---
output_path = "rear_axle_failure_realistic_v2.csv"
df_final.to_csv(output_path, index=False)

print(f"✅ Dataset generated and saved as '{output_path}' with {df_final.shape[0]} rows and {df_final.shape[1]} columns.")

✅ Dataset generated and saved as 'rear_axle_failure_realistic_v2.csv' with 3000 rows and 42 columns.
