In [1]:
print("Hello World")

Hello World


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import glob
import os

In [23]:
# Step 1: Combine CSV files
data_dir = "./data/dirt-2/"  # Update with your directory
csv_files = glob.glob(os.path.join(data_dir, "*.csv"))
df_list = [pd.read_csv(file) for file in csv_files]
data = pd.concat(df_list, ignore_index=True)
print(f"Combined data shape: {data.shape}")

Combined data shape: (309220, 80)


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309220 entries, 0 to 309219
Data columns (total 80 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Angle                 288986 non-null  float64
 1    CurrentLapTime       309220 non-null  float64
 2    Damage               309220 non-null  int64  
 3    DistanceFromStart    309220 non-null  float64
 4    DistanceCovered      309220 non-null  float64
 5    FuelLevel            309220 non-null  float64
 6    gear_drop            309220 non-null  int64  
 7    LastLapTime          309220 non-null  float64
 8    Opponent_1           309220 non-null  float64
 9   Opponent_2            309220 non-null  float64
 10  Opponent_3            309220 non-null  float64
 11  Opponent_4            309220 non-null  float64
 12  Opponent_5            309220 non-null  float64
 13  Opponent_6            309220 non-null  float64
 14  Opponent_7            309220 non-null  float64
 15  

In [25]:
# Step 2: Drop incorrect 'drop_gear' columns
drop_gear_cols = [col for col in data.columns if 'gear_drop' in col.lower()]  # Catches 'drop_gear', 'drop_gear_1', etc.
if drop_gear_cols:
    data = data.drop(columns=drop_gear_cols)
    print(f"Dropped columns: {drop_gear_cols}")
else:
    print("No 'drop_gear' columns found.")
print(f"Shape after dropping drop_gear columns: {data.shape}")


Dropped columns: [' gear_drop', 'gear_drop']
Shape after dropping drop_gear columns: (309220, 78)


In [26]:
# Step 3: Drop opponent-related columns
opponent_cols = [col for col in data.columns if col.startswith('Opponent_')]  # Adjust based on column names
data = data.drop(columns=opponent_cols)
print(f"Shape after dropping opponent columns: {data.shape}")


Shape after dropping opponent columns: (309220, 43)


In [27]:
# Step 4: Check and handle null values
null_counts = data.isnull().sum()
print("Null values per column:\n", null_counts[null_counts > 0])
# Drop rows with nulls (or impute if preferred)
data = data.dropna()
print(f"Shape after handling nulls: {data.shape}")


Null values per column:
 Angle    20234
dtype: int64
Shape after handling nulls: (288986, 43)


In [33]:
# Step 5: Compute mean and standard deviation, normalize features
# Separate features (sensors) and labels (actuators)
sensor_cols = [col for col in data.columns if col not in ['Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering']]
actuator_cols = [col for col in data.columns if col in ['Acceleration', 'Braking', 'Steering']]
features = data[sensor_cols]
all_actuator_cols = [col for col in data.columns if col in ['Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering']]
labels = data[all_actuator_cols]

# Calculate mean and std for sensors and continuous actuators
feature_stats = features.describe().loc[['mean', 'std']].T
label_stats = labels[actuator_cols].describe().loc[['mean', 'std']].T
print("Feature stats:\n", feature_stats)
print("Label stats:\n", label_stats)

Feature stats:
                              mean          std
Angle                   -0.000640     0.282541
 CurrentLapTime         49.227567    29.028026
 Damage               2823.975051  2339.246312
 DistanceFromStart     801.164121   455.878970
 DistanceCovered      2891.367966  1700.453458
 FuelLevel              53.366001     0.905742
 LastLapTime            70.338981    45.912840
 Opponent_1            175.131463    58.672077
RacePosition             2.162219     1.595496
 RPM                  5476.455812  1541.549847
 SpeedX                 59.128886    28.477126
 SpeedY                 -0.025914     5.588287
 SpeedZ                 -0.013730     4.671972
 Track_1                 5.067960     3.448218
Track_2                  5.835095     5.886011
Track_3                  7.683733     6.994096
Track_4                 11.181911    11.890595
Track_5                 16.910797    15.476984
Track_6                 19.790101    16.120858
Track_7                 24.688684    18.8962

In [34]:
# Step 6: Check statistics and distributions
# Continuous features and actuators
continuous_cols = sensor_cols + actuator_cols
stats = data[continuous_cols].describe().loc[['mean', 'std', 'min', 'max']].T
print("Statistics for continuous features and actuators:\n", stats)
# Discrete outputs (Gear, Clutch)
print("Gear value counts:\n", data['Gear'].value_counts())
print("Clutch value counts:\n", data['Clutch'].value_counts())

Statistics for continuous features and actuators:
                              mean          std          min          max
Angle                   -0.000640     0.282541    -3.129000      3.13044
 CurrentLapTime         49.227567    29.028026    -0.982000    115.03200
 Damage               2823.975051  2339.246312     0.000000   8438.00000
 DistanceFromStart     801.164121   455.878970     0.006884   1760.94000
 DistanceCovered      2891.367966  1700.453458     0.000000   6563.45000
 FuelLevel              53.366001     0.905742    51.817200     55.00000
 LastLapTime            70.338981    45.912840     0.000000    107.36000
 Opponent_1            175.131463    58.672077     3.470930    200.00000
RacePosition             2.162219     1.595496     1.000000      7.00000
 RPM                  5476.455812  1541.549847  1151.920000  10015.40000
 SpeedX                 59.128886    28.477126   -78.060800    122.20200
 SpeedY                 -0.025914     5.588287   -76.869300     88.68200


In [35]:
# Step 7: Check for constant columns
constant_cols = [col for col in sensor_cols if data[col].std() == 0]
if constant_cols:
    print(f"Constant columns to drop: {constant_cols}")
    features = features.drop(columns=constant_cols)
    sensor_cols = [col for col in sensor_cols if col not in constant_cols]
print(f"Shape after dropping constant columns: {features.shape}")

Shape after dropping constant columns: (288986, 39)


In [36]:
# Step 8: Normalize continuous features and actuators
features_normalized = features.copy()
labels_normalized = labels.copy()
for col in sensor_cols:
    features_normalized[col] = (features[col] - features[col].mean()) / features[col].std()
for col in actuator_cols:
    labels_normalized[col] = (labels[col] - labels[col].mean()) / labels[col].std()
# Gear and Clutch remain unnormalized
data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

In [37]:
# Step 10: Split into train/validation/test
train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# Step 11: Save preprocessed data
train_data.to_csv("./data/dirt-2/train_data.csv", index=False)
val_data.to_csv("./data/dirt-2/val_data.csv", index=False)
test_data.to_csv("./data/dirt-2/test_data.csv", index=False)
print("./data/dirt-2/Preprocessed data saved.")


Train shape: (231188, 43), Validation shape: (28899, 43), Test shape: (28899, 43)
./data/dirt-2/Preprocessed data saved.


In [None]:


# # Step 8: Check for outliers (optional, based on stats)
# # Example: Clip features to [-3, 3] std deviations
# for col in sensor_cols + actuator_cols:
#     mean, std = data[col].mean(), data[col].std()
#     data_normalized[col] = data_normalized[col].clip(lower=mean-3*std, upper=mean+3*std)

# data_normalized.to_csv("./data/dirt-2/dirt-2_data.csv", index=False)

# Step 9: Split into train/validation/test



# # Normalize features and continuous actuators
# features_normalized = (features - features.mean()) / features.std()
# labels_normalized = labels.copy()
# labels_normalized[actuator_cols] = (labels[actuator_cols] - labels[actuator_cols].mean()) / labels[actuator_cols].std()
# # Gear remains unnormalized (if discrete)

# # Combine normalized features and labels
# data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

# # Step 5: Split into train/validation/test
# train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
# print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# # Step 6: Save preprocessed data
# train_data.to_csv("train_data.csv", index=False)
# val_data.to_csv("val_data.csv", index=False)
# test_data.to_csv("test_data.csv", index=False)
# print("Preprocessed data saved.")

Combined data shape: (309220, 80)


KeyError: "['gear_drop'] not found in axis"