In [1]:
# Kernel smoke test: prints a fixed greeting; nothing later in the notebook reads it.
print("Hello World")

Hello World


In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import glob
import os
import joblib

In [3]:
# Step 1: Combine CSV files
data_dir = "./data/dirt-2/"  # Update with your directory

# The save cells further down write their splits into this same directory.
# They must be skipped here, otherwise a re-run would fold already-preprocessed
# rows back into the raw data.
generated_csvs = {"train_data.csv", "val_data.csv", "test_data.csv"}

# sorted() makes the concatenation order (and therefore the seeded split later on)
# independent of the order in which the filesystem lists the files.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(data_dir, "*.csv"))
    if os.path.basename(path) not in generated_csvs
)
if not csv_files:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No raw CSV files found in {data_dir!r}")

df_list = [pd.read_csv(file) for file in csv_files]
data = pd.concat(df_list, ignore_index=True)
print(f"Combined data shape: {data.shape}")

Combined data shape: (473051, 80)


In [None]:
# Inspect the freshly combined frame: column names, non-null counts and dtypes.
# The recorded listing shows several column names carrying a leading space.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473051 entries, 0 to 473050
Data columns (total 80 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Angle                 452818 non-null  float64
 1    CurrentLapTime       473051 non-null  float64
 2    Damage               473051 non-null  int64  
 3    DistanceFromStart    473051 non-null  float64
 4    DistanceCovered      473051 non-null  float64
 5    FuelLevel            473051 non-null  float64
 6    gear_drop            473051 non-null  int64  
 7    LastLapTime          473051 non-null  float64
 8    Opponent_1           473051 non-null  float64
 9   Opponent_2            473051 non-null  float64
 10  Opponent_3            473051 non-null  float64
 11  Opponent_4            473051 non-null  float64
 12  Opponent_5            473051 non-null  float64
 13  Opponent_6            473051 non-null  float64
 14  Opponent_7            473051 non-null  float64
 15  

In [5]:
# Column headers read from the CSVs carry stray surrounding whitespace; trim it
# once here so later cells can address columns by their plain names.
data.columns = data.columns.map(str.strip)

In [6]:
# Re-inspect the schema after stripping whitespace from the column names.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473051 entries, 0 to 473050
Data columns (total 80 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  int64  
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      473051 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   gear_drop            473051 non-null  int64  
 7   LastLapTime          473051 non-null  float64
 8   Opponent_1           473051 non-null  float64
 9   Opponent_2           473051 non-null  float64
 10  Opponent_3           473051 non-null  float64
 11  Opponent_4           473051 non-null  float64
 12  Opponent_5           473051 non-null  float64
 13  Opponent_6           473051 non-null  float64
 14  Opponent_7           473051 non-null  float64
 15  Opponent_8       

In [7]:
# Step 2: Drop the 'gear_drop' columns
# Substring match on the lower-cased name, so suffixed variants such as
# 'gear_drop_1' are caught as well. After the whitespace strip above, the same
# name can occur more than once; DataFrame.drop removes every matching column.
drop_gear_cols = [col for col in data.columns if 'gear_drop' in col.lower()]
if drop_gear_cols:
    data = data.drop(columns=drop_gear_cols)
    print(f"Dropped columns: {drop_gear_cols}")
else:
    print("No 'gear_drop' columns found.")
print(f"Shape after dropping gear_drop columns: {data.shape}")


Dropped columns: ['gear_drop', 'gear_drop']
Shape after dropping drop_gear columns: (473051, 78)


In [8]:
# Schema check after removing the gear_drop columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473051 entries, 0 to 473050
Data columns (total 78 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  int64  
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      473051 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   LastLapTime          473051 non-null  float64
 7   Opponent_1           473051 non-null  float64
 8   Opponent_2           473051 non-null  float64
 9   Opponent_3           473051 non-null  float64
 10  Opponent_4           473051 non-null  float64
 11  Opponent_5           473051 non-null  float64
 12  Opponent_6           473051 non-null  float64
 13  Opponent_7           473051 non-null  float64
 14  Opponent_8           473051 non-null  float64
 15  Opponent_9       

In [9]:
# Step 3: Drop opponent-related columns
# Every column whose name begins with 'Opponent_' is removed in one go.
is_opponent = data.columns.str.startswith('Opponent_')
opponent_cols = list(data.columns[is_opponent])
data = data.drop(columns=opponent_cols)
print(f"Shape after dropping opponent columns: {data.shape}")


Shape after dropping opponent columns: (473051, 42)


In [10]:
# Schema check after removing the Opponent_* columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473051 entries, 0 to 473050
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  int64  
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      473051 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   LastLapTime          473051 non-null  float64
 7   RacePosition         473051 non-null  int64  
 8   RPM                  473051 non-null  float64
 9   SpeedX               473051 non-null  float64
 10  SpeedY               473051 non-null  float64
 11  SpeedZ               473051 non-null  float64
 12  Track_1              473051 non-null  float64
 13  Track_2              473051 non-null  float64
 14  Track_3              473051 non-null  float64
 15  Track_4          

In [11]:
# Drop columns that are not used as model inputs.
cols_to_drop = ['CurrentLapTime', 'Damage', 'DistanceFromStart', 'FuelLevel', 'RacePosition']
# Only ask pandas to drop names that are actually present, so the cell can be
# re-run (or run on a file set lacking some of these columns) without a KeyError.
present_cols = [col for col in cols_to_drop if col in data.columns]
data = data.drop(columns=present_cols)
print(f"Shape after dropping irrelevant columns: {data.shape}")


Shape after dropping opponent columns: (473051, 37)


In [15]:
# Schema check after the column drops.
# NOTE(review): the recorded output below (452818 rows, no nulls) was produced at
# execution count 15, i.e. after the dropna cell that appears further down
# (count 13), so it does not show the state at this position on a fresh run.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 452818 entries, 0 to 452817
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   DistanceCovered      452818 non-null  float64
 2   LastLapTime          452818 non-null  float64
 3   RPM                  452818 non-null  float64
 4   SpeedX               452818 non-null  float64
 5   SpeedY               452818 non-null  float64
 6   SpeedZ               452818 non-null  float64
 7   Track_1              452818 non-null  float64
 8   Track_2              452818 non-null  float64
 9   Track_3              452818 non-null  float64
 10  Track_4              452818 non-null  float64
 11  Track_5              452818 non-null  float64
 12  Track_6              452818 non-null  float64
 13  Track_7              452818 non-null  float64
 14  Track_8              452818 non-null  float64
 15  Track_9              4

In [13]:
# Step 4: Check and handle null values
# Report only the columns that actually contain missing entries.
null_counts = data.isna().sum()
columns_with_nulls = null_counts.loc[null_counts.gt(0)]
print("Null values per column:\n", columns_with_nulls)
# Any row with at least one missing value is discarded (no imputation).
data = data.dropna(axis=0, how='any')
print(f"Shape after handling nulls: {data.shape}")


Null values per column:
 Angle    20233
dtype: int64
Shape after handling nulls: (452818, 37)


In [18]:
# Distribution of the Gear column before any recoding.
data['Gear'].value_counts()

Gear
 1    171305
 3    156451
 2    124086
-1       976
Name: count, dtype: int64

In [22]:
# Recode the -1 gear value as 0 (presumably -1 is reverse — TODO confirm),
# then show the resulting distribution.
gear_is_minus_one = data['Gear'].eq(-1)
data['Gear'] = data['Gear'].mask(gear_is_minus_one, 0)
data['Gear'].value_counts()

Gear
1    171305
3    156451
2    124086
0       976
Name: count, dtype: int64

In [24]:
# Collapse the 0.5 clutch value onto 1 so the column becomes two-valued,
# then show the resulting distribution.
clutch_is_half = data['Clutch'].eq(0.5)
data['Clutch'] = data['Clutch'].mask(clutch_is_half, 1)
data['Clutch'].value_counts()

Clutch
0.0    449388
1.0      3430
Name: count, dtype: int64

In [25]:
# Schema check after the Gear and Clutch recoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 452818 entries, 0 to 452817
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   DistanceCovered      452818 non-null  float64
 2   LastLapTime          452818 non-null  float64
 3   RPM                  452818 non-null  float64
 4   SpeedX               452818 non-null  float64
 5   SpeedY               452818 non-null  float64
 6   SpeedZ               452818 non-null  float64
 7   Track_1              452818 non-null  float64
 8   Track_2              452818 non-null  float64
 9   Track_3              452818 non-null  float64
 10  Track_4              452818 non-null  float64
 11  Track_5              452818 non-null  float64
 12  Track_6              452818 non-null  float64
 13  Track_7              452818 non-null  float64
 14  Track_8              452818 non-null  float64
 15  Track_9              4

In [26]:
# Sanity check: after the dropna step no column should report missing values.
null_counts = data.isna().sum()
remaining_nulls = null_counts.loc[null_counts.gt(0)]
print("Null values per column:\n", remaining_nulls)

Null values per column:
 Series([], dtype: int64)


In [27]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
data.describe()

Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,...,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0
mean,0.01426,2784.643767,69.299157,5461.048438,59.506223,0.066478,-0.02027,5.060985,5.767817,7.616118,...,53.188911,52.936979,54.023318,53.733688,0.397465,0.367241,0.040568,0.007575,1.962886,0.042993
std,0.326901,1648.723946,46.65046,1571.438243,28.824144,5.438569,4.641137,3.465205,5.70568,6.934192,...,26.197792,26.259395,26.211874,26.242336,0.316794,0.381451,0.153659,0.086703,0.85502,0.352227
min,-3.13502,0.0,0.0,1151.92,-78.0608,-76.8693,-41.43,-1.0,-1.0,-1.0,...,-47.1882,-50.1071,-81.8893,-85.938,0.215737,0.0,0.0,0.0,0.0,-1.0
25%,-0.081747,1245.56,0.0,4083.0525,36.064125,-0.630206,-0.517247,3.268572,3.42583,4.388397,...,31.1874,30.870475,32.522325,32.172325,0.326347,0.0,0.0,0.0,1.0,-0.052042
50%,0.008609,2720.85,98.508,5619.32,56.06135,-0.006286,0.035794,5.01896,5.29627,6.691855,...,50.61415,50.08525,51.85465,51.42325,0.33528,0.207206,0.0,0.0,2.0,0.017757
75%,0.07821,4216.8,101.01,6708.15,80.4685,0.548359,0.924407,6.7066,7.218085,9.20091,...,72.7413,72.7313,73.2424,73.218775,0.348915,0.63643,0.0,0.0,3.0,0.156732
max,3.14066,6563.45,107.36,10015.4,122.913,88.682,29.6747,177.671,179.967,196.31,...,111.403,110.93,111.586,110.923,4.07852,1.0,1.0,1.0,3.0,1.0


In [32]:
# Frequency of each distinct Acceleration value.
# NOTE(review): the recorded output lists several separate rows that all print as
# 1.000000, i.e. values that differ only beyond the displayed precision.
data['Acceleration'].value_counts()
# print(data['Acceleration'].unique())

Acceleration
0.000000    140141
1.000000     76020
1.000000      1054
1.000000       365
1.000000       273
             ...  
0.417409         1
0.421365         1
0.426082         1
0.430043         1
0.431062         1
Name: count, Length: 141735, dtype: int64

In [33]:
epsilon = 1e-5

# Snap Acceleration values that lie within tolerance of 1.0 to exactly 1.0.
# np.isclose adds its default relative tolerance on top of `atol`, so the
# effective window around 1.0 is slightly wider than `epsilon` alone.
near_one = np.isclose(data['Acceleration'], 1.0, atol=epsilon)
data['Acceleration'] = data['Acceleration'].mask(near_one, 1.0)
data['Acceleration'].value_counts()

Acceleration
0.000000    140141
1.000000     81165
0.164978        51
0.133351        50
0.195356        49
             ...  
0.431062         1
0.429840         1
0.423254         1
0.423583         1
0.424608         1
Name: count, Length: 141418, dtype: int64

In [39]:
# Binarise the pedal columns: every non-zero value becomes 1.0, zeros are kept.
# The counts are printed explicitly because a notebook only displays the LAST bare
# expression of a cell, so a mid-cell `value_counts()` would be silently discarded.
for pedal_col in ('Acceleration', 'Braking'):
    data[pedal_col] = data[pedal_col].mask(data[pedal_col].ne(0), 1.0)
    print(data[pedal_col].value_counts())

Braking
0.0    392497
1.0     60321
Name: count, dtype: int64

In [35]:
# Summary statistics after the Acceleration clean-up.
# NOTE(review): recorded at execution count 35, before the cell above was last run
# (count 39); the Braking mean shown here is unchanged from the earlier describe(),
# so this output presumably predates the Braking binarisation.
data.describe()

Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,...,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0
mean,0.01426,2784.643767,69.299157,5461.048438,59.506223,0.066478,-0.02027,5.060985,5.767817,7.616118,...,53.188911,52.936979,54.023318,53.733688,0.397465,0.690514,0.040568,0.007575,1.962886,0.042993
std,0.326901,1648.723946,46.65046,1571.438243,28.824144,5.438569,4.641137,3.465205,5.70568,6.934192,...,26.197792,26.259395,26.211874,26.242336,0.316794,0.462282,0.153659,0.086703,0.85502,0.352227
min,-3.13502,0.0,0.0,1151.92,-78.0608,-76.8693,-41.43,-1.0,-1.0,-1.0,...,-47.1882,-50.1071,-81.8893,-85.938,0.215737,0.0,0.0,0.0,0.0,-1.0
25%,-0.081747,1245.56,0.0,4083.0525,36.064125,-0.630206,-0.517247,3.268572,3.42583,4.388397,...,31.1874,30.870475,32.522325,32.172325,0.326347,0.0,0.0,0.0,1.0,-0.052042
50%,0.008609,2720.85,98.508,5619.32,56.06135,-0.006286,0.035794,5.01896,5.29627,6.691855,...,50.61415,50.08525,51.85465,51.42325,0.33528,1.0,0.0,0.0,2.0,0.017757
75%,0.07821,4216.8,101.01,6708.15,80.4685,0.548359,0.924407,6.7066,7.218085,9.20091,...,72.7413,72.7313,73.2424,73.218775,0.348915,1.0,0.0,0.0,3.0,0.156732
max,3.14066,6563.45,107.36,10015.4,122.913,88.682,29.6747,177.671,179.967,196.31,...,111.403,110.93,111.586,110.923,4.07852,1.0,1.0,1.0,3.0,1.0


In [37]:
# Everything that is not one of the five actuator outputs is treated as a sensor input.
actuator_names = ('Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering')
sensor_cols = [col for col in data.columns if col not in actuator_names]

# Step 5: Normalize sensor features
# NOTE(review): the scaler is fitted on all rows, before outlier removal and before
# the train/val/test split further down, so its min/max include validation and
# test rows — confirm this is intended.
scaler = MinMaxScaler().fit(data[sensor_cols])
data[sensor_cols] = scaler.transform(data[sensor_cols])
print("Normalized sensor features to [0, 1]")

data.describe()

Normalized sensor features to [0, 1]


Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,...,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0,452818.0
mean,0.501823,0.424265,0.645484,0.486167,0.684502,0.464725,0.582377,0.033923,0.037398,0.043668,...,0.63293,0.639878,0.70248,0.709494,0.047046,0.690514,0.040568,0.007575,1.962886,0.042993
std,0.05209,0.251198,0.434524,0.177294,0.143422,0.032851,0.065272,0.019394,0.031529,0.035144,...,0.165191,0.163064,0.135479,0.133304,0.082012,0.462282,0.153659,0.086703,0.85502,0.352227
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.486525,0.189772,0.0,0.330698,0.56786,0.460516,0.575387,0.023891,0.024457,0.027309,...,0.494199,0.50285,0.59135,0.599968,0.028635,0.0,0.0,0.0,1.0,-0.052042
50%,0.500922,0.414546,0.917548,0.504023,0.667361,0.464285,0.583165,0.033687,0.034792,0.038984,...,0.616695,0.622169,0.691271,0.697758,0.030947,1.0,0.0,0.0,2.0,0.017757
75%,0.512013,0.642467,0.940853,0.626868,0.788806,0.467635,0.595663,0.043133,0.045412,0.0517,...,0.756218,0.762796,0.801817,0.808473,0.034477,1.0,0.0,0.0,3.0,0.156732
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0


In [41]:
# Distribution before clipping, for comparison with the one printed afterwards.
print(data['Steering'].value_counts())

# Step 6: Bound the continuous Steering label to [-1, 1]
data['Steering'] = data['Steering'].clip(lower=-1, upper=1)
print("Clipped continuous labels: Steering [-1, 1]")

print(data['Steering'].value_counts())

Steering
-1.000000    11171
 1.000000     7487
 0.800000     4383
-0.800000     2113
 0.097654      882
             ...  
 0.015377        1
 0.015378        1
 0.015378        1
 0.015378        1
 0.022383        1
Name: count, Length: 344183, dtype: int64
Clipped continuous labels: Steering [-1, 1]
Steering
-1.000000    11171
 1.000000     7487
 0.800000     4383
-0.800000     2113
 0.097654      882
             ...  
 0.015377        1
 0.015378        1
 0.015378        1
 0.015378        1
 0.022383        1
Name: count, Length: 344183, dtype: int64


In [42]:
# Schema check after scaling the sensor columns and clipping Steering.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 452818 entries, 0 to 452817
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                452818 non-null  float64
 1   DistanceCovered      452818 non-null  float64
 2   LastLapTime          452818 non-null  float64
 3   RPM                  452818 non-null  float64
 4   SpeedX               452818 non-null  float64
 5   SpeedY               452818 non-null  float64
 6   SpeedZ               452818 non-null  float64
 7   Track_1              452818 non-null  float64
 8   Track_2              452818 non-null  float64
 9   Track_3              452818 non-null  float64
 10  Track_4              452818 non-null  float64
 11  Track_5              452818 non-null  float64
 12  Track_6              452818 non-null  float64
 13  Track_7              452818 non-null  float64
 14  Track_8              452818 non-null  float64
 15  Track_9              4

In [43]:
# Step 7: Remove outliers
# A row is kept only if RPM, SpeedX, SpeedY and SpeedZ are all strictly below
# their own 99th percentile. All percentiles are taken on the unfiltered frame.
# NOTE(review): only the upper tail is trimmed; the lower tail of each column is
# kept as-is — confirm that is intended for the signed speed columns.
upper_tail_cols = ['RPM', 'SpeedX', 'SpeedY', 'SpeedZ']
outlier_mask = pd.Series(True, index=data.index)
for tail_col in upper_tail_cols:
    outlier_mask &= data[tail_col] < data[tail_col].quantile(0.99)
data = data[outlier_mask]
print(f"After removing outliers: {len(data)} rows")

After removing outliers: 434699 rows


In [44]:
# Step 9: Remove duplicates
# Rows identical across every column are collapsed to their first occurrence.
is_repeat = data.duplicated(keep='first')
data = data[~is_repeat]
print(f"After removing duplicates: {len(data)} rows")

After removing duplicates: 353067 rows


In [46]:
# Schema check after outlier and duplicate removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353067 entries, 0 to 452817
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                353067 non-null  float64
 1   DistanceCovered      353067 non-null  float64
 2   LastLapTime          353067 non-null  float64
 3   RPM                  353067 non-null  float64
 4   SpeedX               353067 non-null  float64
 5   SpeedY               353067 non-null  float64
 6   SpeedZ               353067 non-null  float64
 7   Track_1              353067 non-null  float64
 8   Track_2              353067 non-null  float64
 9   Track_3              353067 non-null  float64
 10  Track_4              353067 non-null  float64
 11  Track_5              353067 non-null  float64
 12  Track_6              353067 non-null  float64
 13  Track_7              353067 non-null  float64
 14  Track_8              353067 non-null  float64
 15  Track_9              3

In [47]:
# Gear distribution after filtering; Gear is one of the stratification keys in the split below.
data['Gear'].value_counts()

Gear
1    138240
3    118805
2     95465
0       557
Name: count, dtype: int64

In [48]:
# Clutch distribution after filtering; Clutch is one of the stratification keys in the split below.
data['Clutch'].value_counts()

Clutch
0.0    352299
1.0       768
Name: count, dtype: int64

In [None]:
# Step 10: Split data
# 80% goes to train; the held-out 20% is then divided 60/40 into validation and
# test (12% / 8% of the full set). Both splits use the same fixed seed.
# NOTE(review): only the first split is stratified on (Gear, Clutch); the
# val/test split is a plain random split.
strat_key = data[['Gear', 'Clutch']]
train, temp = train_test_split(data, test_size=0.2, stratify=strat_key, random_state=42)
val, test = train_test_split(temp, random_state=42, test_size=0.4)
print(f"Train: {len(train)} rows, Val: {len(val)} rows, Test: {len(test)} rows")

Train: 282453 rows, Val: 42368 rows, Test: 28246 rows


In [55]:
# Step 11: Save datasets
# Each split is written without its index column, in train -> val -> test order.
split_outputs = {
    './data/dirt-2/train_data.csv': train,
    './data/dirt-2/val_data.csv': val,
    './data/dirt-2/test_data.csv': test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)
print("Saved train_data.csv, val_data.csv, test_data.csv")

# Step 12: Save scaler
# Persist the fitted MinMaxScaler so the same sensor scaling can be re-applied later.
scaler_path = './data/dirt-2/scaler.pkl'
joblib.dump(scaler, scaler_path)
print("Saved scaler as scaler.pkl")

Saved train_data.csv, val_data.csv, test_data.csv
Saved scaler as scaler.pkl


In [None]:
# Step 5: Compute mean and standard deviation of features and labels
# NOTE(review): the recorded output below lists columns (e.g. CurrentLapTime,
# Opponent_1) that earlier cells drop, so it appears to come from an older run.

# Partition the columns: the five actuator outputs are labels, the rest are sensors.
is_actuator = data.columns.isin(['Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering'])
sensor_cols = data.columns[~is_actuator].tolist()
all_actuator_cols = data.columns[is_actuator].tolist()
# Subset of the actuators that this pipeline treats as continuous.
actuator_cols = data.columns[data.columns.isin(['Acceleration', 'Braking', 'Steering'])].tolist()

features = data[sensor_cols]
labels = data[all_actuator_cols]

# One row per column, with its mean and std.
feature_stats = features.describe().T[['mean', 'std']]
label_stats = labels[actuator_cols].describe().T[['mean', 'std']]
print("Feature stats:\n", feature_stats)
print("Label stats:\n", label_stats)

Feature stats:
                              mean          std
Angle                   -0.000640     0.282541
 CurrentLapTime         49.227567    29.028026
 Damage               2823.975051  2339.246312
 DistanceFromStart     801.164121   455.878970
 DistanceCovered      2891.367966  1700.453458
 FuelLevel              53.366001     0.905742
 LastLapTime            70.338981    45.912840
 Opponent_1            175.131463    58.672077
RacePosition             2.162219     1.595496
 RPM                  5476.455812  1541.549847
 SpeedX                 59.128886    28.477126
 SpeedY                 -0.025914     5.588287
 SpeedZ                 -0.013730     4.671972
 Track_1                 5.067960     3.448218
Track_2                  5.835095     5.886011
Track_3                  7.683733     6.994096
Track_4                 11.181911    11.890595
Track_5                 16.910797    15.476984
Track_6                 19.790101    16.120858
Track_7                 24.688684    18.8962

In [34]:
# Step 6: Check statistics and distributions
# Continuous columns (sensors plus continuous actuators): one row each with
# mean, std, min and max.
continuous_cols = [*sensor_cols, *actuator_cols]
stats = data[continuous_cols].describe().T[['mean', 'std', 'min', 'max']]
print("Statistics for continuous features and actuators:\n", stats)

# Discrete outputs (Gear, Clutch): class frequencies instead of moments.
gear_counts = data['Gear'].value_counts()
clutch_counts = data['Clutch'].value_counts()
print("Gear value counts:\n", gear_counts)
print("Clutch value counts:\n", clutch_counts)

Statistics for continuous features and actuators:
                              mean          std          min          max
Angle                   -0.000640     0.282541    -3.129000      3.13044
 CurrentLapTime         49.227567    29.028026    -0.982000    115.03200
 Damage               2823.975051  2339.246312     0.000000   8438.00000
 DistanceFromStart     801.164121   455.878970     0.006884   1760.94000
 DistanceCovered      2891.367966  1700.453458     0.000000   6563.45000
 FuelLevel              53.366001     0.905742    51.817200     55.00000
 LastLapTime            70.338981    45.912840     0.000000    107.36000
 Opponent_1            175.131463    58.672077     3.470930    200.00000
RacePosition             2.162219     1.595496     1.000000      7.00000
 RPM                  5476.455812  1541.549847  1151.920000  10015.40000
 SpeedX                 59.128886    28.477126   -78.060800    122.20200
 SpeedY                 -0.025914     5.588287   -76.869300     88.68200


In [35]:
# Step 7: Check for constant columns
# A column is constant when it has at most one distinct value. Counting distinct
# values is used instead of `std() == 0` because the standard deviation of a
# constant float column can come out as a tiny non-zero number (the mean is not
# always exactly representable), which would let the column slip through and
# later blow up the z-score division.
constant_cols = [col for col in sensor_cols if data[col].nunique() <= 1]
if constant_cols:
    print(f"Constant columns to drop: {constant_cols}")
    features = features.drop(columns=constant_cols)
    sensor_cols = [col for col in sensor_cols if col not in constant_cols]
print(f"Shape after dropping constant columns: {features.shape}")

Shape after dropping constant columns: (288986, 39)


In [36]:
# Step 8: Normalize continuous features and actuators
def _zscore(column):
    """Return the column centred on its mean and divided by its standard deviation."""
    return (column - column.mean()) / column.std()


# Work on copies so the un-normalised frames stay available.
features_normalized = features.copy()
labels_normalized = labels.copy()
for sensor_name in sensor_cols:
    features_normalized[sensor_name] = _zscore(features[sensor_name])
for actuator_name in actuator_cols:
    labels_normalized[actuator_name] = _zscore(labels[actuator_name])
# Gear and Clutch are not in actuator_cols and therefore stay unnormalized.
data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

In [37]:
# Step 10: Split into train/validation/test
# 80% train; the held-out 20% is halved into validation and test (10% each).
train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# Step 11: Save preprocessed data
# NOTE(review): these are the same three file paths that the earlier save cell
# (the one that also writes scaler.pkl) uses, so running this cell replaces those
# files with the z-scored frames — confirm which pipeline is meant to own them.
train_data.to_csv("./data/dirt-2/train_data.csv", index=False)
val_data.to_csv("./data/dirt-2/val_data.csv", index=False)
test_data.to_csv("./data/dirt-2/test_data.csv", index=False)
print("Preprocessed data saved to ./data/dirt-2/")


Train shape: (231188, 43), Validation shape: (28899, 43), Test shape: (28899, 43)
./data/dirt-2/Preprocessed data saved.


In [None]:


# # Step 8: Check for outliers (optional, based on stats)
# # Example: Clip features to [-3, 3] std deviations
# for col in sensor_cols + actuator_cols:
#     mean, std = data[col].mean(), data[col].std()
#     data_normalized[col] = data_normalized[col].clip(lower=mean-3*std, upper=mean+3*std)

# data_normalized.to_csv("./data/dirt-2/dirt-2_data.csv", index=False)

# Step 9: Split into train/validation/test



# # Normalize features and continuous actuators
# features_normalized = (features - features.mean()) / features.std()
# labels_normalized = labels.copy()
# labels_normalized[actuator_cols] = (labels[actuator_cols] - labels[actuator_cols].mean()) / labels[actuator_cols].std()
# # Gear remains unnormalized (if discrete)

# # Combine normalized features and labels
# data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

# # Step 5: Split into train/validation/test
# train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
# print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# # Step 6: Save preprocessed data
# train_data.to_csv("train_data.csv", index=False)
# val_data.to_csv("val_data.csv", index=False)
# test_data.to_csv("test_data.csv", index=False)
# print("Preprocessed data saved.")

Combined data shape: (309220, 80)


KeyError: "['gear_drop'] not found in axis"