In [1]:
# Print a fixed greeting (presumably a smoke test that the kernel executes cells).
print("Hello World")

Hello World


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import glob
import os
import joblib

In [46]:
# Step 1: Combine CSV files
# data_dir = "./data/dirt-2/"  # Update with your directory
# csv_files = glob.glob(os.path.join(data_dir, "*.csv"))
# df_list = [pd.read_csv(file) for file in csv_files]
# data = pd.concat(df_list, ignore_index=True)
def clean_columns(df):
    """Return a copy of ``df`` with normalized, de-duplicated column names.

    Leading/trailing whitespace is stripped from every column name and inner
    spaces are replaced with underscores. When two columns end up with the
    same normalized name, only the first one is kept.

    The frame passed in is left untouched (its column labels are not
    reassigned), so calling the helper has no side effect on the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        New frame with cleaned column names and duplicate names removed.
    """
    # regex=False: the space is a literal character, not a pattern.
    cleaned = df.columns.str.strip().str.replace(' ', '_', regex=False)

    # Positional mask that keeps only the first occurrence of each cleaned name.
    keep = ~cleaned.duplicated()
    result = df.loc[:, keep].copy()
    result.columns = cleaned[keep]
    return result


# Sub-directories of ./data/ whose CSV logs are merged into one frame.
track_types = ["dirt-2", "oval", "road"]

# Collect the CSV paths for every track. glob returns files in arbitrary
# (filesystem-dependent) order, so each listing is sorted: the row order of
# the combined frame feeds the seeded train/val/test split and the
# keep-first de-duplication, and must therefore be stable between runs.
csv_files = []
for track in track_types:
    path = os.path.join('./data/', track, "*.csv")
    csv_files.extend(sorted(glob.glob(path)))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found under ./data/ for tracks {track_types}"
    )

# Normalize the header of every file before concatenating so that the same
# column spelled with stray whitespace lines up across files.
df_list = [clean_columns(pd.read_csv(file)) for file in csv_files]

data = pd.concat(df_list, ignore_index=True)

# Defensive: make sure no duplicated column label survived the concat.
data = data.loc[:, ~data.columns.duplicated()]

# Drop columns that are entirely null
data = data.dropna(axis=1, how='all')

# Reset index again just to be clean (rebinding instead of inplace=True keeps
# the cell side-effect free on any other reference to the frame).
data = data.reset_index(drop=True)

print(f"Combined data shape: {data.shape}")

Combined data shape: (826118, 79)


In [47]:
# Show the dtype and non-null count of every column in the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826118 entries, 0 to 826117
Data columns (total 79 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  float64
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      826118 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   gear_drop            473051 non-null  float64
 7   LastLapTime          826118 non-null  float64
 8   Opponent_1           473051 non-null  float64
 9   Opponent_2           473051 non-null  float64
 10  Opponent_3           473051 non-null  float64
 11  Opponent_4           473051 non-null  float64
 12  Opponent_5           473051 non-null  float64
 13  Opponent_6           473051 non-null  float64
 14  Opponent_7           473051 non-null  float64
 15  Opponent_8       

In [50]:
# remove whitespace from the name of the columns. Helps in accessing later on
# Trim surrounding whitespace from every column label so later lookups by
# name do not fail on stray spaces.
data.columns = [label.strip() for label in data.columns]

In [51]:
# Re-check the schema after the column names were stripped, then show how
# often each Gear value occurs.
data.info()
print(data['Gear'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826118 entries, 0 to 826117
Data columns (total 79 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  float64
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      826118 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   gear_drop            473051 non-null  float64
 7   LastLapTime          826118 non-null  float64
 8   Opponent_1           473051 non-null  float64
 9   Opponent_2           473051 non-null  float64
 10  Opponent_3           473051 non-null  float64
 11  Opponent_4           473051 non-null  float64
 12  Opponent_5           473051 non-null  float64
 13  Opponent_6           473051 non-null  float64
 14  Opponent_7           473051 non-null  float64
 15  Opponent_8       

In [52]:
# Step 2: Drop incorrect 'drop_gear' columns
drop_gear_cols = [col for col in data.columns if 'gear_drop' in col.lower()]  # Catches 'drop_gear', 'drop_gear_1', etc.
if drop_gear_cols:
    data = data.drop(columns=drop_gear_cols)
    print(f"Dropped columns: {drop_gear_cols}")
else:
    print("No 'drop_gear' columns found.")
print(f"Shape after dropping drop_gear columns: {data.shape}")


Dropped columns: ['gear_drop']
Shape after dropping drop_gear columns: (826118, 78)


In [53]:
# Schema check: the gear_drop column should no longer be listed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826118 entries, 0 to 826117
Data columns (total 78 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  float64
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      826118 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   LastLapTime          826118 non-null  float64
 7   Opponent_1           473051 non-null  float64
 8   Opponent_2           473051 non-null  float64
 9   Opponent_3           473051 non-null  float64
 10  Opponent_4           473051 non-null  float64
 11  Opponent_5           473051 non-null  float64
 12  Opponent_6           473051 non-null  float64
 13  Opponent_7           473051 non-null  float64
 14  Opponent_8           473051 non-null  float64
 15  Opponent_9       

In [54]:
# Step 3: Drop opponent-related columns
opponent_cols = [col for col in data.columns if col.startswith('Opponent_')]  # Adjust based on column names
data = data.drop(columns=opponent_cols)
print(f"Shape after dropping opponent columns: {data.shape}")


Shape after dropping opponent columns: (826118, 42)


In [55]:
# Schema check: the Opponent_* columns should no longer be listed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826118 entries, 0 to 826117
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   CurrentLapTime       473051 non-null  float64
 2   Damage               473051 non-null  float64
 3   DistanceFromStart    473051 non-null  float64
 4   DistanceCovered      826118 non-null  float64
 5   FuelLevel            473051 non-null  float64
 6   LastLapTime          826118 non-null  float64
 7   RacePosition         473051 non-null  float64
 8   RPM                  826118 non-null  float64
 9   SpeedX               826118 non-null  float64
 10  SpeedY               826118 non-null  float64
 11  SpeedZ               826118 non-null  float64
 12  Track_1              826118 non-null  float64
 13  Track_2              826118 non-null  float64
 14  Track_3              826118 non-null  float64
 15  Track_4          

In [56]:
# drop irrelevant columns one by one
cols_to_drop = ['CurrentLapTime', 'Damage', 'DistanceFromStart', 'FuelLevel', 'RacePosition']
opponent_cols = [col for col in data.columns if col in cols_to_drop]  # Adjust based on column names
data = data.drop(columns=opponent_cols)
print(f"Shape after dropping opponent columns: {data.shape}")


Shape after dropping opponent columns: (826118, 37)


In [57]:
# Schema check after the irrelevant columns were dropped.
data.info()
# Alternative view kept for reference: dtypes only.
# data.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826118 entries, 0 to 826117
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   DistanceCovered      826118 non-null  float64
 2   LastLapTime          826118 non-null  float64
 3   RPM                  826118 non-null  float64
 4   SpeedX               826118 non-null  float64
 5   SpeedY               826118 non-null  float64
 6   SpeedZ               826118 non-null  float64
 7   Track_1              826118 non-null  float64
 8   Track_2              826118 non-null  float64
 9   Track_3              826118 non-null  float64
 10  Track_4              826118 non-null  float64
 11  Track_5              826118 non-null  float64
 12  Track_6              826118 non-null  float64
 13  Track_7              826118 non-null  float64
 14  Track_8              826118 non-null  float64
 15  Track_9          

In [58]:
# Step 4: Check and handle null values
null_counts = data.isnull().sum()
print("Null values per column:\n", null_counts[null_counts > 0])
# Drop rows with nulls (or impute if preferred)
data = data.dropna()
print(f"Shape after handling nulls: {data.shape}")


Null values per column:
 Angle    20233
dtype: int64
Shape after handling nulls: (805885, 37)


In [59]:
# Schema check: every column should now report the same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 805885 entries, 0 to 826117
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   DistanceCovered      805885 non-null  float64
 2   LastLapTime          805885 non-null  float64
 3   RPM                  805885 non-null  float64
 4   SpeedX               805885 non-null  float64
 5   SpeedY               805885 non-null  float64
 6   SpeedZ               805885 non-null  float64
 7   Track_1              805885 non-null  float64
 8   Track_2              805885 non-null  float64
 9   Track_3              805885 non-null  float64
 10  Track_4              805885 non-null  float64
 11  Track_5              805885 non-null  float64
 12  Track_6              805885 non-null  float64
 13  Track_7              805885 non-null  float64
 14  Track_8              805885 non-null  float64
 15  Track_9              8

In [60]:
# Frequency of each Gear value before recoding.
data['Gear'].value_counts()

Gear
 1    309545
 3    275256
 2    219551
-1       976
 0       557
Name: count, dtype: int64

In [61]:
# Recode the two non-positive gear values: 0 -> 4 and -1 -> 0.
# Both masks are computed from the column as it is now, so rows recoded to 0
# are not subsequently turned into 4.
gear = data['Gear']
was_zero = gear.eq(0)
was_minus_one = gear.eq(-1)
data['Gear'] = gear.mask(was_zero, 4).mask(was_minus_one, 0)
data['Gear'].value_counts()

Gear
1    309545
3    275256
2    219551
0       976
4       557
Name: count, dtype: int64

In [62]:
# Fold the 0.5 clutch reading into 1; all other values are left as they are.
clutch = data['Clutch']
data['Clutch'] = clutch.mask(clutch.eq(0.5), 1)
data['Clutch'].value_counts()

Clutch
0.0    801687
1.0      4198
Name: count, dtype: int64

In [63]:
# Schema check after recoding Gear and Clutch.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 805885 entries, 0 to 826117
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   DistanceCovered      805885 non-null  float64
 2   LastLapTime          805885 non-null  float64
 3   RPM                  805885 non-null  float64
 4   SpeedX               805885 non-null  float64
 5   SpeedY               805885 non-null  float64
 6   SpeedZ               805885 non-null  float64
 7   Track_1              805885 non-null  float64
 8   Track_2              805885 non-null  float64
 9   Track_3              805885 non-null  float64
 10  Track_4              805885 non-null  float64
 11  Track_5              805885 non-null  float64
 12  Track_6              805885 non-null  float64
 13  Track_7              805885 non-null  float64
 14  Track_8              805885 non-null  float64
 15  Track_9              8

In [64]:
# Sanity check: list any column that still contains missing values
# (an empty Series means there are none).
null_counts = data.isna().sum()
remaining_nulls = null_counts.loc[null_counts.gt(0)]
print("Null values per column:\n", remaining_nulls)

Null values per column:
 Series([], dtype: int64)


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,...,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0
mean,0.226869,1564.870865,39.258078,3068.714066,33.734293,0.240233,0.242923,2.858626,3.257371,4.298631,...,30.161715,30.023186,30.661409,30.501769,0.242441,0.510139,0.084821,0.005209,1.956412,0.038928
std,0.344664,1853.532184,48.788565,2954.282968,36.313754,4.081475,3.491888,3.601091,5.135709,6.413499,...,32.645245,32.570626,32.955072,32.850669,0.298967,0.448426,0.262723,0.071986,0.855204,0.352065
min,-3.13502,0.0,0.0,0.0,-78.0608,-76.8693,-41.43,-1.0,-1.0,-1.0,...,-47.1882,-50.1071,-81.8893,-85.938,0.0,0.0,0.0,0.0,0.0,-1.0
25%,-0.004809,0.549837,0.0,0.532373,0.678647,-0.062957,-0.019671,0.035116,0.036565,0.040965,...,0.631043,0.635791,0.705489,0.711,0.03178,0.0,0.0,0.0,1.0,-0.056024
50%,0.216753,583.373,0.946926,3503.83,24.7389,0.458711,0.576819,1.65068,1.73592,2.24952,...,23.0718,23.6127,23.558,23.6268,0.323368,0.479893,0.0,0.0,2.0,0.016957
75%,0.499888,3006.9,98.586,5881.57,60.4828,0.468308,0.603063,5.36929,5.72319,7.23462,...,54.6026,54.0279,56.3522,55.6059,0.341144,1.0,0.0,0.0,3.0,0.155356
max,3.14066,6563.45,107.36,10015.4,122.913,88.682,29.6747,177.671,179.967,196.31,...,111.403,110.93,111.586,110.923,4.07852,1.0,1.0,1.0,4.0,1.0


In [66]:
# Frequency of each distinct Acceleration value.
data['Acceleration'].value_counts()
# Alternative view kept for reference: the distinct values without counts.
# print(data['Acceleration'].unique())

Acceleration
1.000000    320840
0.000000    248388
1.000000      1054
1.000000       365
1.000000       273
             ...  
0.417409         1
0.421365         1
0.426082         1
0.430043         1
0.431062         1
Name: count, Length: 141735, dtype: int64

In [None]:
# Absolute tolerance used to decide that a value is "practically 1".
epsilon = 1e-5

# Round values close to 1 to exactly 1.
# np.isclose is evaluated once on the whole column instead of once per row
# through a Python lambda; the element-wise result is the same (NaN is never
# close to 1, so missing values would be left as they are).
near_one = np.isclose(data['Acceleration'], 1.0, atol=epsilon)
data['Acceleration'] = data['Acceleration'].mask(near_one, 1.0)
data['Acceleration'].value_counts()



Acceleration
1.000000    325985
0.000000    248388
0.164978        51
0.133351        50
0.195356        49
             ...  
0.431062         1
0.429840         1
0.423254         1
0.423583         1
0.424608         1
Name: count, Length: 141418, dtype: int64

In [69]:
# Binarize the pedal signals: any non-zero value becomes 1.0, zero stays 0.0.
# A vectorized comparison replaces the per-row lambda; the result and dtype
# (float64) are the same.
data['Acceleration'] = data['Acceleration'].ne(0).astype(float)
print(data['Acceleration'].value_counts())

data['Braking'] = data['Braking'].ne(0).astype(float)
data['Braking'].value_counts()

Acceleration
1.0    557497
0.0    248388
Name: count, dtype: int64


Braking
0.0    695578
1.0    110307
Name: count, dtype: int64

In [70]:
# Summary statistics after Acceleration and Braking were binarized.
data.describe()

Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,...,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0
mean,0.226869,1564.870865,39.258078,3068.714066,33.734293,0.240233,0.242923,2.858626,3.257371,4.298631,...,30.161715,30.023186,30.661409,30.501769,0.242441,0.691782,0.136877,0.005209,1.956412,0.038928
std,0.344664,1853.532184,48.788565,2954.282968,36.313754,4.081475,3.491888,3.601091,5.135709,6.413499,...,32.645245,32.570626,32.955072,32.850669,0.298967,0.461757,0.343718,0.071986,0.855204,0.352065
min,-3.13502,0.0,0.0,0.0,-78.0608,-76.8693,-41.43,-1.0,-1.0,-1.0,...,-47.1882,-50.1071,-81.8893,-85.938,0.0,0.0,0.0,0.0,0.0,-1.0
25%,-0.004809,0.549837,0.0,0.532373,0.678647,-0.062957,-0.019671,0.035116,0.036565,0.040965,...,0.631043,0.635791,0.705489,0.711,0.03178,0.0,0.0,0.0,1.0,-0.056024
50%,0.216753,583.373,0.946926,3503.83,24.7389,0.458711,0.576819,1.65068,1.73592,2.24952,...,23.0718,23.6127,23.558,23.6268,0.323368,1.0,0.0,0.0,2.0,0.016957
75%,0.499888,3006.9,98.586,5881.57,60.4828,0.468308,0.603063,5.36929,5.72319,7.23462,...,54.6026,54.0279,56.3522,55.6059,0.341144,1.0,0.0,0.0,3.0,0.155356
max,3.14066,6563.45,107.36,10015.4,122.913,88.682,29.6747,177.671,179.967,196.31,...,111.403,110.93,111.586,110.923,4.07852,1.0,1.0,1.0,4.0,1.0


In [71]:
# Every column except the five actuator/label columns is treated as a sensor feature.
sensor_cols = [col for col in data.columns if col not in ['Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering']]

# Step 5: Normalize sensor features
# MinMaxScaler rescales each sensor column to [0, 1] using that column's
# observed min and max; the fitted scaler is kept so it can be saved later.
# NOTE(review): the scaler is fit on ALL rows here, before train_test_split is
# called in the split cell, so validation/test rows influence the scaling
# parameters (data leakage). Consider splitting first and fitting on the
# training rows only — confirm intent before changing cell order.
scaler = MinMaxScaler()
data[sensor_cols] = scaler.fit_transform(data[sensor_cols])
print("Normalized sensor features to [0, 1]")

# Display summary statistics to verify the sensor columns now span [0, 1].
data.describe()

Normalized sensor features to [0, 1]


Unnamed: 0,Angle,DistanceCovered,LastLapTime,RPM,SpeedX,SpeedY,SpeedZ,Track_1,Track_2,Track_3,...,WheelSpinVelocity_1,WheelSpinVelocity_2,WheelSpinVelocity_3,WheelSpinVelocity_4,Z,Acceleration,Braking,Clutch,Gear,Steering
count,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,...,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0,805885.0
mean,0.535701,0.238422,0.365668,0.3064,0.556267,0.465774,0.586078,0.021596,0.023526,0.026854,...,0.487731,0.497589,0.581732,0.591482,0.059443,0.691782,0.136877,0.005209,1.956412,0.038928
std,0.054921,0.282402,0.454439,0.294974,0.180689,0.024654,0.049109,0.020155,0.028379,0.032505,...,0.205845,0.202255,0.170332,0.166872,0.073303,0.461757,0.343718,0.071986,0.855204,0.352065
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.498784,8.4e-05,0.0,5.3e-05,0.39179,0.463943,0.582385,0.005793,0.005728,0.005276,...,0.301525,0.315101,0.426901,0.440153,0.007792,0.0,0.0,0.0,1.0,-0.056024
50%,0.534089,0.088882,0.00882,0.349844,0.511508,0.467094,0.590774,0.014836,0.015118,0.016469,...,0.443026,0.457781,0.545017,0.556559,0.079286,1.0,0.0,0.0,2.0,0.016957
75%,0.579205,0.458128,0.918275,0.587253,0.689361,0.467152,0.591143,0.035648,0.037151,0.041734,...,0.641844,0.646652,0.714518,0.719004,0.083644,1.0,0.0,0.0,3.0,0.155356
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0


In [72]:
# Distribution before clipping, for comparison with the one printed below.
steering = data['Steering']
print(steering.value_counts())

# Step 6: Normalize continuous labels
# Bound Steering to the closed interval [-1, 1].
data['Steering'] = steering.clip(lower=-1, upper=1)
print("Clipped continuous labels: Steering [-1, 1]")

print(data['Steering'].value_counts())

Steering
-1.000000    20127
 1.000000    13322
 0.800000     5661
-0.800000     3588
 0.000000     1018
             ...  
 0.158337        1
 0.172185        1
 0.185388        1
-0.051177        1
-0.051220        1
Name: count, Length: 344183, dtype: int64
Clipped continuous labels: Steering [-1, 1]
Steering
-1.000000    20127
 1.000000    13322
 0.800000     5661
-0.800000     3588
 0.000000     1018
             ...  
 0.158337        1
 0.172185        1
 0.185388        1
-0.051177        1
-0.051220        1
Name: count, Length: 344183, dtype: int64


In [73]:
# Schema check after scaling the sensor columns and clipping Steering.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 805885 entries, 0 to 826117
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                805885 non-null  float64
 1   DistanceCovered      805885 non-null  float64
 2   LastLapTime          805885 non-null  float64
 3   RPM                  805885 non-null  float64
 4   SpeedX               805885 non-null  float64
 5   SpeedY               805885 non-null  float64
 6   SpeedZ               805885 non-null  float64
 7   Track_1              805885 non-null  float64
 8   Track_2              805885 non-null  float64
 9   Track_3              805885 non-null  float64
 10  Track_4              805885 non-null  float64
 11  Track_5              805885 non-null  float64
 12  Track_6              805885 non-null  float64
 13  Track_7              805885 non-null  float64
 14  Track_8              805885 non-null  float64
 15  Track_9              8

In [74]:
# Step 7: Remove outliers
# Example: Remove rows with extreme RPM or SpeedX
# Keep only rows whose RPM, SpeedX, SpeedY and SpeedZ are all strictly below
# the 99th percentile of their own column (upper tail only).
outlier_cols = ['RPM', 'SpeedX', 'SpeedY', 'SpeedZ']
upper_bounds = data[outlier_cols].quantile(0.99)
outlier_mask = (data[outlier_cols] < upper_bounds).all(axis=1)
data = data[outlier_mask]
print(f"After removing outliers: {len(data)} rows")

After removing outliers: 774161 rows


In [75]:
# Step 9: Remove duplicates
data = data.drop_duplicates()
print(f"After removing duplicates: {len(data)} rows")

After removing duplicates: 696006 rows


In [76]:
# Schema check after outlier filtering and de-duplication.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 696006 entries, 0 to 826117
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Angle                696006 non-null  float64
 1   DistanceCovered      696006 non-null  float64
 2   LastLapTime          696006 non-null  float64
 3   RPM                  696006 non-null  float64
 4   SpeedX               696006 non-null  float64
 5   SpeedY               696006 non-null  float64
 6   SpeedZ               696006 non-null  float64
 7   Track_1              696006 non-null  float64
 8   Track_2              696006 non-null  float64
 9   Track_3              696006 non-null  float64
 10  Track_4              696006 non-null  float64
 11  Track_5              696006 non-null  float64
 12  Track_6              696006 non-null  float64
 13  Track_7              696006 non-null  float64
 14  Track_8              696006 non-null  float64
 15  Track_9              6

In [77]:
# Gear distribution after outlier filtering and de-duplication.
data['Gear'].value_counts()

Gear
1    274864
3    232382
2    187761
4       557
0       442
Name: count, dtype: int64

In [78]:
# Clutch distribution after outlier filtering and de-duplication.
data['Clutch'].value_counts()

Clutch
0.0    694492
1.0      1514
Name: count, dtype: int64

In [80]:
# Step 10: Split data
train, temp = train_test_split(data, test_size=0.2, random_state=42, stratify=data[['Gear', 'Clutch']])
val, test = train_test_split(temp, test_size=0.4, random_state=42)
print(f"Train: {len(train)} rows, Val: {len(val)} rows, Test: {len(test)} rows")

Train: 556804 rows, Val: 83521 rows, Test: 55681 rows


In [81]:
# Step 11: Save datasets
# Step 11: Save datasets
# Paths are bound to names so the log line below reports exactly the files
# that were written.
train_path = './data/train_data_1.csv'
val_path = './data/val_data_1.csv'
test_path = './data/test_data_1.csv'
train.to_csv(train_path, index=False)
val.to_csv(val_path, index=False)
test.to_csv(test_path, index=False)
print(f"Saved {train_path}, {val_path}, {test_path}")

# Step 12: Save scaler
# The fitted MinMaxScaler is persisted alongside the data with joblib.
scaler_path = './data/scaler_1.pkl'
joblib.dump(scaler, scaler_path)
print(f"Saved scaler as {scaler_path}")

Saved train_data.csv, val_data.csv, test_data.csv
Saved scaler as scaler.pkl


In [None]:
########################################
############## STOP HERE ###############


# Step 5: Compute mean and standard deviation, normalize features
# Partition the columns in one pass: the five actuator names are labels,
# everything else is a sensor feature (original column order is preserved).
all_label_names = {'Acceleration', 'Braking', 'Clutch', 'Gear', 'Steering'}
continuous_label_names = {'Acceleration', 'Braking', 'Steering'}

sensor_cols = []
all_actuator_cols = []
for name in data.columns:
    if name in all_label_names:
        all_actuator_cols.append(name)
    else:
        sensor_cols.append(name)
actuator_cols = [name for name in all_actuator_cols if name in continuous_label_names]

features = data[sensor_cols]
labels = data[all_actuator_cols]

# Mean and std for the sensors and for the continuous actuators, one row per column.
feature_stats = features.describe().T[['mean', 'std']]
label_stats = labels[actuator_cols].describe().T[['mean', 'std']]
print("Feature stats:\n", feature_stats)
print("Label stats:\n", label_stats)

Feature stats:
                              mean          std
Angle                   -0.000640     0.282541
 CurrentLapTime         49.227567    29.028026
 Damage               2823.975051  2339.246312
 DistanceFromStart     801.164121   455.878970
 DistanceCovered      2891.367966  1700.453458
 FuelLevel              53.366001     0.905742
 LastLapTime            70.338981    45.912840
 Opponent_1            175.131463    58.672077
RacePosition             2.162219     1.595496
 RPM                  5476.455812  1541.549847
 SpeedX                 59.128886    28.477126
 SpeedY                 -0.025914     5.588287
 SpeedZ                 -0.013730     4.671972
 Track_1                 5.067960     3.448218
Track_2                  5.835095     5.886011
Track_3                  7.683733     6.994096
Track_4                 11.181911    11.890595
Track_5                 16.910797    15.476984
Track_6                 19.790101    16.120858
Track_7                 24.688684    18.8962

In [34]:
# Step 6: Check statistics and distributions
# Continuous features and actuators
# Sensors followed by the continuous actuators.
continuous_cols = [*sensor_cols, *actuator_cols]
summary_rows = ['mean', 'std', 'min', 'max']
stats = data[continuous_cols].describe().T[summary_rows]
print("Statistics for continuous features and actuators:\n", stats)
# Discrete outputs (Gear, Clutch) are summarized by value frequency instead.
gear_counts = data['Gear'].value_counts()
clutch_counts = data['Clutch'].value_counts()
print("Gear value counts:\n", gear_counts)
print("Clutch value counts:\n", clutch_counts)

Statistics for continuous features and actuators:
                              mean          std          min          max
Angle                   -0.000640     0.282541    -3.129000      3.13044
 CurrentLapTime         49.227567    29.028026    -0.982000    115.03200
 Damage               2823.975051  2339.246312     0.000000   8438.00000
 DistanceFromStart     801.164121   455.878970     0.006884   1760.94000
 DistanceCovered      2891.367966  1700.453458     0.000000   6563.45000
 FuelLevel              53.366001     0.905742    51.817200     55.00000
 LastLapTime            70.338981    45.912840     0.000000    107.36000
 Opponent_1            175.131463    58.672077     3.470930    200.00000
RacePosition             2.162219     1.595496     1.000000      7.00000
 RPM                  5476.455812  1541.549847  1151.920000  10015.40000
 SpeedX                 59.128886    28.477126   -78.060800    122.20200
 SpeedY                 -0.025914     5.588287   -76.869300     88.68200


In [35]:
# Step 7: Check for constant columns
# A sensor column whose standard deviation is exactly 0 carries no information.
sensor_std = data[sensor_cols].std()
constant_cols = sensor_std.index[sensor_std == 0].tolist()
if constant_cols:
    print(f"Constant columns to drop: {constant_cols}")
    features = features.drop(columns=constant_cols)
    # Keep the column list in sync with the feature frame.
    dropped = set(constant_cols)
    sensor_cols = [name for name in sensor_cols if name not in dropped]
print(f"Shape after dropping constant columns: {features.shape}")

Shape after dropping constant columns: (288986, 39)


In [36]:
# Step 8: Normalize continuous features and actuators
# Z-score the sensor columns: (value - column mean) / column std.
features_normalized = features.copy()
sensor_block = features[sensor_cols]
features_normalized[sensor_cols] = (sensor_block - sensor_block.mean()) / sensor_block.std()

# Same transformation for the continuous actuators only.
labels_normalized = labels.copy()
label_block = labels[actuator_cols]
labels_normalized[actuator_cols] = (label_block - label_block.mean()) / label_block.std()

# Gear and Clutch remain unnormalized
data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

In [37]:
# Step 10: Split into train/validation/test
# 80% train, then the remaining 20% is halved into validation and test.
train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# Step 11: Save preprocessed data
# The target directory is named once so the paths and the log line agree.
output_dir = "./data/dirt-2"
train_data.to_csv(f"{output_dir}/train_data.csv", index=False)
val_data.to_csv(f"{output_dir}/val_data.csv", index=False)
test_data.to_csv(f"{output_dir}/test_data.csv", index=False)
# Previously the directory was pasted in front of the sentence, producing
# "./data/dirt-2/Preprocessed data saved."; report the location properly.
print(f"Preprocessed data saved to {output_dir}/")


Train shape: (231188, 43), Validation shape: (28899, 43), Test shape: (28899, 43)
./data/dirt-2/Preprocessed data saved.


In [None]:


# # Step 8: Check for outliers (optional, based on stats)
# # Example: Clip features to [-3, 3] std deviations
# for col in sensor_cols + actuator_cols:
#     mean, std = data[col].mean(), data[col].std()
#     data_normalized[col] = data_normalized[col].clip(lower=mean-3*std, upper=mean+3*std)

# data_normalized.to_csv("./data/dirt-2/dirt-2_data.csv", index=False)

# Step 9: Split into train/validation/test



# # Normalize features and continuous actuators
# features_normalized = (features - features.mean()) / features.std()
# labels_normalized = labels.copy()
# labels_normalized[actuator_cols] = (labels[actuator_cols] - labels[actuator_cols].mean()) / labels[actuator_cols].std()
# # Gear remains unnormalized (if discrete)

# # Combine normalized features and labels
# data_normalized = pd.concat([features_normalized, labels_normalized], axis=1)

# # Step 5: Split into train/validation/test
# train_data, temp_data = train_test_split(data_normalized, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
# print(f"Train shape: {train_data.shape}, Validation shape: {val_data.shape}, Test shape: {test_data.shape}")

# # Step 6: Save preprocessed data
# train_data.to_csv("train_data.csv", index=False)
# val_data.to_csv("val_data.csv", index=False)
# test_data.to_csv("test_data.csv", index=False)
# print("Preprocessed data saved.")

Combined data shape: (309220, 80)


KeyError: "['gear_drop'] not found in axis"