In [4]:
import os
from tqdm import tqdm
import pandas as pd
import json


def convert_to_dataframe(vehicle_id):
    """Read one vehicle's JSON sample file into a DataFrame.

    The file ``private_datasets/veh_<vehicle_id>.json`` is expected to hold an
    iterable of samples, each exposing a ``'features'`` entry (one value per
    feature column listed below) and a ``'label'`` entry.

    Args:
        vehicle_id: Value interpolated into the file name.

    Returns:
        A DataFrame with one row per sample: the named feature columns
        followed by a ``label`` column.
    """
    # Column names, in the same order as the values inside each 'features' entry.
    feature_names = [
        'ego_speed',
        'ego_lane',
        'ego_desired_speed',
        'front_exists',
        'front_distance',
        'front_speed',
        'left_front_exists',
        'left_front_distance',
        'left_front_speed',
        'left_rear_exists',
        'left_rear_distance',
        'left_rear_speed',
        'right_front_exists',
        'right_front_distance',
        'right_front_speed',
        'right_rear_exists',
        'right_rear_distance',
        'right_rear_speed',
        'speed_diff_front',
        'speed_diff_left_front',
        'speed_diff_right_front',
        'speed_diff_left_rear',
    ]

    with open(f'private_datasets/veh_{vehicle_id}.json', 'r') as handle:
        samples = json.load(handle)

    # Split every sample into its feature row and its label.
    feature_rows = []
    for sample in samples:
        feature_rows.append(sample['features'])

    label_values = []
    for sample in samples:
        label_values.append(sample['label'])

    vehicle_frame = pd.DataFrame(feature_rows, columns=feature_names)
    vehicle_frame['label'] = label_values
    return vehicle_frame

def concatenate_all_vehicles(num_vehicles=None):
    """Load per-vehicle JSON files and stack them into a single DataFrame.

    Vehicle ids are parsed from the ``veh_<id>.json`` file names present in
    ``private_datasets``, so the files that are actually on disk are the ones
    that get loaded. When the ids form the contiguous range ``0..N-1`` this
    loads exactly the same files, in the same order, as iterating
    ``range(N)``; gaps in the numbering or stray ``veh_*`` entries that are
    not ``.json`` sample files no longer cause a missing-file error.

    Args:
        num_vehicles: Maximum number of vehicles to load, taken in ascending
            id order. ``None`` (the default) loads every vehicle found.
            Values larger than the number of available files are clamped.

    Returns:
        A DataFrame holding the rows returned by ``convert_to_dataframe`` for
        each selected vehicle, plus a ``vehicle_id`` column, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no vehicle ends up selected (no matching files, or
            ``num_vehicles`` is zero or negative).
    """
    private_datasets_dir = 'private_datasets'
    prefix, suffix = 'veh_', '.json'

    vehicle_ids = []
    for file_name in os.listdir(private_datasets_dir):
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        stem = file_name[len(prefix):-len(suffix)]
        # Keep only canonical integer ids: convert_to_dataframe rebuilds the
        # path as f'veh_{id}.json', so e.g. 'veh_007.json' could not be
        # reopened from the integer 7.
        if stem.isdecimal() and str(int(stem)) == stem:
            vehicle_ids.append(int(stem))
    # os.listdir returns entries in arbitrary order; sort numerically so that
    # vehicle 10 comes after vehicle 9.
    vehicle_ids.sort()

    if num_vehicles is None:
        num_vehicles = len(vehicle_ids)
    else:
        # Clamp to [0, available] so a negative value cannot turn the slice
        # below into "all but the last k".
        num_vehicles = max(0, min(num_vehicles, len(vehicle_ids)))

    if num_vehicles == 0:
        # Fail early with context instead of letting pd.concat raise on an
        # empty list (or dividing by zero in the summary below).
        raise ValueError(
            f"No vehicle files selected from '{private_datasets_dir}' "
            f"(expected files named '{prefix}<id>{suffix}')"
        )

    selected_ids = vehicle_ids[:num_vehicles]

    print(f"Concatenando dados de {num_vehicles} veículos...")

    all_dataframes = []
    for vehicle_id in tqdm(selected_ids):
        df_vehicle = convert_to_dataframe(vehicle_id)
        # Tag every row with its source vehicle before stacking.
        df_vehicle['vehicle_id'] = vehicle_id
        all_dataframes.append(df_vehicle)

    df_complete = pd.concat(all_dataframes, ignore_index=True)

    print(f"Total de amostras: {len(df_complete):,}")
    print(f"Veículos: {num_vehicles}")
    print(f"Média de amostras por veículo: {len(df_complete)/num_vehicles:.1f}")

    return df_complete

# Build the combined frame; with no argument num_vehicles is None, so every
# vehicle file found is loaded.
df_all_vehicles = concatenate_all_vehicles()

Concatenando dados de 400 veículos...


100%|██████████| 400/400 [00:11<00:00, 34.83it/s]

Total de amostras: 115,511
Veículos: 400
Média de amostras por veículo: 288.8





In [5]:
# Preview the first rows (up to 20) of the combined per-vehicle frame.
df_all_vehicles.head(20)

Unnamed: 0,ego_speed,ego_lane,ego_desired_speed,front_exists,front_distance,front_speed,left_front_exists,left_front_distance,left_front_speed,left_rear_exists,...,right_front_speed,right_rear_exists,right_rear_distance,right_rear_speed,speed_diff_front,speed_diff_left_front,speed_diff_right_front,speed_diff_left_rear,label,vehicle_id
0,21.18,0,21.23,1,385.34,20.86,0,0.0,0.0,0,...,22.74,1,262.24,23.97,0.32,21.18,-1.56,21.18,1,0
1,23.25,0,23.35,1,262.89,25.26,0,0.0,0.0,0,...,23.63,1,66.9,20.04,-2.01,23.25,-0.38,23.25,2,0
2,21.3,0,24.97,1,57.84,20.38,0,0.0,0.0,0,...,22.33,1,4.96,22.4,0.92,21.3,-1.03,21.3,1,0
3,20.27,1,20.3,1,326.64,21.18,1,610.28,20.85,1,...,27.53,1,184.84,21.64,-0.91,-0.58,-7.26,-1.62,1,0
4,21.84,2,26.04,1,50.05,21.72,1,33.04,22.65,1,...,0.0,0,0.0,0.0,0.12,-0.81,21.84,0.08,1,0
5,25.32,0,25.31,1,1355.29,25.67,0,0.0,0.0,0,...,21.41,1,944.7,29.1,-0.35,25.32,3.91,25.32,1,0
6,21.09,1,21.09,1,1174.85,21.81,1,725.23,26.45,1,...,22.41,1,807.98,29.55,-0.72,-5.36,-1.32,-0.97,1,0
7,21.98,2,25.44,1,63.69,20.66,1,123.96,22.18,1,...,0.0,0,0.0,0.0,1.32,-0.2,21.98,0.67,1,0
8,25.33,1,25.52,1,600.77,21.78,1,55.71,21.72,1,...,25.81,1,107.99,25.52,3.55,3.61,-0.48,1.46,1,0
9,28.31,2,28.32,1,2045.66,23.46,1,838.61,20.93,1,...,0.0,0,0.0,0.0,4.85,7.38,28.31,4.34,1,0


In [6]:
# Summarise the combined frame: overall shape, class balance, and how many
# samples each vehicle contributes.
print(f"Shape: {df_all_vehicles.shape}")
print("\nDistribuição de classes:")
print(df_all_vehicles['label'].value_counts().sort_index())

# Count rows per vehicle once and reuse the result for all three statistics
# instead of recomputing value_counts() for every print.
samples_per_vehicle = df_all_vehicles['vehicle_id'].value_counts()
print(f"Min amostras: {samples_per_vehicle.min()}")
print(f"Max amostras: {samples_per_vehicle.max()}")
print(f"Média amostras: {samples_per_vehicle.mean():.1f}")

Shape: (115511, 24)

Distribuição de classes:
label
0      5965
1    102213
2      7333
Name: count, dtype: int64
Min amostras: 282
Max amostras: 2993
Média amostras: 288.8


In [9]:
# Persist the combined frame to CSV; index=False keeps the row index out of
# the file.
df_all_vehicles.to_csv('dataset_all_vehicles.csv', index=False)

In [12]:
def convert_validation_to_dataframe():
    """Load ``validation_set.json`` into a DataFrame and print its row count.

    The file is expected to hold an iterable of samples, each exposing a
    ``'features'`` entry (one value per feature column listed below) and a
    ``'label'`` entry.

    Returns:
        A DataFrame with one row per sample: the named feature columns
        followed by a ``label`` column.
    """
    # Feature names (the same ones used for the training set).
    feature_names = [
        'ego_speed',
        'ego_lane',
        'ego_desired_speed',
        'front_exists',
        'front_distance',
        'front_speed',
        'left_front_exists',
        'left_front_distance',
        'left_front_speed',
        'left_rear_exists',
        'left_rear_distance',
        'left_rear_speed',
        'right_front_exists',
        'right_front_distance',
        'right_front_speed',
        'right_rear_exists',
        'right_rear_distance',
        'right_rear_speed',
        'speed_diff_front',
        'speed_diff_left_front',
        'speed_diff_right_front',
        'speed_diff_left_rear',
    ]

    with open('validation_set.json', 'r') as source:
        records = json.load(source)

    # Pull the feature rows and the labels out of the raw records.
    feature_rows = [record['features'] for record in records]
    label_column = [record['label'] for record in records]

    # Assemble the frame: features first, label as the last column.
    df = pd.DataFrame(feature_rows, columns=feature_names)
    df['label'] = label_column

    print(f"   Total de amostras: {len(df):,}")

    return df

# Load the validation set into a DataFrame (the call also prints the number
# of samples).
df_validation = convert_validation_to_dataframe()

   Total de amostras: 28,877


In [17]:
# Report the validation frame's dimensions, then its per-class sample counts
# ordered by class label.
print(f"\nShape: {df_validation.shape}")
print(f"\nDistribuição de classes:")
validation_label_counts = df_validation['label'].value_counts()
validation_label_counts = validation_label_counts.sort_index()
print(validation_label_counts)



Shape: (28877, 23)

Distribuição de classes:
label
0     1413
1    25629
2     1835
Name: count, dtype: int64


In [15]:
# List the column names of the validation frame.
df_validation.columns

Index(['ego_speed', 'ego_lane', 'ego_desired_speed', 'front_exists',
       'front_distance', 'front_speed', 'left_front_exists',
       'left_front_distance', 'left_front_speed', 'left_rear_exists',
       'left_rear_distance', 'left_rear_speed', 'right_front_exists',
       'right_front_distance', 'right_front_speed', 'right_rear_exists',
       'right_rear_distance', 'right_rear_speed', 'speed_diff_front',
       'speed_diff_left_front', 'speed_diff_right_front',
       'speed_diff_left_rear', 'label'],
      dtype='object')

In [14]:
# Preview the first rows (up to 20) of the validation frame.
df_validation.head(20)

Unnamed: 0,ego_speed,ego_lane,ego_desired_speed,front_exists,front_distance,front_speed,left_front_exists,left_front_distance,left_front_speed,left_rear_exists,...,right_front_distance,right_front_speed,right_rear_exists,right_rear_distance,right_rear_speed,speed_diff_front,speed_diff_left_front,speed_diff_right_front,speed_diff_left_rear,label
0,28.9,2,28.92,1,1133.94,28.37,1,34.36,20.5,1,...,0.0,0.0,0,0.0,0.0,0.53,8.4,28.9,2.73,1
1,21.93,1,24.84,1,68.51,20.76,1,375.77,22.7,1,...,113.15,22.15,1,12.14,22.02,1.17,-0.77,-0.22,-3.4,1
2,20.6,2,21.78,1,72.2,20.69,1,24.67,24.42,1,...,0.0,0.0,0,0.0,0.0,-0.09,-3.82,20.6,-5.41,1
3,24.59,2,24.62,1,498.01,24.85,1,53.24,20.85,1,...,0.0,0.0,0,0.0,0.0,-0.26,3.74,24.59,0.3,1
4,24.25,0,24.28,1,635.71,24.1,0,0.0,0.0,0,...,863.13,24.91,1,1060.41,23.9,0.15,24.25,-0.66,24.25,1
5,20.85,2,20.84,1,3127.21,23.56,1,930.45,24.88,1,...,0.0,0.0,0,0.0,0.0,-2.71,-4.03,20.85,-0.85,2
6,27.53,0,27.57,1,1216.76,22.19,0,0.0,0.0,0,...,1693.75,28.69,1,1383.02,28.35,5.34,27.53,-1.16,27.53,1
7,21.61,1,26.71,1,49.88,21.1,1,1278.59,23.96,1,...,102.45,23.59,1,760.08,20.13,0.51,-2.35,-1.98,-1.13,2
8,24.55,2,24.64,1,520.63,22.13,1,802.81,26.87,1,...,0.0,0.0,0,0.0,0.0,2.42,-2.32,24.55,3.6,2
9,24.36,0,29.73,1,52.05,24.36,0,0.0,0.0,0,...,683.2,21.8,1,562.14,20.28,0.0,24.36,2.56,24.36,1


In [18]:
# Inspect the last rows (up to 20) of the validation frame.
df_validation.tail(20)

Unnamed: 0,ego_speed,ego_lane,ego_desired_speed,front_exists,front_distance,front_speed,left_front_exists,left_front_distance,left_front_speed,left_rear_exists,...,right_front_distance,right_front_speed,right_rear_exists,right_rear_distance,right_rear_speed,speed_diff_front,speed_diff_left_front,speed_diff_right_front,speed_diff_left_rear,label
28857,25.97,1,27.18,1,80.14,27.43,1,1121.63,23.68,1,...,646.21,23.46,1,345.41,27.49,-1.46,2.29,2.51,1.92,1
28858,22.42,1,23.07,1,147.14,20.68,1,177.81,25.66,1,...,1325.85,21.38,1,148.02,27.96,1.74,-3.24,1.04,-6.61,1
28859,24.18,1,24.23,1,633.0,26.98,1,1095.93,28.42,1,...,1613.99,21.85,1,627.06,28.47,-2.8,-4.24,2.33,3.69,1
28860,26.35,2,28.01,1,481.95,24.99,1,286.55,20.08,1,...,0.0,0.0,0,0.0,0.0,1.36,6.27,26.35,3.72,2
28861,22.64,1,22.87,1,239.46,21.67,1,543.13,28.86,1,...,1527.06,22.76,1,292.27,27.05,0.97,-6.22,-0.12,2.59,1
28862,27.15,0,27.15,1,1532.39,28.17,0,0.0,0.0,0,...,198.17,21.7,1,8.54,27.1,-1.02,27.15,5.45,27.15,1
28863,22.97,1,25.08,1,68.23,22.86,1,456.18,25.31,1,...,844.98,22.28,1,130.57,22.25,0.11,-2.34,0.69,2.9,1
28864,21.78,1,21.8,1,211.14,24.4,1,183.61,22.04,1,...,92.71,24.91,1,1252.02,22.57,-2.62,-0.26,-3.13,1.44,1
28865,23.84,1,23.99,1,326.82,22.38,1,66.24,22.27,1,...,123.37,25.03,1,937.61,29.66,1.46,1.57,-1.19,-4.25,2
28866,22.8,0,23.13,1,161.32,22.55,0,0.0,0.0,0,...,39.29,20.36,1,11.71,21.74,0.25,22.8,2.44,22.8,2


In [19]:
# Persist the validation frame to CSV; index=False keeps the row index out of
# the file.
df_validation.to_csv('dataset_validation_all_vehicles.csv', index=False)

In [20]:
# Side-by-side report of size and class balance for the training and
# validation frames, followed by the validation/training size ratio.
banner = "="*50
print("\n" + banner)
print("COMPARAÇÃO: TREINO vs VALIDAÇÃO")
print(banner)

# Both splits get the same report, so emit it from a single loop.
for split_name, split_df in (("TREINO", df_all_vehicles), ("VALIDAÇÃO", df_validation)):
    split_total = len(split_df)
    print(f"\n{split_name}:")
    print(f"  Total de amostras: {split_total:,}")
    print(f"  Distribuição de classes:")
    class_counts = split_df['label'].value_counts().sort_index()
    for label, count in class_counts.items():
        # Absolute count plus the class share of the split, in percent.
        print(f"    Classe {label}: {count:6d} ({count/split_total*100:5.2f}%)")

print(f"\nPROPORÇÃO VALIDAÇÃO/TREINO: {len(df_validation)/len(df_all_vehicles)*100:.2f}%")


COMPARAÇÃO: TREINO vs VALIDAÇÃO

TREINO:
  Total de amostras: 115,511
  Distribuição de classes:
    Classe 0:   5965 ( 5.16%)
    Classe 1: 102213 (88.49%)
    Classe 2:   7333 ( 6.35%)

VALIDAÇÃO:
  Total de amostras: 28,877
  Distribuição de classes:
    Classe 0:   1413 ( 4.89%)
    Classe 1:  25629 (88.75%)
    Classe 2:   1835 ( 6.35%)

PROPORÇÃO VALIDAÇÃO/TREINO: 25.00%
