In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the input CSV into the working DataFrame that the following cells transform.
input_csv = 'data_updated.csv'
df = pd.read_csv(input_csv)

In [3]:
# Binary flag: 1 when the row's 'Min Delay' value is strictly greater than 15, else 0.
df['has_delay'] = df['Min Delay'].gt(15).astype(int)

In [4]:
# Parse the 'Date' strings (expected as YYYY-MM-DD) once, then split them into
# separate numeric calendar features.
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')

df['Year'], df['Month'], df['Day_of_Month'] = (
    parsed_dates.dt.year,
    parsed_dates.dt.month,
    parsed_dates.dt.day,
)

# The original column is removed once its components have been extracted.
df.drop(columns=['Date'], inplace=True)

In [5]:
# Parse the 'Time' strings (expected as HH:MM) and keep only the hour and
# minute components as numeric features.
parsed_times = pd.to_datetime(df['Time'], format='%H:%M')

df['Hour'], df['Minute'] = parsed_times.dt.hour, parsed_times.dt.minute

# The original column is removed once its components have been extracted.
df.drop(columns=['Time'], inplace=True)

In [6]:
# Replace the 'Day' labels with integer codes. The fitted encoder stays in
# `day_encoder` so the codes can be traced back to the original labels later.
day_encoder = LabelEncoder()
day_codes = day_encoder.fit_transform(df['Day'])
df['Day'] = day_codes

In [7]:
# Replace the 'Incident' labels with integer codes. The fitted encoder stays in
# `incident_encoder` so the codes can be traced back to the original labels later.
incident_encoder = LabelEncoder()
incident_codes = incident_encoder.fit_transform(df['Incident'])
df['Incident'] = incident_codes

In [8]:
# Distinct non-null location values, captured before the column is overwritten.
unique_locations = pd.unique(df['Location'].dropna())

# Frequency encoding: every location is replaced by its relative frequency.
# value_counts skips missing values by default, so rows without a location
# find no match in the lookup and remain NaN after the map.
location_counts = df['Location'].value_counts(normalize=True)
df['Location'] = df['Location'].map(location_counts)

In [9]:
# Discard every row whose 'Location' is still missing.
# NOTE(review): the message below is printed unconditionally; it does not
# reflect how many rows (if any) were actually removed.
df = df.dropna(axis='index', subset=['Location'])
print("Fila con valor nulo en 'Location' eliminada.")

Fila con valor nulo en 'Location' eliminada.


In [10]:
# Turn the value '8' into a missing value first; get_dummies does not create an
# indicator column for missing values by default, so such rows end up with all
# 'Bound_*' indicators equal to 0.
df['Bound'] = df['Bound'].replace(['8'], np.nan)
df = pd.get_dummies(df, columns=['Bound'], prefix='Bound')

# Cast the indicator columns to plain integers (0/1).
dummy_columns = df.columns[df.columns.str.startswith('Bound_')].tolist()
df = df.astype({name: int for name in dummy_columns})

In [11]:
# Distinct non-null line values, captured before the column is overwritten.
unique_lines = pd.unique(df['Line'].dropna())

# Relative frequency of each line among the rows that have one.
line_frequencies = df['Line'].value_counts(normalize=True)

# Frequency-encode 'Line'; rows that find no match in the lookup get 0.
df['Line'] = df['Line'].map(line_frequencies).fillna(0)

In [12]:
# Persist the transformed frame (without the index column) and confirm the file name.
output_csv = 'dataset_with_has_delay.csv'
df.to_csv(output_csv, index=False)
print(f"\nArchivo actualizado guardado como '{output_csv}'")


Archivo actualizado guardado como 'dataset_with_has_delay.csv'


In [13]:
# Sanity check that no text columns are left after encoding.
# NOTE: only object-dtype columns are selected here; other non-numeric dtypes
# (e.g. datetime or category) would not be reported.
non_numeric_columns = df.select_dtypes(include='object').columns
print("Non-numeric columns:\n", non_numeric_columns)

# For each remaining object column, list its distinct values.
for column in non_numeric_columns:
    print(f"\nUnique values in column '{column}':", df[column].unique(), sep="\n")

Non-numeric columns:
 Index([], dtype='object')


In [14]:
# Plain-text preview of the first rows of the transformed frame.
preview = df.head()
print(preview)

   Week      Line  Day  Location  Incident  Min Delay  Min Gap  Vehicle  \
0     1  0.126664    1  0.000992        11         10       20     4416   
1     1  0.126664    1  0.000361         3         52       72     4461   
2     1  0.064381    1  0.000180        11          0        0     4545   
3     1  0.126664    1  0.020566        11         37        0     4551   
4     1  0.002645    1  0.005412         2         78       93     8116   

   has_delay  Year  Month  Day_of_Month  Hour  Minute  Bound_B  Bound_E  \
0          0  2024      1             1     2      45        0        0   
1          1  2024      1             1     3       6        0        1   
2          0  2024      1             1     3      21        0        0   
3          1  2024      1             1     3      53        0        0   
4          1  2024      1             1     4      27        0        1   

   Bound_N  Bound_S  Bound_W  
0        0        0        1  
1        0        0        0  
2    

In [15]:
# Count of missing values in each column.
missing_values = df.isna().sum()
print("Valores nulos por columna:\n", missing_values)

# The same counts expressed as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(df)).mul(100)
print("\nPorcentaje de valores nulos por columna:\n", missing_percentage)

Valores nulos por columna:
 Week            0
Line            0
Day             0
Location        0
Incident        0
Min Delay       0
Min Gap         0
Vehicle         0
has_delay       0
Year            0
Month           0
Day_of_Month    0
Hour            0
Minute          0
Bound_B         0
Bound_E         0
Bound_N         0
Bound_S         0
Bound_W         0
dtype: int64

Porcentaje de valores nulos por columna:
 Week            0.0
Line            0.0
Day             0.0
Location        0.0
Incident        0.0
Min Delay       0.0
Min Gap         0.0
Vehicle         0.0
has_delay       0.0
Year            0.0
Month           0.0
Day_of_Month    0.0
Hour            0.0
Minute          0.0
Bound_B         0.0
Bound_E         0.0
Bound_N         0.0
Bound_S         0.0
Bound_W         0.0
dtype: float64
