In [2]:
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the preprocessed dataset from its parquet file into a DataFrame.
processed_data_path = "../data/processed/yellow_processed_2022_05.parquet"
cleaned_df = pd.read_parquet(path=processed_data_path)

def plot_distribution(df, column, title, xlabel, bins=30):
    """Draw a histogram with a KDE overlay for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to plot.
    column : str
        Name of the column whose values are plotted. A missing column
        raises ``KeyError``; guard at the call site when it is optional.
    title : str
        Title placed above the axes.
    xlabel : str
        Label for the x axis.
    bins : int, default 30
        Number of histogram bins.
    """
    # Explicit axes instead of the implicit pyplot "current figure" so each
    # call owns its own figure.
    _fig, ax = plt.subplots()
    sns.histplot(df[column], kde=True, bins=bins, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel="Frequency")
    plt.show()


# Visualize the data
# Distribution of trip distance
plot_distribution(
    cleaned_df,
    'trip_distance',
    "Distribution of Trip Distance",
    "Trip Distance (miles)",
)

# Distribution of fare amount (drawn only when the column is present)
if 'fare_amount' in cleaned_df.columns:
    plot_distribution(
        cleaned_df,
        'fare_amount',
        "Distribution of Fare Amount",
        "Fare Amount ($)",
    )

# Correlation heatmap, computed over the numeric columns only.
numeric_df = cleaned_df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,       # write each coefficient inside its cell
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

In [None]:
# Show the column labels of cleaned_df; as the cell's last expression this is its displayed output.
cleaned_df.columns

In [None]:
# Columns the model predicts (two outputs).
target = [
    'trip_duration',
    'fare_amount',
]

# Columns used as model inputs.
features = [
    'trip_distance',
    'store_and_fwd_flag',
    'Borough_pu',
    'Zone_pu',
    'service_zone_pu',
    'Borough_do',
    'Zone_do',
    'service_zone_do',
    'day_of_week_pu',
    'hour_of_day_pu',
    'time_of_day_pu',
]

In [None]:

# Split the frame column-wise into model inputs (X) and prediction targets (y).
X = cleaned_df.loc[:, features]
y = cleaned_df.loc[:, target]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Input columns routed to the categorical transformer.
categorical_features = [
    'store_and_fwd_flag',
    'Borough_pu',
    'Zone_pu',
    'service_zone_pu',
    'Borough_do',
    'Zone_do',
    'service_zone_do',
    'time_of_day_pu',
]

# Input columns routed to the numerical transformer.
numerical_features = [
    'trip_distance',
    'day_of_week_pu',
    'hour_of_day_pu',
]

In [None]:
# Numeric columns: StandardScaler with its default settings.
numerical_transformer = StandardScaler()
# Categorical columns: one-hot encoding; handle_unknown="ignore" keeps transform
# from raising on categories that were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
# (name, transformer, columns) triples: each transformer is applied only to its
# own column list, numeric block first, categorical block second.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Fit the preprocessor on the training split only, then reuse the fitted
# preprocessor on the test split so no test-set statistics are learned.
# NOTE: both names are rebound to the transformed matrices, so this cell is
# meant to run once per fresh split.
X_train = preprocessor.fit_transform(X=X_train)
X_test = preprocessor.transform(X=X_test)

In [None]:

# Replace the target DataFrames with their underlying NumPy arrays.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [None]:

# Cast every model input and target to single precision.
model_dtype = np.float32
X_train = X_train.astype(model_dtype)
X_test = X_test.astype(model_dtype)
y_train = y_train.astype(model_dtype)
y_test = y_test.astype(model_dtype)

In [None]:
dt_model = DecisionTreeRegressor(random_state=42)
print(X_train.shape)

# Train the model while reporting progress.
#
# DecisionTreeRegressor.fit rebuilds the tree from scratch on every call (it is
# not incremental), so refitting on every prefix X_train[:1], X_train[:2], ...
# costs one full fit per training row. Instead, refit on `n_steps` growing
# prefixes of the training split. The last prefix is the complete training
# set, so the final fitted model is the same one the per-row loop would end on.
best_mse = float('inf')  # lowest test MSE seen so far; reported only, not used to select a model
n_steps = 10
n_train = X_train.shape[0]
# Strictly increasing prefix sizes ending at n_train; drop 0 (tiny splits) and
# de-duplicate when n_train < n_steps.
subset_sizes = sorted({(n_train * step) // n_steps for step in range(1, n_steps + 1)} - {0})
print("Entrenando Decision Tree...")
for i in subset_sizes:
    dt_model.fit(X_train[:i], y_train[:i])  # full refit on the first i training rows
    y_partial_pred = dt_model.predict(X_test)
    mse_partial = mean_squared_error(y_test, y_partial_pred)
    if mse_partial < best_mse:
        best_mse = mse_partial  # track the best partial MSE
    # "\r" rewrites the same console line on every step.
    sys.stdout.write(f"\rIteración {i}/{n_train}: MSE parcial = {mse_partial:.4f}, Mejor MSE = {best_mse:.4f}")
    sys.stdout.flush()
# Finish the progress line so later output starts on a fresh line.
sys.stdout.write("\n")

# Evaluate the final model: predict on the test split, then score the
# predictions with mean squared error and the R^2 coefficient.
y_pred = dt_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Error cuadrático medio (MSE): {mse:.4f}")
print(f"Coeficiente de determinación (R^2): {r2:.4f}")

# Feature importances.
# The tree was trained on the preprocessor's output, which has one column per
# scaled numeric feature plus one per one-hot category, so the importances must
# be labelled with the transformed column names. Using the raw X.columns would
# give an index whose length does not match feature_importances_.
feature_importances = pd.DataFrame(
    dt_model.feature_importances_,  # one importance value per transformed column
    index=preprocessor.get_feature_names_out(),  # names of the transformed columns
    columns=['Importancia'],
).sort_values(by='Importancia', ascending=False)  # most important first

print("\nImportancia de las características:")
print(feature_importances)