In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [2]:
# Paso 1: Preprocesamiento de datos
data_taxi = pd.read_csv(
    "C://Users//Andres//Desktop//AnyoneAI//Proyecto_Final//dataframe.csv"
)  # Reemplaza 'data_taxi.csv' con la ubicación correcta del archivo
data_taxi = data_taxi.dropna()  # Eliminar filas con valores faltantes o NaN

In [3]:
# Agregar la extracción de horas y dias de la semana

In [4]:
# Convierte la columna 'tpep_pickup_datetime' en formato datetime
data_taxi["tpep_pickup_datetime"] = pd.to_datetime(data_taxi["tpep_pickup_datetime"])
data_taxi["tpep_dropoff_datetime"] = pd.to_datetime(data_taxi["tpep_dropoff_datetime"])

data_taxi["duration"] = (
    data_taxi["tpep_dropoff_datetime"] - data_taxi["tpep_pickup_datetime"]
).dt.total_seconds() / 60

data_taxi["pickup_day"] = data_taxi["tpep_pickup_datetime"].dt.day
data_taxi["pickup_hour"] = data_taxi["tpep_pickup_datetime"].dt.hour

data_taxi["dropoff_day"] = data_taxi["tpep_dropoff_datetime"].dt.day
data_taxi["dropoff_hour"] = data_taxi["tpep_dropoff_datetime"].dt.hour

data_taxi = data_taxi.drop("Unnamed: 0", axis=1)

In [5]:
data_taxi

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,store_and_fwd_flag_encoded,pickup_day,pickup_hour,dropoff_day,dropoff_hour
0,1,2022-05-01 00:00:36,2022-05-01 00:19:18,1.0,6.6010,1.0,N,246,151,2,...,0.3,20.80,2.5,0.0,18.700000,0,1,0,1,0
1,1,2022-05-01 00:27:44,2022-05-01 00:41:33,1.0,3.7030,1.0,N,238,74,2,...,0.3,14.80,2.5,0.0,13.816667,0,1,0,1,0
2,1,2022-05-01 00:59:00,2022-05-01 01:14:22,1.0,6.7620,1.0,N,163,260,2,...,0.3,19.30,2.5,0.0,15.366667,0,1,0,1,1
3,1,2022-05-01 00:28:26,2022-05-01 00:37:49,1.0,2.5760,1.0,N,238,75,1,...,0.3,13.55,2.5,0.0,9.383333,0,1,0,1,0
4,2,2022-04-30 23:53:47,2022-05-01 00:05:17,1.0,3.0107,1.0,N,249,164,1,...,0.3,15.96,2.5,0.0,11.500000,0,30,23,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3168605,2,2022-05-31 23:32:53,2022-05-31 23:38:24,1.0,2.1413,1.0,N,161,236,1,...,0.3,12.96,2.5,0.0,5.516667,0,31,23,31,23
3168606,2,2022-05-31 23:50:58,2022-05-31 23:57:27,1.0,3.0429,1.0,N,142,238,1,...,0.3,15.34,2.5,0.0,6.483333,0,31,23,31,23
3168607,2,2022-05-31 23:43:33,2022-05-31 23:57:09,1.0,5.5223,1.0,N,186,43,1,...,0.3,17.30,2.5,0.0,13.600000,0,31,23,31,23
3168608,2,2022-05-31 23:01:31,2022-05-31 23:04:42,2.0,1.5295,1.0,N,239,238,1,...,0.3,10.56,2.5,0.0,3.183333,0,31,23,31,23


In [6]:
(data_taxi.loc[:, data_taxi.columns[-9:]] < 0).sum()

total_amount                  0
congestion_surcharge          0
airport_fee                   0
duration                      0
store_and_fwd_flag_encoded    0
pickup_day                    0
pickup_hour                   0
dropoff_day                   0
dropoff_hour                  0
dtype: int64

In [7]:
data_taxi["RatecodeID"].value_counts()

RatecodeID
1.0    3033928
2.0     129395
5.0       4755
4.0        518
3.0          9
6.0          5
Name: count, dtype: int64

In [8]:
data_taxi = data_taxi.sample(10000, random_state=42)

In [9]:
(data_taxi.loc[:, data_taxi.columns[-9:]] < 0).sum()

total_amount                  0
congestion_surcharge          0
airport_fee                   0
duration                      0
store_and_fwd_flag_encoded    0
pickup_day                    0
pickup_hour                   0
dropoff_day                   0
dropoff_hour                  0
dtype: int64

In [10]:
# Paso 2: Dividir los datos en características (X) y etiquetas (y)
X = data_taxi[
    [
        "VendorID",
        # "tpep_pickup_datetime",
        "PULocationID",
        "DOLocationID",
        "trip_distance",
        "passenger_count",
        "RatecodeID",
        "payment_type",
        "improvement_surcharge",
        "pickup_day",
        "pickup_hour",
    ]
]  # Características
y = data_taxi[["duration"]]  # Etiquetas

In [11]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Escalar los datos utilizando StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
model = Sequential(
    [
        Dense(32, activation="relu", input_shape=(X_train_scaled.shape[1],)),
        Dense(16, activation="relu"),
        Dense(1),
    ]
)

In [13]:
model.compile(optimizer="adam", loss="mean_squared_error")

In [14]:
model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    batch_size=16,
    validation_data=(X_test_scaled, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21e30446190>

In [15]:
loss = model.evaluate(X_test_scaled, y_test)
print("Mean Squared Error:", loss)

Mean Squared Error: 41.24875259399414


In [16]:
import xgboost as xgb

xgboost = xgb.XGBRegressor(random_state=42)

In [17]:
X_xgb = data_taxi[
    [
        "VendorID",
        # "tpep_pickup_datetime",
        "PULocationID",
        "DOLocationID",
        "trip_distance",
        "passenger_count",
        "RatecodeID",
        "payment_type",
        "improvement_surcharge",
        "pickup_day",
        "pickup_hour",
    ]
]  # Características
y_xgb = data_taxi[["duration"]]  # Etiquetas

In [18]:
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X_xgb, y_xgb, test_size=0.2, random_state=42
)

In [20]:
xgboost.fit(X_test_xgb, y_test_xgb)

In [21]:
y_pred = xgboost.predict(X_test_xgb)

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_xgb, y_pred)
mae = mean_absolute_error(y_test_xgb, y_pred)
mae = r2_score(y_test_xgb, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R^2:", mae)

MSE: 1.4602320631563452
MAE: 0.9912419299924524
R^2: 0.9912419299924524


In [None]:
asdasd

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Cargar el dataframe 'data_taxi'
data_taxi = pd.read_csv(
    "ruta_del_archivo.csv"
)  # Asegúrate de reemplazar 'ruta_del_archivo.csv' con la ruta correcta de tu archivo

# Dividir el dataframe en características (X) y objetivos (y) para el primer modelo
X_all = data_taxi.drop(["fare_amount", "duration"], axis=1)
y_all = data_taxi[["fare_amount", "duration"]]

# Normalizar los datos de características para el primer modelo
scaler_all = MinMaxScaler()
X_all_normalized = scaler_all.fit_transform(X_all)

# Dividir los datos en conjuntos de entrenamiento y prueba para el primer modelo
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(
    X_all_normalized, y_all, test_size=0.2, random_state=42
)

# Crear el primer modelo de red neuronal para todas las características
model_all = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(
            64, activation="relu", input_shape=(X_all_train.shape[1],)
        ),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(
            2
        ),  # 2 neuronas de salida para 'fare_amount' y 'duration'
    ]
)

# Compilar y entrenar el primer modelo
model_all.compile(optimizer="adam", loss="mse")
model_all.fit(
    X_all_train,
    y_all_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_all_test, y_all_test),
)

# Crear el segundo modelo sin las variables 'PULocationID' y 'DOLocationID'
X_partial = data_taxi.drop(
    ["PULocationID", "DOLocationID", "fare_amount", "duration"], axis=1
)
y_partial = data_taxi[["fare_amount", "duration"]]

# Normalizar los datos de características para el segundo modelo
scaler_partial = MinMaxScaler()
X_partial_normalized = scaler_partial.fit_transform(X_partial)

# Dividir los datos en conjuntos de entrenamiento y prueba para el segundo modelo
X_partial_train, X_partial_test, y_partial_train, y_partial_test = train_test_split(
    X_partial_normalized, y_partial, test_size=0.2, random_state=42
)

# Crear el segundo modelo de red neuronal sin las variables 'PULocationID' y 'DOLocationID'
model_partial = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(
            64, activation="relu", input_shape=(X_partial_train.shape[1],)
        ),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(
            2
        ),  # 2 neuronas de salida para 'fare_amount' y 'duration'
    ]
)

# Compilar y entrenar el segundo modelo
model_partial.compile(optimizer="adam", loss="mse")
model_partial.fit(
    X_partial_train,
    y_partial_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_partial_test, y_partial_test),
)

# Solicitar al usuario las zonas de pickup y dropoff
pickup_zone = int(input("Ingrese la zona de recogida (PULocationID): "))
dropoff_zone = int(input("Ingrese la zona de entrega (DOLocationID): "))

# Preparar los datos para la predicción
input_data = pd.DataFrame(
    [[pickup_zone, dropoff_zone]], columns=["PULocationID", "DOLocationID"]
)
input_data_normalized_all = scaler_all.transform(input_data)
input_data_normalized_partial = scaler_partial.transform(
    input_data.drop(["PULocationID", "DOLocationID"], axis=1)
)

# Realizar la predicción de 'fare_amount' y 'duration' utilizando el primer modelo
prediction_all = model_all.predict(input_data_normalized_all)

# Realizar la predicción de 'fare_amount' y 'duration' utilizando el segundo modelo
prediction_partial = model_partial.predict(input_data_normalized_partial)

print("Prediction (using all features):")
print("fare_amount:", prediction_all[0][0])
print("duration:", prediction_all[0][1])

print("Prediction (excluding PULocationID and DOLocationID):")
print("fare_amount:", prediction_partial[0][0])
print("duration:", prediction_partial[0][1])