# New York Taxi - Predict Duration based on Pickup and Dropoff

In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import root_mean_squared_error

In [22]:
# Render floats with three decimal places whenever pandas displays them
pd.set_option('display.float_format', '{:.3f}'.format)

In [23]:
# Read the data source and return the pandas DataFrame
def read_datasource(filename) -> pd.DataFrame:
    """Load a trip-record parquet file and prepare it for duration modelling.

    The file is read with ``pd.read_parquet``.  A ``duration`` column holding
    the trip length in minutes is derived from ``lpep_dropoff_datetime`` minus
    ``lpep_pickup_datetime``; only trips lasting strictly more than 1 minute
    and strictly less than 60 minutes are kept.  ``PULocationID`` and
    ``DOLocationID`` are converted to strings so they can be used as
    categorical features.

    Args:
        filename: Whatever ``pd.read_parquet`` accepts as its first argument
            (the callers in this file pass a URL).

    Returns:
        A new DataFrame (independent copy of the filtered rows) with the
        added ``duration`` column and string-typed location ID columns.
    """
    # Read the data file
    df = pd.read_parquet(filename)

    # Trip duration in minutes.  The ``.dt`` accessor converts the whole
    # timedelta column at once instead of calling a Python lambda per row.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips with 1 < duration < 60 minutes (both bounds are exclusive).
    in_range = (df['duration'] > 1) & (df['duration'] < 60)
    # ``.copy()`` makes the filtered frame independent of the unfiltered one,
    # so the column assignment below is not a chained assignment on a slice
    # (which triggers SettingWithCopyWarning on pandas < 3).
    df = df[in_range].copy()

    # Create Categorical columns
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [None]:
# Build the feature matrices (X_train, X_val) and the target vectors (y_train, y_val)
# from a training DataFrame and a validation DataFrame.

def create_train_or_validation_data(
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
) -> tuple[csr_matrix, np.ndarray, csr_matrix, np.ndarray]:
    """Vectorize the features of both frames and extract their targets.

    The features are the pickup/dropoff location IDs (``PULocationID``,
    ``DOLocationID``) and the numeric ``trip_distance``.  A single
    ``DictVectorizer`` is fitted on the training rows only and then reused to
    transform the validation rows, so both matrices share the same columns.
    Per the scikit-learn documentation, feature values that were not seen
    during fitting are silently ignored when transforming.

    Args:
        df_train: Training rows; must contain the three feature columns and
            the ``duration`` target column.
        df_val: Validation rows with the same columns.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` where the ``X_*`` items are the
        matrices produced by the vectorizer and the ``y_*`` items are the
        ``duration`` values of the corresponding frame.
    """
    # Categorical features: pickup and dropoff location IDs.
    # Numerical feature: the trip distance.
    categorical = ["PULocationID", "DOLocationID"]
    numerical = ["trip_distance"]
    features = categorical + numerical

    # Transforms lists of feature-value mappings to vectors
    dv = DictVectorizer()

    # Training rows: learn the feature-to-column mapping and encode them
    train_dicts = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)

    # Validation rows: encode with the mapping learned from the training rows
    val_dicts = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    # Targets: the values of the 'duration' column of each frame
    target = 'duration'
    y_train = df_train[target].values
    y_val = df_val[target].values

    return X_train, y_train, X_val, y_val

#### New York Green Taxi, predict the trip duration

In [28]:
# Green Taxi New York trip records: the 2023-01 file is used for training,
# the 2023-02 file for validation.

df_train = read_datasource(
    filename="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet"
)
df_val = read_datasource(
    filename="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet"
)

# Turn both frames into feature matrices and target vectors
X_train, y_train, X_val, y_val = create_train_or_validation_data(
    df_train=df_train,
    df_val=df_val,
)

### Running Machine learning models on the data to train and validate predictions

#### Linear Regression model

In [None]:
# Fit a linear regression model on the training features and targets
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict the targets for the validation features
y_predict_lin = lin_reg.predict(X_val)

# Root mean squared error between the validation targets and the predictions
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_predict_lin)

print(f"The Root Mean Squared Error Regression Loss is {rmse}")

#### Lasso Regression

In [None]:
# Fit a Lasso regression model (default settings) on the training data
lasso = Lasso().fit(X_train, y_train)

# Predict on the validation features and score the predictions with RMSE
y_predict_lasso = lasso.predict(X_val)
rmse_lasso = root_mean_squared_error(y_true=y_val, y_pred=y_predict_lasso)

print(f"The Root Mean Squared Error Regression Loss is {rmse_lasso}")

#### Ridge Regression

In [None]:
# Fit a Ridge regression model (default settings) on the training data
ridge = Ridge().fit(X_train, y_train)

# Predict on the validation features and score the predictions with RMSE
y_predict_ridge = ridge.predict(X_val)
rmse_ridge = root_mean_squared_error(y_true=y_val, y_pred=y_predict_ridge)

print(f"The Root Mean Squared Error Regression Loss is {rmse_ridge}")

#### ElasticNet Regression

In [None]:
# Fit an ElasticNet regression model (default settings) on the training data
elastic = ElasticNet().fit(X_train, y_train)

# Predict on the validation features and score the predictions with RMSE
y_predict_elastic = elastic.predict(X_val)
rmse_elastic = root_mean_squared_error(y_true=y_val, y_pred=y_predict_elastic)

print(f"The Root Mean Squared Error Regression Loss is {rmse_elastic}")
