In [26]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


## Q1. Downloading the data

We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records".

Download the data for January and February 2021.

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. How many records are there?

* 1054112
* 1154112
* 1254112
* 1354112

In [27]:
# Ensure the ./data download folder exists (no error when it is already there)
(Path.cwd() / "data").mkdir(parents=True, exist_ok=True)

In [28]:
# Download  For-Hire Vehicle Trip Records taxi data from NYC taxi data for january and febuary 2021
# January URL : https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
# Febuary URL : https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet 
JANUARY_URL = 'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet'
FEBUARY_URL = 'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet'
!wget $JANUARY_URL -P data/
!wget $FEBUARY_URL -P data/

--2022-05-21 22:27:04--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.82.108
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.82.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11886281 (11M) [binary/octet-stream]
Saving to: ‘data/fhv_tripdata_2021-01.parquet.2’


2022-05-21 22:27:08 (4.61 MB/s) - ‘data/fhv_tripdata_2021-01.parquet.2’ saved [11886281/11886281]

--2022-05-21 22:27:08--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.82.108
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.82.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10645466 (10M) [binary/octet-stream]
Saving to: ‘data/fhv_tripdata_2021-02.parquet.2’


2022-05-21 22:27:12 (4.30 MB/s) - ‘data/fhv_tripdata_2021-02.parq

In [29]:
# Read the January FHV trip records from the local data folder
jan_df = pd.read_parquet(Path.cwd().joinpath("data", "fhv_tripdata_2021-01.parquet"))

In [30]:
# Q1 answer: number of rows in the January frame
print(f'There are {len(jan_df)} recoreds')

There are 1154112 recoreds



## Q2. Computing duration

Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 

What's the average trip duration in January?

* 15.16
* 19.16
* 24.16
* 29.16

In [31]:
# Inspect the available column names before computing the trip duration
jan_df.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number'],
      dtype='object')

In [32]:
# Trip duration in minutes. The .dt accessor converts the whole timedelta
# column at once instead of calling a Python lambda for every row.
jan_df['duration'] = (
    jan_df.dropOff_datetime - jan_df.pickup_datetime
).dt.total_seconds() / 60

In [33]:
# Q2 answer: average trip duration (minutes) in January
jan_df.duration.mean()

19.167224093791006

In [34]:
# Replace every missing value with the sentinel -1
jan_df = jan_df.fillna(value=-1)

In [35]:
# Summary statistics with extra percentiles (53%, 63%, 73%, 83%) to look at the
# distribution of the location IDs and of the duration
jan_df.describe(percentiles=[0.53, 0.63, 0.73, 0.83])

Unnamed: 0,PUlocationID,DOlocationID,SR_Flag,duration
count,1154112.0,1154112.0,1154112.0,1154112.0
mean,22.90294,116.6559,-1.0,19.16722
std,61.23654,88.4866,0.0,398.6922
min,-1.0,-1.0,-1.0,0.01666667
50%,-1.0,97.0,-1.0,13.4
53%,-1.0,119.0,-1.0,14.16667
63%,-1.0,165.0,-1.0,17.13333
73%,-1.0,192.0,-1.0,21.21667
83%,-1.0,225.0,-1.0,27.63333
max,265.0,265.0,-1.0,423371.0


# Data preparation

Check the distribution of the duration variable. There are some outliers. 

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop? 


In [36]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the result an independent frame, so the later column
# assignment on jan_df does not raise SettingWithCopyWarning.
jan_df = jan_df[(jan_df.duration >= 1) & (jan_df.duration <= 60)].copy()

## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer 
* Get a feature matrix from it

What's the dimensionality of this matrix? (The number of columns).

* 2
* 152
* 352
* 525
* 725

In [37]:
# Model inputs: pickup and drop-off location IDs, cast to strings so that
# DictVectorizer treats them as categories
features = ['PUlocationID', 'DOlocationID']
jan_df[features] = jan_df[features].astype(str)

# Value to predict: trip duration in minutes
target = 'duration'
y_train = jan_df[target].values

# One dict per row, then fit the vectorizer and build the feature matrix
train_dicts = jan_df[features].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)




In [38]:
# Q4 answer: number of columns in the one-hot encoded feature matrix
print(f'There are {X_train.shape[1]} columns.')

There are 525 columns.


## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model. 

* Train a plain linear regression model with default parameters 
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

* 5.52
* 10.52
* 15.52
* 20.52

In [39]:
# Fit a plain linear regression (default parameters) on the training matrix
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root of the MSE explicitly.
np.sqrt(mean_squared_error(y_train, y_pred))


10.528519388232237

## Q6. Evaluating the model

Now let's apply this model to the validation dataset (Feb 2021). 

What's the RMSE on validation?

In [42]:
# Load the February dataframe and replace missing values with -1
feb_df = pd.read_parquet(Path.cwd() / "data" / "fhv_tripdata_2021-02.parquet")
feb_df = feb_df.fillna(-1)
# Trip duration in minutes, computed on the whole column via the .dt accessor
# rather than with a per-row Python lambda
feb_df['duration'] = (
    feb_df.dropOff_datetime - feb_df.pickup_datetime
).dt.total_seconds() / 60


In [43]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the result an independent frame, so the later column
# assignment on feb_df does not raise SettingWithCopyWarning.
feb_df = feb_df[(feb_df.duration >= 1) & (feb_df.duration <= 60)].copy()

In [44]:
# Same two categorical inputs as for training, cast to strings
features = ['PUlocationID', 'DOlocationID']
feb_df[features] = feb_df[features].astype(str)

# Value to predict: trip duration in minutes
target = 'duration'
y_val = feb_df[target].values

# One dict per row, encoded with the vectorizer already fitted on January
val_dicts = feb_df[features].to_dict(orient='records')
X_val = dv.transform(val_dicts)


In [45]:

y_val_pred = lr.predict(X_val)

# RMSE on the validation data. `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
np.sqrt(mean_squared_error(y_val, y_val_pred))



11.014286813221993

# Cleaned-up version

In [62]:
def load_data(filename: str)-> pd.DataFrame:
    """Load one month of taxi trip records and prepare it for modelling.

    The parquet file is read from ``<current working directory>/data/<filename>``.
    The trip duration in minutes is derived from ``dropOff_datetime`` and
    ``pickup_datetime``; only trips lasting between 1 and 60 minutes
    (inclusive) are kept. Missing location IDs are replaced with -1 and both
    location ID columns are converted to strings.

    Args:
        filename (str): name of the parquet file inside the ``data`` folder

    Returns:
        pd.DataFrame: frame with the columns ``PUlocationID``,
        ``DOlocationID`` (both as strings) and ``duration`` (minutes)
    """
    # Categorical model inputs
    features = ['PUlocationID', 'DOlocationID']

    raw = pd.read_parquet(Path.cwd() / "data" / filename)

    # Duration in minutes, computed on the whole column at once instead of
    # calling a Python lambda per row
    duration = (raw.dropOff_datetime - raw.pickup_datetime).dt.total_seconds() / 60

    # Rows with a duration in [1, 60]; a missing duration compares as False
    keep = duration.between(1, 60)

    # Fill and cast only the feature columns. Building a fresh frame here
    # (rather than assigning into a filtered slice) avoids
    # SettingWithCopyWarning and leaves the datetime columns untouched.
    df = raw.loc[keep, features].fillna(-1).astype(str)
    df['duration'] = duration[keep]

    print(f"""Load data from {filename}
        columns : {list(df.columns)}
    """)

    return df

In [74]:


def get_feature_and_target(df: pd.DataFrame, target_column: str, feature_columns):
    """Split a dataframe into model inputs and the values to predict.

    Args:
        df (pd.DataFrame): source dataframe
        target_column (str): name of the column holding the target
        feature_columns: column label(s) to select as features

    Returns:
        tuple: ``(features, target)`` where ``features`` is
        ``df[feature_columns]`` and ``target`` is ``df[target_column].values``
    """
    y = df[target_column].values
    X = df[feature_columns]
    return X, y



In [64]:
# January is the training month, February the validation month
train_df, val_df = (
    load_data(month_file)
    for month_file in ("fhv_tripdata_2021-01.parquet", "fhv_tripdata_2021-02.parquet")
)

Load data from fhv_tripdata_2021-01.parquet
        columns : ['PUlocationID', 'DOlocationID', 'duration']
    
Load data from fhv_tripdata_2021-02.parquet
        columns : ['PUlocationID', 'DOlocationID', 'duration']
    


In [86]:
# Training split: inputs (X) and target (y); inputs become one dict per row
X_train, y_train = get_feature_and_target(train_df, 'duration', ['PUlocationID', 'DOlocationID'])
X_train = X_train.to_dict(orient='records')

# Validation split, prepared the same way
X_val, y_val = get_feature_and_target(val_df, 'duration', ['PUlocationID', 'DOlocationID'])
X_val = X_val.to_dict(orient='records')

In [87]:
# Model training

## Vectorizer: fit on the training records only, then encode both splits
dv = DictVectorizer().fit(X_train)
X_train = dv.transform(X_train)
X_val = dv.transform(X_val)


# Fit the linear regression on the encoded training data
lr = LinearRegression()
lr.fit(X_train, y_train)


In [94]:
# Report the RMSE on both splits. `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly.
print("Evaluation on Training dataset")
y_train_pred = lr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train, y_train_pred)), end="\n\n")

y_val_pred = lr.predict(X_val)
print("Evaluation on validation dataset")
print(np.sqrt(mean_squared_error(y_val, y_val_pred)), end="\n\n")


Evaluation on Training dataset
10.528519388232237

Evaluation on validation dataset
11.014286813221993

