In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def prepare_data(df):
    cols = ['PUlocationID', 'DOlocationID', 'trip_duration']
    

    df['trip_duration'] = df['dropOff_datetime'] - df['pickup_datetime'] 
    df = df[cols]
    df['trip_duration'] = df['trip_duration'].dt.total_seconds() / 60
    df = df[(df.trip_duration > 1) & (df.trip_duration < 60)]

    df = df.fillna(-1)

    df[['PUlocationID', 'DOlocationID']] = df[['PUlocationID', 'DOlocationID']].astype(str)

    return df
    
    

In [3]:
dataset_january = pd.read_parquet('.data/.fhv_tripdata_2021-01.parquet')
dataset_february = pd.read_parquet('.data/.fhv_tripdata_2021-02.parquet')

Q1. Read the data for January. How many records are there? -> 1154112

In [4]:
print(dataset_january.shape)

(1154112, 7)


Q2. What's the average trip duration in January? -> 19 min 10 s 

In [5]:
dataset_january['trip_duration'] = dataset_january['dropOff_datetime'] - dataset_january['pickup_datetime'] 
print(dataset_january.trip_duration.mean())

0 days 00:19:10.033445627


Q3. The features we'll user for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1"

What's the factions of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

In [6]:
df = pd.read_parquet('.data/.fhv_tripdata_2021-01.parquet')
print(df.isna().sum() / len(df))

dispatching_base_num      0.000000
pickup_datetime           0.000000
dropOff_datetime          0.000000
PUlocationID              0.830307
DOlocationID              0.140558
SR_Flag                   1.000000
Affiliated_base_number    0.000767
dtype: float64


Q4. Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model :

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it

 What's the dimensionality of this matrix? (The number of columns)

In [11]:
df_jan = pd.read_parquet('.data/.fhv_tripdata_2021-01.parquet')
df_jan = prepare_data(df_jan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trip_duration'] = df['trip_duration'].dt.total_seconds() / 60


In [15]:
y_train = df_jan['trip_duration']
X_train = df_jan.drop(columns=['trip_duration'])

X_train = X_train.to_dict('records')
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(X_train)

Q5. Now let's use the feature matrix from the previous step to train a model :

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

What's the RMSE on train?



In [16]:
# Train and evaluate a linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
print(mean_squared_error(y_train, y_pred, squared = False))




10.39316304284463


Q6. Now let's apply this model to the validation dataset.

What's the RMSE on validation?



In [17]:
df_feb = pd.read_parquet('.data/.fhv_tripdata_2021-02.parquet')
df_feb = prepare_data(df_feb)

y_val = df_feb.trip_duration
X_val = df_feb.drop(columns=['trip_duration'])

X_val = X_val.to_dict('records')

X_val = vectorizer.transform(X_val)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trip_duration'] = df['trip_duration'].dt.total_seconds() / 60


In [18]:
y_pred_val = regressor.predict(X_val)
print(mean_squared_error(y_val, y_pred_val, squared = False))

10.902406154063145
