In [140]:
import pandas as pd
import pyarrow as pa
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np


In [141]:
january = pd.read_parquet("/Users/anandramaswamy/ml-ops-zoomcamp/01-intro/yellow_tripdata_2023-01.parquet")
february = pd.read_parquet("/Users/anandramaswamy/ml-ops-zoomcamp/01-intro/yellow_tripdata_2023-02.parquet")

In [142]:
january.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [143]:
len(january.columns)

19

In [144]:
january['duration'] = (january['tpep_dropoff_datetime'] - january['tpep_pickup_datetime']).dt.total_seconds() / 60
january['duration'].std()


42.594351241920904

In [145]:
january['duration'] = (january['tpep_dropoff_datetime'] - january['tpep_pickup_datetime']).dt.total_seconds() / 60
filtered = january[(january['duration'] >= 1) & (january['duration'] <= 60)]
fraction = len(filtered) / len(january)
print(f"Fraction remaining: {fraction:.4f} or {fraction * 100:.2f}%")

Fraction remaining: 0.9812 or 98.12%


In [146]:
# Drop rows with missing PU or DO before encoding
df = january[["PULocationID", "DOLocationID"]].dropna().copy()

# Now cast to string
df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)

# Continue as before
train_dicts = df.to_dict(orient="records")
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
print(X_train.shape)  # Should be (N, 515)


(3066766, 518)


In [147]:
print(dv.get_feature_names_out())
print(len(dv.get_feature_names_out()))


['DOLocationID=1' 'DOLocationID=10' 'DOLocationID=100' 'DOLocationID=101'
 'DOLocationID=102' 'DOLocationID=105' 'DOLocationID=106'
 'DOLocationID=107' 'DOLocationID=108' 'DOLocationID=109'
 'DOLocationID=11' 'DOLocationID=111' 'DOLocationID=112'
 'DOLocationID=113' 'DOLocationID=114' 'DOLocationID=115'
 'DOLocationID=116' 'DOLocationID=117' 'DOLocationID=118'
 'DOLocationID=119' 'DOLocationID=12' 'DOLocationID=120'
 'DOLocationID=121' 'DOLocationID=122' 'DOLocationID=123'
 'DOLocationID=124' 'DOLocationID=125' 'DOLocationID=126'
 'DOLocationID=127' 'DOLocationID=128' 'DOLocationID=129'
 'DOLocationID=13' 'DOLocationID=130' 'DOLocationID=131'
 'DOLocationID=132' 'DOLocationID=133' 'DOLocationID=134'
 'DOLocationID=135' 'DOLocationID=136' 'DOLocationID=137'
 'DOLocationID=138' 'DOLocationID=139' 'DOLocationID=14'
 'DOLocationID=140' 'DOLocationID=141' 'DOLocationID=142'
 'DOLocationID=143' 'DOLocationID=144' 'DOLocationID=145'
 'DOLocationID=146' 'DOLocationID=147' 'DOLocationID=148'
 '

In [148]:
# 1. Filter rows where duration is between 1 and 60 minutes
df = january[(january["duration"] >= 1) & (january["duration"] <= 60)].copy()

# 2. Select only pickup/dropoff, drop NaNs, and cast to string
df = df[["PULocationID", "DOLocationID", "duration"]].dropna()
df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)

# 3. One-hot encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
train_dicts = df[["PULocationID", "DOLocationID"]].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

# 4. Set target
y_train = df["duration"].values

# 5. Train and evaluate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print(f"RMSE: {rmse:.2f}")


RMSE: 7.65


In [149]:
# 1. Compute duration
february["duration"] = (february["tpep_dropoff_datetime"] - february["tpep_pickup_datetime"]).dt.total_seconds() / 60

# 2. Filter duration
df_val = february[(february["duration"] >= 1) & (february["duration"] <= 60)].copy()

# 3. Prepare features
df_val = df_val[["PULocationID", "DOLocationID", "duration"]].dropna()
df_val[["PULocationID", "DOLocationID"]] = df_val[["PULocationID", "DOLocationID"]].astype(str)

# 4. Use previously fitted DictVectorizer (don't fit again!)
X_val = dv.transform(df_val[["PULocationID", "DOLocationID"]].to_dict(orient="records"))

# 5. Target
y_val = df_val["duration"].values

# 6. Predict and evaluate
y_pred = lr.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.2f}")


Validation RMSE: 7.81
