In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.4.2'

In [3]:
!pip install pyarrow



In [4]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [5]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [6]:
import sklearn
sklearn.__version__

'1.0.2'

In [7]:
# Convert datetime columns to datetime objects
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Calculate duration in minutes
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Calculate standard deviation of duration
duration_std = df['duration'].std()
print(f'Standard deviation of trip durations: {duration_std:.2f} minutes')

Standard deviation of trip durations: 42.59 minutes


In [8]:
# Get original record count
orig_count = len(df)
orig_count

3066766

In [9]:
# Drop outlier durations
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]

In [10]:
# Calculate fraction of remaining records
fraction_remaining = len(df) / orig_count
print(f'Fraction of records remaining after dropping outliers: {fraction_remaining:.0%}')

Fraction of records remaining after dropping outliers: 98%


In [11]:
# Convert IDs to strings
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)

In [12]:
# Create list of dictionaries
data = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [13]:
 from sklearn.feature_extraction import DictVectorizer

In [14]:
# Fit DictVectorizer
dv = DictVectorizer()
dv.fit(data)

DictVectorizer()

In [15]:
# Get feature matrix
X = dv.transform(data)

In [16]:
# Print dimensionality
print(f'Dimensionality of feature matrix: {X.shape[1]}')

Dimensionality of feature matrix: 515


In [17]:
# Get unique location IDs 
unique_locs = set(df['PULocationID'].unique()) | set(df['DOLocationID'].unique())

# Create one-hot encoding mapping
loc_mapping = {loc: i for i, loc in enumerate(unique_locs, start=1)}

In [18]:
# One-hot encode location IDs
df['PU_one_hot'] = df['PULocationID'].map(loc_mapping)
df['DO_one_hot'] = df['DOLocationID'].map(loc_mapping)

In [19]:
# Create feature matrix and target variable
X = df[['PU_one_hot', 'DO_one_hot']].values
y = df['duration'].values

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
# Train linear regression model
model = LinearRegression()
model.fit(X, y)

LinearRegression()

In [22]:
from sklearn.metrics import mean_squared_error

In [23]:
# Calculate RMSE on training data
y_pred = model.predict(X)
rmse_train = (mean_squared_error(y, y_pred, squared=False)) ** 0.5

In [24]:
print(f'RMSE on training data: {rmse_train:.2f}')

RMSE on training data: 3.13


In [25]:
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [26]:
df_val.head(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


In [27]:
df_val['PULocationID'] = df_val['PULocationID'].astype(str)
df_val['DOLocationID'] = df_val['DOLocationID'].astype(str)

df_val['tpep_pickup_datetime'] = pd.to_datetime(df_val['tpep_pickup_datetime'])
df_val['tpep_dropoff_datetime'] = pd.to_datetime(df_val['tpep_dropoff_datetime'])

# Calculate duration in minutes
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60

In [None]:
# Convert IDs to strings
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)
df_val['PULocationID'] = df_val['PULocationID'].astype(str)
df_val['DOLocationID'] = df_val['DOLocationID'].astype(str)

# Get unique location IDs from training data
unique_locs = set(df['PULocationID'].unique()) | set(df['DOLocationID'].unique())

# Create one-hot encoding mapping
loc_mapping = {loc: i for i, loc in enumerate(unique_locs, start=1)}

# One-hot encode location IDs
df['PU_one_hot'] = df['PULocationID'].map(loc_mapping)
df['DO_one_hot'] = df['DOLocationID'].map(loc_mapping)
df_val['PU_one_hot'] = df_val['PULocationID'].map(loc_mapping)
df_val['DO_one_hot'] = df_val['DOLocationID'].map(loc_mapping)

# Create feature matrices
X_train = df[['PU_one_hot', 'DO_one_hot']].values
X_val = df_val[['PU_one_hot', 'DO_one_hot']].values

# Target variables
y_train = df['duration'].values
y_val = df_val['duration'].values

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on validation
y_val_pred = model.predict(X_val)
rmse_val = (mean_squared_error(y_val, y_val_pred, squared=False)) ** 0.5

print(f'RMSE on validation data: {rmse_val:.2f}')