In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [3]:
jan24 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
feb24 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df_1 = pd.read_parquet(jan24)
df_2 = pd.read_parquet(feb24)

# Q1. Downloading the data
18 columns

In [4]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [5]:
#Q1 18 columns

In [6]:
# Convert columns A and B to datetime objects
df_1['duration'] = -(pd.to_datetime(df_1['tpep_pickup_datetime']) - pd.to_datetime(df_1['tpep_dropoff_datetime'])).dt.total_seconds()

In [7]:
std_deviation = df_1['duration'].std()

# Convert standard deviation back to minutes
std_deviation_minutes = std_deviation / 60

print("Standard Deviation (in minutes):", std_deviation_minutes)

Standard Deviation (in minutes): 42.59435124195673


# Q2. Computing duration

In [8]:
#HW 3
df_1['duration_mins'] = df_1['duration']/60

In [9]:
filtered_df = df_1[(df_1['duration_mins'] >= 1) & (df_1['duration_mins'] <= 60)]

In [10]:
filtered_df['duration_mins'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration_mins, dtype: float64

In [11]:
(filtered_df.shape[0] / df_1.shape[0]) * 100

98.1220282212598

# Q3. Dropping outliers


In [12]:
categorical = ['PULocationID', 'DOLocationID']
filtered_df[categorical] = filtered_df[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[categorical] = filtered_df[categorical].astype(str)


In [14]:
train_dicts = filtered_df[categorical].to_dict(orient='records')

In [15]:
train_dicts

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'},
 {'PULocationID': '161', 'DOLocationID': '137'},
 {'PULocationID': '239', 'DOLocationID': '143'},
 {'PULocationID': '142', 'DOLocationID': '200'},
 {'PULocationID': '164', 'DOLocationID': '236'},
 {'PULocationID': '141', 'DOLocationID': '107'},
 {'PULocationID': '234', 'DOLocationID': '68'},
 {'PULocationID': '79', 'DOLocationID': '264'},
 {'PULocationID': '164', 'DOLocationID': '143'},
 {'PULocationID': '138', 'DOLocationID': '33'},
 {'PULocationID': '33', 'DOLocationID': '61'},
 {'PULocationID': '79', 'DOLocationID': '186'},
 {'PULocationID': '90', 'DOLocationID': '48'},
 {'PULocationID': '113', 'DOLocationID': '255'},
 {'PULocationID': '237', 'DOLocationID': '239'},
 {'PULocationID': '143', 'DOLocationID': '229'},
 {'PULocationID': '137', 'DOLocat

In [16]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Q4. One-hot encoding

In [17]:

X_train.shape[1]

515

In [18]:
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [19]:
target = 'duration_mins'
y_train = filtered_df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

7.649262443101424

# Q6 Evaluating the model

In [20]:
X_train.shape

(3009173, 515)

In [21]:
# Convert columns A and B to datetime objects
df_2['duration'] = -(pd.to_datetime(df_2['tpep_pickup_datetime']) - pd.to_datetime(df_2['tpep_dropoff_datetime'])).dt.total_seconds()
df_2['duration_mins'] = df_2['duration']/60


filtered_df2 = df_2[(df_2['duration_mins'] >= 1) & (df_2['duration_mins'] <= 60)]

filtered_df2[categorical] = filtered_df2[categorical].astype(str)

val_dicts = filtered_df2[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df2[categorical] = filtered_df2[categorical].astype(str)


In [22]:
X_val.shape

(2855951, 515)

In [23]:
y_val = filtered_df2[target].values
y_val_pred = lr.predict(X_val)
mean_squared_error(y_val, y_val_pred, squared=False)

7.811813318594438