In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


taxiDataDf = pd.read_parquet('./yellow_tripdata_2022-01.parquet');

In [6]:
# print first 5 rows of data
print(taxiDataDf.head(5))

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
1         1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
2         2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
3         2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
4         2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           3.80         1.0                  N           142           236   
1           2.10         1.0                  N           236            42   
2           0.97         1.0                  N           166           166   
3           1.09         1.0                  N           114            68   
4           4.30         1.0                  N            68           163   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [7]:
# drop any null value rows

filteredTaxiDataDf = taxiDataDf.dropna(how='any')
print(filteredTaxiDataDf.shape)

(2392428, 19)


In [8]:
#create trip duration as a new feature which holds duration of trip in minutes
duration = filteredTaxiDataDf['tpep_dropoff_datetime'] - filteredTaxiDataDf['tpep_pickup_datetime']

# convert timedelta to minutes, divide each item by 60 after converting each to their respective total seconds
filteredTaxiDataDf.loc[:,'duration'] = duration.dt.total_seconds().div(60)

print (filteredTaxiDataDf.head(5))

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
1         1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
2         2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
3         2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
4         2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           3.80         1.0                  N           142           236   
1           2.10         1.0                  N           236            42   
2           0.97         1.0                  N           166           166   
3           1.09         1.0                  N           114            68   
4           4.30         1.0                  N            68           163   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredTaxiDataDf.loc[:,'duration'] = duration.dt.total_seconds().div(60)


In [9]:
filteredTaxiDataDf.loc[:,'target_variable'] = filteredTaxiDataDf['total_amount'].values
filteredTaxiDataDf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredTaxiDataDf.loc[:,'target_variable'] = filteredTaxiDataDf['total_amount'].values


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,target_variable
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,...,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,17.816667,21.95
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,...,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,8.4,13.3
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,...,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,8.966667,10.56
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,...,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,10.033333,11.8
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,...,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,37.533333,30.3


In [10]:
#feature_col to hold all the column names

filteredTaxiDataDf['tpep_pickup_datetime'] = filteredTaxiDataDf['tpep_pickup_datetime'].astype(int)
filteredTaxiDataDf['tpep_dropoff_datetime'] = filteredTaxiDataDf['tpep_dropoff_datetime'].astype(int)

# print(filteredTaxiDataDf['store_and_fwd_flag'].value_counts())
    

feature_col = filteredTaxiDataDf.drop('total_amount',axis=1).drop('target_variable',axis=1).drop('store_and_fwd_flag',axis=1).columns.to_list()
print(feature_col)


# some of the modifications are done, in order to train the model
# for example, thre timestamp columns are converted to hold int values, droping store_and_fwd_flag column since 
# linear regression model fit requires only number values
# here i choose to drop the value N/Y which could have been replaced as 0 or 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredTaxiDataDf['tpep_pickup_datetime'] = filteredTaxiDataDf['tpep_pickup_datetime'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredTaxiDataDf['tpep_dropoff_datetime'] = filteredTaxiDataDf['tpep_dropoff_datetime'].astype(int)


['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'duration']


In [11]:
#Seperate feature and target variables

# X represents feature variable

X = filteredTaxiDataDf.loc[:,feature_col] 
X.head()


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,airport_fee,duration
0,1,1640997340000000,1640998409000000,2.0,3.8,1.0,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,2.5,0.0,17.816667
1,1,1640997223000000,1640997727000000,1.0,2.1,1.0,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,0.0,0.0,8.4
2,2,1640998401000000,1640998939000000,1.0,0.97,1.0,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,0.0,0.0,8.966667
3,2,1640996721000000,1640997323000000,1.0,1.09,1.0,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,2.5,0.0,10.033333
4,2,1640997408000000,1640999660000000,1.0,4.3,1.0,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,2.5,0.0,37.533333


In [12]:
#y represents test variable
# from above cell, remember target_variable is created from total_amount column

y = filteredTaxiDataDf.loc[:,'target_variable'] 
y.head()

0    21.95
1    13.30
2    10.56
3    11.80
4    30.30
Name: target_variable, dtype: float64

In [13]:
from sklearn.model_selection import train_test_split


# convert the X dataframe into a numpy array because sci-kit learn takes data as a form of numpy array

X = X.values
# print(X)

# similarly convert test variable y as numpy array

y = y.values
# print(y)

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=43)



In [14]:

print(X_train.shape,'\n', X_test.shape, '\n ', y_train.shape, '\n', y_test.shape)

(1913942, 18) 
 (478486, 18) 
  (1913942,) 
 (478486,)


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

model = LinearRegression()

model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
meanAbsoluteError = mean_absolute_error(y_test,y_pred)

print("mean absolute error of the total amount is : ", meanAbsoluteError)

mean absolute error of the total amount is :  0.13910022870586847


In [18]:

print(feature_col)
# numeric_features = feature_col
# numeric_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'duration']


In [19]:

# Use Scikit-Learn's ColumnTransformer to preprocess the categorical and
# continuous features independently

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

feature_col_copy = feature_col.copy()

if 'store_and_fwd_flag' not in feature_col_copy:
    feature_col_copy.append('store_and_fwd_flag')
   
numerical_features = list(filter(lambda x : x!='store_and_fwd_flag',feature_col_copy)) 

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_features = ['store_and_fwd_flag']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

print(numerical_features, categorical_features)

X = filteredTaxiDataDf.loc[:,feature_col_copy]

y = filteredTaxiDataDf['target_variable']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'duration'] ['store_and_fwd_flag']


In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

preprocessor.fit(X_train)


In [21]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# y_train_transformed = preprocessor.transform(y_train)
# y_test_transformed = preprocessor.transfrom(y_test)

print("Transformed training data:\n", X_train_transformed)
print("Transformed testing data:\n", X_test_transformed)



Transformed training data:
 [[ 0.65939889  0.74624192  0.74660054 ...  0.15478606  1.
   0.        ]
 [ 0.65939889  0.10238959  0.10264146 ...  0.11784294  1.
   0.        ]
 [ 0.65939889  0.63612388  0.63620998 ...  0.02726124  1.
   0.        ]
 ...
 [-1.51653274  0.94570265  0.94550326 ... -0.11553813  1.
   0.        ]
 [ 0.65939889  0.81996939  0.81975907 ... -0.11802469  1.
   0.        ]
 [ 0.65939889  0.78195739  0.78227069 ...  0.13240705  1.
   0.        ]]
Transformed testing data:
 [[-1.51653274  0.11024788  0.11006193 ... -0.09102779  0.
   1.        ]
 [-1.51653274 -0.58903716 -0.58934505 ... -0.1340097   1.
   0.        ]
 [-1.51653274  0.01929716  0.01908324 ... -0.10239491  0.
   1.        ]
 ...
 [ 0.65939889  0.21171852  0.21162661 ... -0.04840111  1.
   0.        ]
 [-1.51653274 -0.02268273 -0.02283373 ... -0.07149057  1.
   0.        ]
 [ 0.65939889  0.17012834  0.16978118 ... -0.16917671  1.
   0.        ]]


In [22]:

# Create a pipeline object

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

print(feature_col)

pipe = Pipeline([('scaler', StandardScaler()),
                 ('pca', PCA(n_components = .90, random_state=0)),
                 ('logistic', LinearRegression())])



['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'duration']


In [23]:
# fit the pipeline on the training data
pipe.fit(X_train_transformed, y_train)

In [24]:
# # Get Model Performance
print(pipe.score(X_test_transformed, y_test))

0.955211262486142


In [25]:
filteredTaxiDataDf.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,target_variable
0,1,1640997340000000,1640998409000000,2.0,3.8,1.0,N,142,236,1,...,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,17.816667,21.95
1,1,1640997223000000,1640997727000000,1.0,2.1,1.0,N,236,42,1,...,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,8.4,13.3
2,2,1640998401000000,1640998939000000,1.0,0.97,1.0,N,166,166,1,...,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,8.966667,10.56
3,2,1640996721000000,1640997323000000,1.0,1.09,1.0,N,114,68,2,...,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,10.033333,11.8
4,2,1640997408000000,1640999660000000,1.0,4.3,1.0,N,68,163,1,...,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,37.533333,30.3


In [26]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Regression Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

# Classifer Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

# To visualize individual decision trees
from sklearn import tree
from sklearn.tree import export_text

In [27]:
print(filteredTaxiDataDf.columns)

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration',
       'target_variable'],
      dtype='object')


In [76]:
x_cols = filteredTaxiDataDf.drop('store_and_fwd_flag',axis=1).drop('total_amount',axis=1).drop('target_variable',axis=1).columns.to_list();

X = filteredTaxiDataDf.loc[:,x_cols];
y = filteredTaxiDataDf.loc[:,'target_variable'].values

In [78]:
# train and test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [84]:
# Random forest regression
reg = RandomForestRegressor(n_estimators=100, random_state = 0,n_jobs=5)

In [86]:
# model fitting

reg.fit(X_train, y_train)

In [88]:
# predict values
reg.predict(X_test.iloc[0].values.reshape(1, -1))



array([21.96])

In [90]:
score = reg.score(X_test, y_test)
print(score)

0.004000966608721956


In [96]:
from sklearn.model_selection import GridSearchCV

# Random forest regression
reg = RandomForestRegressor(random_state = 42)

# define hyper parameters
parameters = {'n_estimators':[100],'max_depth':[10], 'min_samples_split':[2]}; # move it to a config dictionary 

# Step 4: Set up GridSearchCV
grid_search = GridSearchCV(estimator=reg, param_grid=parameters, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')