In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder


In [16]:
df=pd.read_csv('C:\\Users\\LAP TECHNOLOGY\\Desktop\\project\\archive\\merged_rides_weather(100000).csv')

In [17]:
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,name,hour,date,temp,clouds,pressure,rain,humidity,wind
0,1.93,Lyft,2018-12-01 11:17:55.791,Back Bay,Haymarket Square,22.5,1.0,Lux Black,11,2018-12-01,32.58,0.4,1022.14,0.019036,0.79,4.34
1,1.05,Lyft,2018-11-29 06:53:05.635,North End,North Station,11.0,1.0,Lyft XL,6,2018-11-29,37.738571,0.774286,1000.315714,0.017956,0.7,10.212857
2,1.64,Uber,2018-12-16 22:30:08.069,Back Bay,Northeastern University,8.5,1.0,UberX,22,2018-12-16,41.86,0.9,1012.43,0.1011,0.78,11.04
3,2.53,Lyft,2018-12-02 06:08:03.130,Northeastern University,Beacon Hill,22.5,1.0,Lux Black,6,2018-12-02,38.38,1.0,1020.7,0.017956,0.83,2.75
4,1.71,Lyft,2018-12-16 00:40:15.848,South Station,West End,5.0,1.0,Shared,0,2018-12-16,42.69,0.17,1023.05,0.129213,0.71,6.39


In [18]:
df=df.drop(['time_stamp'], axis=1)

In [19]:
df.surge_multiplier.value_counts()

surge_multiplier
1.00    79116
1.25    11042
1.50     5040
1.75     2408
2.00     2228
2.50      154
3.00       12
Name: count, dtype: int64

In [20]:
df.head()

Unnamed: 0,distance,cab_type,destination,source,price,surge_multiplier,name,hour,date,temp,clouds,pressure,rain,humidity,wind
0,1.93,Lyft,Back Bay,Haymarket Square,22.5,1.0,Lux Black,11,2018-12-01,32.58,0.4,1022.14,0.019036,0.79,4.34
1,1.05,Lyft,North End,North Station,11.0,1.0,Lyft XL,6,2018-11-29,37.738571,0.774286,1000.315714,0.017956,0.7,10.212857
2,1.64,Uber,Back Bay,Northeastern University,8.5,1.0,UberX,22,2018-12-16,41.86,0.9,1012.43,0.1011,0.78,11.04
3,2.53,Lyft,Northeastern University,Beacon Hill,22.5,1.0,Lux Black,6,2018-12-02,38.38,1.0,1020.7,0.017956,0.83,2.75
4,1.71,Lyft,South Station,West End,5.0,1.0,Shared,0,2018-12-16,42.69,0.17,1023.05,0.129213,0.71,6.39


In [21]:
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df.drop(columns=['date'], inplace=True)

In [22]:
target = 'price'
X = df.drop(columns=[target])
y = df[target]

In [30]:

numeric_features = ['distance','surge_multiplier','hour','temp','clouds','pressure','rain','humidity','wind','month','day']
cat_features = ['cab_type','destination','source','name']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [24]:
X_train.head()

Unnamed: 0,distance,cab_type,destination,source,surge_multiplier,name,hour,temp,clouds,pressure,rain,humidity,wind,month,day
75220,1.7,Uber,West End,South Station,1.0,Black SUV,8,39.26,1.0,1019.53,0.032,0.86,4.11,12,2
48955,1.56,Uber,Back Bay,Boston University,1.0,UberPool,12,45.3,0.95,1022.19,0.018846,0.92,4.02,12,15
44966,2.04,Lyft,South Station,North Station,1.0,Lux Black XL,9,38.54,1.0,1019.06,0.0051,0.91,3.03,12,2
13568,0.61,Uber,West End,Haymarket Square,1.0,UberPool,12,38.97,0.29,1005.71,0.018846,0.63,10.42,11,29
92727,0.98,Lyft,Haymarket Square,Financial District,1.5,Lyft,15,37.28,0.17,1017.58,0.002973,0.71,2.95,11,30


# Linear Regression:

In [25]:
from sklearn.linear_model import LinearRegression
# Linear Regression
pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('lr', LinearRegression())
])

pipe_lr.fit(X_train, y_train)
pred_lr = pipe_lr.predict(X_test)

print("Linear Regression:")
print("MAE:", mean_absolute_error(y_test, pred_lr))
print("RMSE:", mean_squared_error(y_test, pred_lr))

Linear Regression:
MAE: 2.212397744817561
RMSE: 10.071683624978146


# SVM-->SVR

In [11]:
# from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV

# pipe_svr = Pipeline([
#     ('pre', preprocessor),
#     ('scaler', StandardScaler(with_mean=False)),
#     ('svr', SVR())
# ])

# param_svr = {
#     'svr__kernel': ['rbf', 'poly'],
#     'svr__C': [1, 10, 50],
#     'svr__gamma': ['scale', 0.1, 0.01],
#     'svr__epsilon': [0.1, 0.5, 1.0]
# }

# svr_search = GridSearchCV(pipe_svr, param_svr, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)
# svr_search.fit(X_train, y_train)

# pred_svr = svr_search.predict(X_test)

# print("SVR Best Params:", svr_search.best_params_)
# print("MAE:", mean_absolute_error(y_test, pred_svr))
# print("RMSE:", mean_squared_error(y_test, pred_svr))

# RandomForestRegressor

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

pipe_rf = Pipeline([
    ('pre', preprocessor),
    ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
])

param_rf = {
    'rf__n_estimators': [200, 400, 600],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': [1.0, 'sqrt']
}

rf_search = RandomizedSearchCV(pipe_rf, param_rf, n_iter=20, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

pred_rf = rf_search.predict(X_test)

print("Random Forest Best Params:", rf_search.best_params_)
print("MAE:", mean_absolute_error(y_test, pred_rf))
print("RMSE:", mean_squared_error(y_test, pred_rf))

KeyboardInterrupt: 

# GradientBoostingRegressor

In [12]:
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import GridSearchCV

# pipe_gb = Pipeline([
#     ('pre', preprocessor),
#     ('gb', GradientBoostingRegressor(random_state=42))
# ])

# param_gb = {
#     'gb__n_estimators': [200, 400],
#     'gb__learning_rate': [0.01, 0.05, 0.1],
#     'gb__max_depth': [3, 5],
#     'gb__subsample': [0.7, 1.0]
# }

# gb_search = GridSearchCV(pipe_gb, param_gb, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
# gb_search.fit(X_train, y_train)

# pred_gb = gb_search.predict(X_test)

# print("Gradient Boosting Best Params:", gb_search.best_params_)
# print("MAE:", mean_absolute_error(y_test, pred_gb))
# print("RMSE:", mean_squared_error(y_test, pred_gb))




# # Gradient Boosting Best Params: {'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__n_estimators': 400, 'gb__subsample': 1.0}
# # MAE: 1.1575776579885517
# # RMSE: 2.9130019932671707

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Pipeline + best hyperparameters
pipe_gb = Pipeline([
    ('pre', preprocessor),
    ('gb', GradientBoostingRegressor(
            n_estimators=400,
            learning_rate=0.1,
            max_depth=5,
            subsample=1.0,
            random_state=42
        ))
])

# Train
pipe_gb.fit(X_train, y_train)

# Predict
pred_gb = pipe_gb.predict(X_test)

# Evaluate
print("Gradient Boosting (Best Hyperparameters):")
print("MAE:", mean_absolute_error(y_test, pred_gb))
print("RMSE:", mean_squared_error(y_test, pred_gb))


Gradient Boosting (Best Hyperparameters):
MAE: 1.1575776579885517
RMSE: 2.9130019932671707


# xgboost

In [14]:
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV

# pipe_xgb = Pipeline([
#     ('pre', preprocessor),
#     ('xgb', xgb.XGBRegressor(
#         objective='reg:squarederror',
#         eval_metric='rmse',
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# param_xgb = {
#     'xgb__n_estimators': [300, 500],
#     'xgb__learning_rate': [0.05, 0.1],
#     'xgb__max_depth': [5, 7],
#     'xgb__subsample': [0.7, 1.0],
#     'xgb__colsample_bytree': [0.7, 1.0]
# }

# xgb_search = RandomizedSearchCV(pipe_xgb, param_xgb, n_iter=12, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
# xgb_search.fit(X_train, y_train)

# pred_xgb = xgb_search.predict(X_test)

# print("XGBoost Best Params:", xgb_search.best_params_)
# print("MAE:", mean_absolute_error(y_test, pred_xgb))
# print("RMSE:", mean_squared_error(y_test, pred_xgb))

# # XGBoost Best Params: {'xgb__subsample': 1.0, 'xgb__n_estimators': 300, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.1, 'xgb__colsample_bytree': 1.0}
# # MAE: 1.1416350977778436
# # RMSE: 2.8354012847644583

In [26]:
import xgboost as xgb

pipe_xgb = Pipeline([
    ('pre', preprocessor),
    ('xgb', xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        random_state=42,
        n_jobs=-1,
        n_estimators=300,
        learning_rate=0.1,
        max_depth=7,
        subsample=1.0,
        colsample_bytree=1.0
    ))
])

# Train final model with BEST params
pipe_xgb.fit(X_train, y_train)

# Predict on test set
pred_xgb = pipe_xgb.predict(X_test)

# Evaluate
print("XGBoost (Best Hyperparameters):")
print("MAE:", mean_absolute_error(y_test, pred_xgb))
print("RMSE:", mean_squared_error(y_test, pred_xgb))


XGBoost (Best Hyperparameters):
MAE: 1.1416350977778436
RMSE: 2.8354012847644583


In [28]:
pred_xgb

array([26.693388, 14.320567,  8.761595, ..., 11.562213, 11.674622,
       31.016111], dtype=float32)

# DNN

In [16]:
import tensorflow as tf
from tensorflow import keras

X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_proc)
X_test_scaled = scaler.transform(X_test_proc)

model = keras.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    
    keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.fit(X_train_scaled, y_train, epochs=50, batch_size=256, validation_split=0.1, verbose=2)

pred_nn = model.predict(X_test_scaled).reshape(-1)

print("Neural Network MAE:", mean_absolute_error(y_test, pred_nn))
print("NN RMSE:", mean_squared_error(y_test, pred_nn))

Epoch 1/50
282/282 - 2s - 8ms/step - loss: 68.7052 - mae: 5.6720 - val_loss: 12.8925 - val_mae: 2.1973
Epoch 2/50
282/282 - 1s - 2ms/step - loss: 8.2496 - mae: 1.8841 - val_loss: 7.0023 - val_mae: 1.7329
Epoch 3/50
282/282 - 1s - 2ms/step - loss: 5.6835 - mae: 1.6135 - val_loss: 5.7956 - val_mae: 1.6263
Epoch 4/50
282/282 - 1s - 2ms/step - loss: 5.0313 - mae: 1.5397 - val_loss: 5.6932 - val_mae: 1.6497
Epoch 5/50
282/282 - 1s - 3ms/step - loss: 4.6459 - mae: 1.4879 - val_loss: 4.5164 - val_mae: 1.4668
Epoch 6/50
282/282 - 1s - 2ms/step - loss: 4.4174 - mae: 1.4559 - val_loss: 4.3438 - val_mae: 1.4423
Epoch 7/50
282/282 - 1s - 2ms/step - loss: 4.3185 - mae: 1.4425 - val_loss: 4.2948 - val_mae: 1.4178
Epoch 8/50
282/282 - 1s - 3ms/step - loss: 4.1250 - mae: 1.4091 - val_loss: 4.1431 - val_mae: 1.4444
Epoch 9/50
282/282 - 1s - 2ms/step - loss: 4.0754 - mae: 1.4038 - val_loss: 4.0387 - val_mae: 1.4242
Epoch 10/50
282/282 - 1s - 2ms/step - loss: 3.9569 - mae: 1.3779 - val_loss: 4.0789 - val

In [17]:
model.summary()

In [31]:
import joblib
joblib.dump(pipe_xgb, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [18]:
import joblib
joblib.dump(pipe_lr, "linear_regression_model.pkl")
joblib.dump(pipe_gb, "gradient_boosting_model.pkl")
joblib.dump(pipe_xgb, "xgboost_model.pkl")
# Save the trained Keras model
model.save("neural_network_model.h5")
print("Models saved")



Models saved


In [None]:
Neural Network
MAE:      1.2289108801007271
NN RMSE:  3.2464775968715975
Linear Regression:
MAE:      2.212397744817561
RMSE:     10.071683624978146
XGBoost (Best Hyperparameters):
MAE:      1.1416350977778436
RMSE:     2.8354012847644583
Gradient Boosting (Best Hyperparameters):
MAE:      1.1575776579885517
RMSE:     2.9130019932671707

In [None]:
Model            ,MAE                ,RMSE               ,R2
AE + XGBoost     ,8.786805152893066  ,11.045310248200542 ,0.05011940002441406
LSTM + Attention ,8.802721977233887  ,11.026923090187672 ,0.05327928066253662
Naive_last       ,11.729914665222168 ,15.22105687886901  ,-0.8038598299026489
LinearRegression ,8.811145979666144  ,11.082225978728054 ,0.04375948039537825
XGBoost_raw      ,8.7537202835083    ,11.068408487351336 ,0.046142399311065674