In [2]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor


In [3]:
# Load the ride dataset from the project's data directory (path is relative to
# the notebook's working directory).
df = pd.read_csv(
    "data/ride_data_modified.csv",
)

In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182304 entries, 0 to 182303
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            182304 non-null  int64  
 1   pickup_location    182304 non-null  object 
 2   drop_location      182304 non-null  object 
 3   request_timestamp  182304 non-null  object 
 4   ride_hour          182304 non-null  int64  
 5   day_of_week        182304 non-null  object 
 6   month              182304 non-null  int64  
 7   traffic_level      182304 non-null  object 
 8   weather            182304 non-null  object 
 9   vehicle_type       182304 non-null  object 
 10  distance_km        182304 non-null  int64  
 11  duration_min       182304 non-null  float64
 12  surge_multiplier   182304 non-null  float64
 13  driver_rating      182304 non-null  float64
 14  fare               182304 non-null  float64
dtypes: float64(4), int64(4), object(7)
memory usage: 20.9+ MB

In [5]:
# List the available column names before choosing features and target.
df.columns

Index(['ride_id', 'pickup_location', 'drop_location', 'request_timestamp',
       'ride_hour', 'day_of_week', 'month', 'traffic_level', 'weather',
       'vehicle_type', 'distance_km', 'duration_min', 'surge_multiplier',
       'driver_rating', 'fare'],
      dtype='object')

In [6]:
# Feature matrix: every column used as a model input. `ride_id` and
# `request_timestamp` are left out, and `fare` is held back as the target.
X = df.loc[:, [
    # numeric inputs
    "distance_km",
    "duration_min",
    "surge_multiplier",
    "ride_hour",
    "month",
    "driver_rating",
    # string-valued inputs
    "pickup_location",
    "drop_location",
    "traffic_level",
    "weather",
    "vehicle_type",
    "day_of_week",
]]

# Regression target.
y = df.loc[:, "fare"]


In [7]:
# Confirm the selected feature columns, their dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182304 entries, 0 to 182303
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance_km       182304 non-null  int64  
 1   duration_min      182304 non-null  float64
 2   surge_multiplier  182304 non-null  float64
 3   ride_hour         182304 non-null  int64  
 4   month             182304 non-null  int64  
 5   driver_rating     182304 non-null  float64
 6   pickup_location   182304 non-null  object 
 7   drop_location     182304 non-null  object 
 8   traffic_level     182304 non-null  object 
 9   weather           182304 non-null  object 
 10  vehicle_type      182304 non-null  object 
 11  day_of_week       182304 non-null  object 
dtypes: float64(3), int64(3), object(6)
memory usage: 16.7+ MB


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Partition the feature names by dtype so each group can be routed to its own
# preprocessing step.
numeric_cols = list(X_train.select_dtypes(include=["number"]).columns)
categorical_cols = list(X_train.select_dtypes(include=["object"]).columns)

# Show the numeric group as the cell output.
numeric_cols

['distance_km',
 'duration_min',
 'surge_multiplier',
 'ride_hour',
 'month',
 'driver_rating']

In [11]:
# Count distinct values per object-dtype column in the training split, to see
# how many columns one-hot encoding will produce.
X_train[categorical_cols].nunique()

pickup_location    12
drop_location      12
traffic_level       3
weather             3
vehicle_type        5
day_of_week         7
dtype: int64

In [13]:
# Column-wise preprocessing shared by all models below:
#   - "num": standardise the numeric columns
#   - "cat": one-hot encode the object columns as a dense array; categories not
#     seen during fit are ignored at transform time instead of raising
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
    ]
)


In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Baseline model: linear regression on the preprocessed features.
#
# Pipeline stores the step objects it is given without copying them, so passing
# the shared `preprocessor` directly would let any other pipeline built from it
# re-fit this pipeline's transformer. clone() gives this pipeline its own
# unfitted copy with the same parameters.
linear_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression())
])

# Last expression: fitting returns the pipeline, which the notebook displays.
linear_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [20]:
def evaluate(model, X_test, y_test):
    """Print MAE, RMSE and R² for ``model`` on the given evaluation data.

    Parameters
    ----------
    model
        A fitted estimator (or pipeline) exposing ``predict``.
    X_test
        Features passed to ``model.predict``.
    y_test
        True target values the predictions are compared against.

    Returns
    -------
    None
        The three metrics are printed, one per line, and not returned.
    """
    y_hat = model.predict(X_test)

    # Compute every metric first, then print them in a fixed order.
    report = (
        ("MAE:", mean_absolute_error(y_test, y_hat)),
        # RMSE is the square root of the mean squared error.
        ("RMSE:", np.sqrt(mean_squared_error(y_test, y_hat))),
        ("R² Score:", r2_score(y_test, y_hat)),
    )

    for label, value in report:
        print(label, value)


In [None]:
# Report the linear baseline's metrics on the held-out test split.
print("Linear Regression:")
evaluate(linear_model, X_test, y_test)

Linear Regression:
MAE: 25.50031817032999
RMSE: 36.067422535094934
R² Score: 0.9219976967128306


In [24]:
# Decision tree regressor with depth capped at 15 and a fixed seed.
#
# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline does not copy the steps it is given, so using the shared
# `preprocessor` object directly would re-fit the transformer that the other
# pipelines also hold.
tree_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", DecisionTreeRegressor(max_depth=15, random_state=42))
])

tree_model.fit(X_train, y_train)
print("Decision Tree:")
evaluate(tree_model, X_test, y_test)


Decision Tree:
MAE: 5.908342228569007
RMSE: 7.284894816045036
R² Score: 0.9968178270288768


In [22]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Random forest of 100 trees with a fixed seed.
#
# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline does not copy the steps it is given, so using the shared
# `preprocessor` object directly would re-fit the transformer that the other
# pipelines also hold.
forest_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

forest_model.fit(X_train, y_train)
print("Random Forest:")
evaluate(forest_model, X_test, y_test)


Random Forest:
MAE: 5.438344235027381
RMSE: 6.50573214133112
R² Score: 0.9974621288547245


In [None]:
# Gradient boosting regressor with library-default hyperparameters and a fixed
# seed.
#
# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline does not copy the steps it is given, so using the shared
# `preprocessor` object directly would re-fit the transformer that the other
# pipelines also hold.
gbr_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", GradientBoostingRegressor(random_state=42))
])

gbr_model.fit(X_train, y_train)
print("Gradient Boosting:")
evaluate(gbr_model, X_test, y_test)


Gradient Boosting:
MAE: 6.822665146414338
RMSE: 8.591003471052316
R² Score: 0.9955744729406002
