In [2]:
import pandas as pd 
df = pd.read_csv("taxi_trip_pricing.csv")

In [3]:
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [4]:
df.isnull().sum()

Trip_Distance_km         50
Time_of_Day              50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Per_Km_Rate              50
Per_Minute_Rate          50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

In [5]:
df.tail(60)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
940,14.43,Morning,Weekday,4.0,High,Clear,2.81,1.46,0.21,24.69,29.0627
941,28.15,Afternoon,Weekday,2.0,High,Clear,2.06,0.61,0.18,17.02,22.2951
942,32.92,Evening,Weekday,4.0,Low,Clear,4.17,0.7,0.36,32.31,38.8456
943,,Evening,Weekend,1.0,Low,Snow,2.1,1.47,0.27,48.69,76.4865
944,11.11,Evening,Weekend,3.0,Medium,Clear,3.79,1.01,0.17,59.73,25.1652
945,1.54,Afternoon,Weekday,3.0,High,Clear,3.62,1.56,0.38,10.13,9.8718
946,7.71,Afternoon,Weekday,3.0,Low,Clear,4.91,,0.33,116.62,53.726
947,45.1,Morning,Weekday,3.0,Medium,Clear,,0.88,0.44,87.35,81.572
948,43.82,Afternoon,Weekday,4.0,Low,Clear,2.19,1.48,0.15,42.17,73.3691
949,30.27,Afternoon,Weekend,4.0,Medium,Clear,3.66,0.67,0.41,41.65,41.0174


In [6]:
cat_cols = df.select_dtypes(include=['object']).columns

In [7]:
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])
# We fill missing categorical values using mode to handle NaN values safely.

In [8]:
# mode() can return multiple values if a column has more than one frequent category.
# iloc[0] selects the first mode to fill NaN values.

In [9]:
num_cols = df.select_dtypes(include=['float64','int64']).columns

In [10]:
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
# We fill missing numeric values using median to handle NaN values safely.

In [11]:
df.isnull().sum()

Trip_Distance_km         0
Time_of_Day              0
Day_of_Week              0
Passenger_Count          0
Traffic_Conditions       0
Weather                  0
Base_Fare                0
Per_Km_Rate              0
Per_Minute_Rate          0
Trip_Duration_Minutes    0
Trip_Price               0
dtype: int64

In [12]:
df = pd.get_dummies(df,columns = ["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"],drop_first=True)
# Convert categorical columns into dummy variables (one-hot encoding) and drop the first category to avoid multicollinearity.

In [13]:
#target
y = df["Trip_Price"]

In [16]:
#features
X = df.drop(columns=["Trip_Price"])

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [19]:
#StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [21]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 1.0)
lasso.fit(X_train_scaled,y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [26]:
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

In [30]:
#Measure performance
y_pred = lasso.predict(X_test_scaled)
r2 = r2_score(y_test,y_pred)
rmse = np.sqrt(mean_squared_error(y_test,y_pred))

In [32]:
print("R2 SCORE : ",r2)
print("RMSE : ",rmse)

R2 SCORE :  0.7520380360431425
RMSE :  18.237786717713178


In [35]:
# Create a DataFrame to inspect Lasso coefficients for each feature (feature importance and selection).
coeff_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": lasso.coef_
})

In [36]:
coeff_df

Unnamed: 0,Feature,Coefficient
0,Trip_Distance_km,33.701047
1,Passenger_Count,0.0
2,Base_Fare,0.0
3,Per_Km_Rate,9.044208
4,Per_Minute_Rate,4.830681
5,Trip_Duration_Minutes,7.953705
6,Time_of_Day_Evening,-0.0
7,Time_of_Day_Morning,0.0
8,Time_of_Day_Night,-0.0
9,Day_of_Week_Weekend,-0.0


In [37]:
#Model tuning to find the best alpha value

In [74]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

alphas = [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000, 10000]

for a in alphas:
    model = Lasso(alpha=a, max_iter=10000)
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="r2").mean()
    print(a, "-" ,score)

0.001 - 0.8342110538364385
0.01 - 0.8343287477464661
0.1 - 0.835328115181543
1 - 0.8362870296803564
10 - 0.6370887439854332
50 - -0.003562095751005412
100 - -0.003562095751005412
1000 - -0.003562095751005412
10000 - -0.003562095751005412


In [75]:
# The best alpha value is 1 because it achieved the highest mean cross-validation R² score

In [76]:
alpha_best = Lasso(alpha=1)
alpha_best.fit(X_train_scaled,y_train)

0,1,2
,alpha,1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [77]:
y_pred_best = alpha_best.predict(X_test_scaled)

In [78]:
print("final R2 : ",r2_score(y_test,y_pred_best))
print("final MSE : ",np.sqrt(mean_squared_error(y_test,y_pred_best)))

final R2 :  0.7520380360431425
final MSE :  18.237786717713178
