In [0]:
# Read the `default.taxi_silver` table into a Spark DataFrame, report how many
# rows it holds, and render a five-row preview.
df = spark.read.table("default.taxi_silver")

row_total = df.count()
print("Total rows:", row_total)

display(df.limit(5))


Total rows: 10000


pickup_datetime,dropoff_datetime,passenger_count,fare_amount,trip_distance_km,trip_duration_min,surge_multiplier,hour,dayofweek,month,year,pickup_zone,dropoff_zone
2025-10-04 06:01:44,2025-10-04 06:27:03,1,248.96,10.732,25.32,1.0,6,7,10,2025,Rajajinagar,Koramangala
2025-10-09 11:00:38,2025-10-09 11:28:12,2,340.41,16.604,27.57,1.0,11,5,10,2025,Bannerghatta,Hebbal
2025-10-07 00:20:47,2025-10-07 00:33:47,2,200.3,8.718,13.01,1.0,0,3,10,2025,Yelahanka,Rajajinagar
2025-10-01 08:37:05,2025-10-01 08:44:57,2,95.69,2.259,7.87,1.0,8,4,10,2025,Whitefield,Whitefield
2025-10-01 03:37:41,2025-10-01 03:52:45,1,201.14,8.568,15.07,1.0,3,4,10,2025,MG_Road,Bannerghatta


In [0]:
# Columns fed to the model as predictors, and the column the model should predict.
feature_cols = [
    "trip_distance_km", "trip_duration_min",
    "hour", "dayofweek",
    "passenger_count",
]
label_col = "fare_amount"

# Narrow the source frame to exactly the predictors followed by the label.
df_model = df.select(*feature_cols, label_col)

display(df_model.limit(5))


trip_distance_km,trip_duration_min,hour,dayofweek,passenger_count,fare_amount
10.732,25.32,6,7,1,248.96
16.604,27.57,11,5,2,340.41
8.718,13.01,0,3,2,200.3
2.259,7.87,8,4,2,95.69
8.568,15.07,3,4,1,201.14


In [0]:
# Missing-value handling for the modelling frame.
#   1. A missing passenger_count is imputed with 1.
#   2. Rows still missing any other feature or the label are dropped: the
#      VectorAssembler used downstream raises on null/NaN inputs by default
#      (handleInvalid="error"), and a row without a label cannot be trained on.
# Both steps are idempotent, so re-running this cell is safe.
df_model = (
    df_model
    .fillna({"passenger_count": 1})
    .dropna(how="any", subset=feature_cols + [label_col])
)

# The count can now actually differ from the raw total when rows were dropped.
print("Rows after NA handling:", df_model.count())



Rows after NA handling: 10000


In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack the individual predictor columns into a single vector column named
# "features", then keep only that vector and the label.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

assembled = assembler.transform(df_model)
df_features = assembled.select("features", label_col)

display(df_features.limit(5))


features,fare_amount
"Map(vectorType -> dense, length -> 5, values -> List(10.732, 25.32, 6.0, 7.0, 1.0))",248.96
"Map(vectorType -> dense, length -> 5, values -> List(16.604, 27.57, 11.0, 5.0, 2.0))",340.41
"Map(vectorType -> dense, length -> 5, values -> List(8.718, 13.01, 0.0, 3.0, 2.0))",200.3
"Map(vectorType -> dense, length -> 5, values -> List(2.259, 7.87, 8.0, 4.0, 2.0))",95.69
"Map(vectorType -> dense, length -> 5, values -> List(8.568, 15.07, 3.0, 4.0, 1.0))",201.14


In [0]:
# Randomly split the assembled frame with 0.8 / 0.2 weights; the fixed seed
# keeps the split repeatable across runs.
split_weights = [0.8, 0.2]
train_df, test_df = df_features.randomSplit(split_weights, seed=42)

# Report the size of each partition.
for caption, subset in (("Train rows:", train_df), ("Test rows :", test_df)):
    print(caption, subset.count())


Train rows: 8079
Test rows : 1921


In [0]:
from pyspark.ml.regression import LinearRegression

# Configure a LinearRegression estimator with default hyperparameters and fit
# it on the training split.
# The label column comes from `label_col` (the same variable used to build the
# modelling frame) instead of a repeated string literal, so the estimator
# cannot drift out of sync if the label column is ever changed.
lr = LinearRegression(
    featuresCol="features",
    labelCol=label_col,
)

# `lr_model` is the fitted model used by the scoring and persistence cells.
lr_model = lr.fit(train_df)


In [0]:
# Score the held-out split; transform() appends a "prediction" column.
predictions = lr_model.transform(test_df)

# Show actual vs. predicted fare side by side for the first ten rows.
actual_vs_predicted = predictions.select("fare_amount", "prediction")
display(actual_vs_predicted.limit(10))


fare_amount,prediction
72.97,58.1027556672542
58.08,60.74377692930943
60.32,62.88788067525049
57.61,59.275247565293135
62.38,64.62679244008339
69.13,60.72934899538718
57.77,60.25827979090727
59.3,61.77254575881866
78.06,62.9385604157286
58.72,61.6560854938465


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Both evaluators compare the same label / prediction columns and differ only
# in the metric they compute, so the shared arguments are declared once.
_shared_eval_args = dict(labelCol="fare_amount", predictionCol="prediction")

rmse_eval = RegressionEvaluator(metricName="rmse", **_shared_eval_args)
r2_eval = RegressionEvaluator(metricName="r2", **_shared_eval_args)

# Evaluate on the test-split predictions (RMSE first, then R²).
rmse, r2 = (ev.evaluate(predictions) for ev in (rmse_eval, r2_eval))

print(f"RMSE: {rmse}")
print(f"R²  : {r2}")


RMSE: 32.17587039073253
R²  : 0.9421661963815748


In [0]:
# Destination for the persisted model; the reload cell reads it back from here.
MODEL_PATH = "dbfs:/models/taxi_fare_lr_v1"

# overwrite() replaces any model already stored at this path instead of failing.
model_writer = lr_model.write().overwrite()
model_writer.save(MODEL_PATH)

print("✅ Model saved at:", MODEL_PATH)


✅ Model saved at: dbfs:/models/taxi_fare_lr_v1


In [0]:
from pyspark.ml.regression import LinearRegressionModel

# Load the persisted model back from MODEL_PATH and score the same test split,
# printing the first five actual/predicted pairs.
loaded_model = LinearRegressionModel.load(MODEL_PATH)

reloaded_predictions = loaded_model.transform(test_df)
reloaded_predictions.select("fare_amount", "prediction").show(5)


+-----------+------------------+
|fare_amount|        prediction|
+-----------+------------------+
|      72.97|  58.1027556672542|
|      58.08| 60.74377692930943|
|      60.32| 62.88788067525049|
|      57.61|59.275247565293135|
|      62.38| 64.62679244008339|
+-----------+------------------+
only showing top 5 rows

