In [6]:
import io
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Step 1: Upload and Load
# files.upload() returns a dict mapping each uploaded file name to its raw bytes.
# Colab does not overwrite an existing file on disk: a repeated upload is saved
# under a suffixed name (e.g. "Taxi_Set (5).csv"), so reading the hard-coded path
# "Taxi_Set.csv" would silently load the copy from an earlier upload.
# Parse the bytes that were just uploaded instead.
uploaded = files.upload()
if uploaded:
    # Prefer the expected name; otherwise take the first (typically only) upload.
    uploaded_name = "Taxi_Set.csv" if "Taxi_Set.csv" in uploaded else next(iter(uploaded))
    data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy already on disk.
    data = pd.read_csv("Taxi_Set.csv")

# Step 2: Fix boolean column
# Cast the surge flag to integers so it can be used as a numeric model input.
data = data.assign(surge_applied=data['surge_applied'].astype(int))

# Step 3: Select features (includes 'tip')
# NOTE(review): if 'total_fare_new' already contains the tip, then 'tip' leaks
# part of the target into the inputs — confirm it is known at prediction time.
features = [
    'trip_duration_sec',
    'distance_traveled_Km',
    'num_of_passengers',
    'surge_applied',
    'KPH',
    'tip',
]
target_column = 'total_fare_new'
X = data[features]
y = data[target_column]

# Step 4: Remove outliers using IQR on fare
q1 = y.quantile(0.25)
q3 = y.quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
# Comparisons against NaN evaluate to False, so rows with a missing fare are
# dropped by this mask too; the target needs no imputation afterwards.
mask = (y >= lower) & (y <= upper)
# .copy() detaches the filtered rows from `data`, so later column work does not
# raise SettingWithCopyWarning or write to a temporary object.
X, y = X.loc[mask].copy(), y.loc[mask].copy()

# Step 5: Train-test split
# Split before computing any statistics so that nothing derived from the test
# rows can influence what the model sees during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Handle missing values
# Column means come from the training rows only and are applied to both splits.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)


# Step 7: Create interaction features
def _add_interaction_features(frame):
    """Return a copy of ``frame`` with the three product features appended.

    Expects the columns 'trip_duration_sec', 'distance_traveled_Km', 'KPH',
    'surge_applied', 'tip' and 'num_of_passengers' to be present.
    """
    return frame.assign(
        duration_x_distance=frame['trip_duration_sec'] * frame['distance_traveled_Km'],
        speed_x_surge=frame['KPH'] * frame['surge_applied'],
        tip_x_passenger=frame['tip'] * frame['num_of_passengers'],
    )


# Interactions are built after imputation so each product is consistent with
# the (possibly filled) base columns it is derived from.
X_train = _add_interaction_features(X_train)
X_test = _add_interaction_features(X_test)

# Keep X as the full engineered feature matrix, in the same row order as y.
X = pd.concat([X_train, X_test]).loc[y.index]

# Step 8: Train LightGBM model
# Hyperparameters are collected in one place; the fixed seed keeps runs repeatable.
lgbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.07,
    'max_depth': 6,
    'random_state': 42,
}
model = lgb.LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)

# Step 9: Evaluate
# Score the held-out split and report both metrics rounded to two decimals.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f"✅ Mean Squared Error: {mse:.2f}",
    f"✅ R² Score: {r2:.2f}",
    sep="\n",
)

# Step 10: Save improved model
# Serialize the fitted estimator to disk, then trigger a browser download of it.
model_path = 'boosted_taxi_fare_model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model, model_file)
files.download(model_path)


Saving Taxi_Set.csv to Taxi_Set (5).csv
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1541
[LightGBM] [Info] Number of data points in the train set: 159896, number of used features: 9
[LightGBM] [Info] Start training from score 39.906651
✅ Mean Squared Error: 24.32
✅ R² Score: 0.95


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>