In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [25]:

# ---------------------------
# 1. Load Dataset
# ---------------------------
# Location of the Uber fares CSV -- change this to your dataset link/path.
DATA_PATH = "C:/Users/Lenovo/Downloads/uber.csv"

# Read the raw trips and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [11]:
# ---------------------------
# 2. Pre-processing
# ---------------------------

# Parse the pickup timestamps once and store them back as real datetimes.
pickup_ts = pd.to_datetime(df["pickup_datetime"])
df["pickup_datetime"] = pickup_ts

# One feature column per datetime component, added in the order listed.
# NOTE(review): the raw strings end in " UTC", so `hour` is presumably the UTC
# hour rather than the local pickup hour -- confirm this is intended.
for part in ("hour", "day", "month", "year"):
    df[part] = getattr(pickup_ts.dt, part)

# Haversine distance function
def haversine(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Great-circle distance between two points using the haversine formula.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted (anything ``np.radians`` and the NumPy ufuncs can handle).

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, default 6371.0
        Sphere radius used to scale the central angle. The default is the
        value the original implementation hard-coded, so existing
        four-argument calls return the same distances.

    Returns
    -------
    float or array-like
        Distance in the same unit as ``radius_km``. NaN inputs yield NaN.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    half_dlon = (lon2 - lon1) / 2
    half_dlat = (lat2 - lat1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can leave it a
    # hair outside that range, which would turn sqrt/arcsin into NaN.
    # Clamp with ufuncs so Series/array inputs keep their type and NaNs
    # still propagate.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    return radius_km * (2 * np.arcsin(np.sqrt(a)))

# Trip length from pickup to drop-off, computed with haversine().
pickup = (df["pickup_longitude"], df["pickup_latitude"])
dropoff = (df["dropoff_longitude"], df["dropoff_latitude"])
df["distance_km"] = haversine(*pickup, *dropoff)

# Basic cleaning: build one combined mask, then filter once.
fare_ok = (df["fare_amount"] > 0) & (df["fare_amount"] < 100)
riders_ok = (df["passenger_count"] > 0) & (df["passenger_count"] < 7)
trip_ok = df["distance_km"] < 50    # remove unrealistic trips
df = df[fare_ok & riders_ok & trip_ok]


In [13]:

# ---------------------------
# 3. Identify Outliers (IQR)
# ---------------------------
def remove_outliers(col, data=None, k=1.5):
    """Keep only the rows whose ``col`` value lies within the IQR fences.

    Parameters
    ----------
    col : str
        Name of the numeric column to screen.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``df`` is used, so
        existing one-argument calls behave as before.
    k : float, default 1.5
        Fence multiplier: rows are kept when the value is inside
        ``[Q1 - k * IQR, Q3 + k * IQR]``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index preserved). The input frame is not
        modified. Rows with NaN in ``col`` are dropped because the
        comparisons evaluate to False for them.
    """
    frame = df if data is None else data
    values = frame[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower = q1 - k * spread
    upper = q3 + k * spread
    # Inclusive fences: a value sitting exactly on a fence is not an outlier,
    # and when IQR == 0 (lower == upper) strict comparisons would discard
    # every row.
    return frame[(values >= lower) & (values <= upper)]

# Apply the IQR filter column by column; each pass works on the frame already
# trimmed by the previous one.
for column in ("fare_amount", "distance_km"):
    df = remove_outliers(column)



In [19]:
# ---------------------------
# 5. Train/Test Split
# ---------------------------

# Predictors: trip distance, pickup hour and passenger count; target: the fare.
features = ["distance_km", "hour", "passenger_count"]
X = df[features]
y = df["fare_amount"]

# train_test_split comes from the notebook's import cell, so the duplicate
# mid-notebook import is not repeated here.
# 80/20 split with a fixed seed so the same rows land in each set on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [21]:
# Linear Regression: fit on the training split, then predict the test fares.
lr = LinearRegression().fit(X_train, y_train)
y_lr = lr.predict(X_test)

# Random Forest: same procedure, seeded so the fitted forest is repeatable.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_rf = rf.predict(X_test)


In [22]:
# ---------------------------
# 7. Evaluation Function
# ---------------------------
def metrics(y_true, y_pred, name):
    """Print R2, RMSE and MAE for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    name : str
        Label shown in the banner line above the scores.

    Returns
    -------
    None
        The scores are only printed.
    """
    print("\n---", name, "---")
    # (label, scorer) pairs, evaluated lazily in display order.
    scorers = (
        ("R2 Score:", r2_score),
        ("RMSE:", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("MAE:", mean_absolute_error),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred))

# Report every model against the same held-out targets, in a fixed order.
for model_name, predictions in (
    ("Linear Regression", y_lr),
    ("Random Forest", y_rf),
):
    metrics(y_test, predictions, model_name)



--- Linear Regression ---
R2 Score: 0.5757471476839663
RMSE: 2.4177263817908625
MAE: 1.730206205783797

--- Random Forest ---
R2 Score: 0.5424210396557114
RMSE: 2.5108905816537797
MAE: 1.8025601399434072
