In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('cleaned_dataset.csv')
df.shape

(2039, 6)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   fare_amount        2039 non-null   float64
 1   pickup_longitude   2039 non-null   float64
 2   pickup_latitude    2039 non-null   float64
 3   dropoff_longitude  2039 non-null   float64
 4   dropoff_latitude   2039 non-null   float64
 5   Year               2039 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 95.7 KB


In [3]:
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [4]:
df.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'Year'],
      dtype='object')

In [5]:
X=df.drop(columns='fare_amount')
y=df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": RidgeCV(),
    "Lasso": LassoCV(),
    "Knn" : KNeighborsRegressor(),
    "SVM" : SVR(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "AdaBoost" : AdaBoostRegressor(),
    "XGBoost" : XGBRegressor()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae=mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R2": r2}
results_df = pd.DataFrame(results).T  # transpose
results_df = results_df.sort_values(by="R2", ascending=False)
results_df.reset_index(inplace=True)
results_df.rename(columns={"index": "Model"}, inplace=True)
results_df.insert(0, "Rank", range(1, len(results_df) + 1))
print("Using Train/Test split")
print(results_df)

Using Train/Test split
   Rank                  Model        MSE        R2
0     1       GradientBoosting  22.317146  0.715901
1     2                    Knn  23.246311  0.704073
2     3                XGBoost  26.466943  0.663074
3     4           RandomForest  26.691654  0.660213
4     5  DecisionTreeRegressor  27.457679  0.650462
5     6               AdaBoost  47.369681  0.396981
6     7                  Ridge  60.661569  0.227774
7     8                  Lasso  60.694595  0.227354
8     9       LinearRegression  60.708865  0.227172
9    10                    SVM  83.181922 -0.058911


In [16]:
from sklearn.model_selection import cross_val_score,KFold,StratifiedKFold
kf=KFold(n_splits=5,shuffle=True,random_state=3)
results = {}
for name, model in models.items():
    mse_scores = -cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error")
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring="r2")
    results[name] = {"MSE mean ": mse_scores.mean(),"R2 mean": r2_scores.mean()}
results_df = pd.DataFrame(results).T  # transpose
results_df = results_df.sort_values(by="R2 mean", ascending=False)
results_df.reset_index(inplace=True)
results_df.rename(columns={"index": "Model"}, inplace=True)
results_df.insert(0, "Rank", range(1, len(results_df) + 1))
print("Using K fold")
print(results_df)

Using K fold
   Rank                  Model   MSE mean    R2 mean
0     1           RandomForest   36.150905  0.699238
1     2                    Knn   38.055696  0.677723
2     3                XGBoost   38.292434  0.677163
3     4       GradientBoosting   37.930682  0.670778
4     5  DecisionTreeRegressor   66.810924  0.290899
5     6                  Ridge   72.101632  0.249988
6     7       LinearRegression   72.109468  0.249852
7     8                  Lasso   72.110674  0.249817
8     9               AdaBoost   69.756412  0.202030
9    10                    SVM  102.878079 -0.083227


1.Random Forest
2.KNN
3.XGBoost
4.Gradient Boosting

In [18]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
param_grids = {
    "RandomForest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
    },
    "GradientBoosting": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7]
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7],
        "subsample": [0.7, 0.9, 1.0]
    },
    "KNN": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "p": [1, 2]  # Manhattan / Euclidean
    }
}

In [19]:
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grids["RandomForest"],
    cv=5,
    scoring="neg_mean_squared_error"
)
grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best MSE: 35.31922052220995


In [23]:
rf = GradientBoostingRegressor(random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grids["GradientBoosting"],
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)
grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Best MSE: 33.84313790805742


In [21]:
rf = XGBRegressor(random_state=42,n_jobs=1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grids["XGBoost"],
    cv=5,
    scoring="neg_mean_squared_error")
grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}
Best MSE: 33.42801867503699


In [22]:
knn = KNeighborsRegressor()

grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grids["KNN"], 
    scoring="neg_mean_squared_error",
    n_jobs=-1)

grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
Best MSE: 35.10621381343167


In [30]:
results = [
    ("XGBRegressor", {"learning_rate": 0.05, "max_depth": 5, "n_estimators": 200, "subsample": 0.7}, 33.428019),
    ("GradientBoosting", {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 300}, 33.843138),
    ("KNN", {"n_neighbors": 3, "p": 2, "weights": "distance"}, 35.106214),
    ("RandomForest", {"max_depth": None, "min_samples_split": 5, "n_estimators": 200}, 35.319221)
]

df = pd.DataFrame(results, columns=["Model", "Best Params", "Best MSE"])
df["Best Params"] = df["Best Params"].apply(lambda d: ", ".join(f"{k}={v}" for k, v in d.items()))

df["Best Params"] = df["Best Params"].apply(lambda x: x.center(70))

df["Best MSE"] = df["Best MSE"].apply(lambda x: f"{x:10.6f}")

print(df.to_string(index=False))

           Model                                                            Best Params   Best MSE
    XGBRegressor    learning_rate=0.05, max_depth=5, n_estimators=200, subsample=0.7     33.428019
GradientBoosting            learning_rate=0.1, max_depth=3, n_estimators=300             33.843138
             KNN                  n_neighbors=3, p=2, weights=distance                   35.106214
    RandomForest         max_depth=None, min_samples_split=5, n_estimators=200           35.319221


XGBRegressor is best