In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pre-cleaned dataset from the working directory; the trailing bare
# expression makes the notebook display (n_rows, n_columns) as the cell output.
df = pd.read_csv(
    "cleaned_dataset.csv",
)
df.shape

(2000, 8)

In [3]:
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [4]:
# Predictor columns fed to every model; "Y1" is the regression target.
feature_columns = ["X1", "X3", "X5", "X6", "X7", "X8"]
X = df[feature_columns]
y = df["Y1"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate regressors, all compared with their default hyperparameters.
# Every estimator that accepts a seed is given random_state=42 (the same value
# used for the train/test split and the grid-search cells) so that re-running
# the notebook reproduces the same scores instead of drifting run to run.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": RidgeCV(),
    "Lasso": LassoCV(),
    "Knn" : KNeighborsRegressor(),
    "SVM" : SVR(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost" : AdaBoostRegressor(random_state=42),
    "XGBoost" : XGBRegressor(random_state=42)
}

# Maps model name -> {"MSE": ..., "R2": ...} measured on the held-out test set.
results = {}

for name, model in models.items():
    # Fit on the training split only, then score on the unseen test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R2": r2}

# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print("Using Train Test split")
print(results_df)

Using Train Test split
                             MSE        R2
LinearRegression        7.049982  0.912824
Ridge                   7.041604  0.912927
Lasso                   7.010943  0.913306
Knn                     9.548835  0.881924
SVM                    11.021661  0.863712
DecisionTreeRegressor   4.176748  0.948352
RandomForest            2.108250  0.973930
GradientBoosting        1.994488  0.975337
AdaBoost                5.737253  0.929056
XGBoost                 1.827006  0.977408


In [6]:
from sklearn.model_selection import cross_val_score,cross_validate,KFold,StratifiedKFold

# Shuffled 5-fold split with a fixed seed; the same splitter is passed to every
# model so all of them are scored on identical folds.
kf=KFold(n_splits=5,shuffle=True,random_state=3)

# Maps model name -> mean cross-validated MSE and R2 over the five folds.
results = {}

for name, model in models.items():
    # A single cross_validate call evaluates both metrics on the same five fits.
    # Calling cross_val_score once per metric would refit every model twice and,
    # for unseeded stochastic estimators, report MSE and R2 from different fits.
    cv_scores = cross_validate(
        model, X, y, cv=kf, scoring=("neg_mean_squared_error", "r2")
    )
    # scikit-learn reports MSE negated (higher is better); flip the sign back.
    mse_scores = -cv_scores["test_neg_mean_squared_error"]
    r2_scores = cv_scores["test_r2"]
    # NOTE(review): the "MSE mean " key carries a trailing space; it is kept
    # unchanged so the resulting column label stays the same.
    results[name] = {"MSE mean ": mse_scores.mean(),"R2 mean": r2_scores.mean()}

# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print("Using K fold")
print(results_df)

Using K fold
                       MSE mean    R2 mean
LinearRegression        7.290630  0.908260
Ridge                   7.289924  0.908269
Lasso                   7.356122  0.907435
Knn                    10.130552  0.872552
SVM                    12.684347  0.840425
DecisionTreeRegressor   4.290339  0.946109
RandomForest            2.188170  0.971953
GradientBoosting        2.089782  0.973716
AdaBoost                5.542440  0.929574
XGBoost                 2.197507  0.972342


Top three models by mean 5-fold cross-validated MSE (lower is better):

GradientBoosting → MSE = 2.09, R² = 0.974

RandomForest → MSE = 2.19, R² = 0.972

XGBoost → MSE = 2.20, R² = 0.972

In [7]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [13]:
# Hyperparameter search spaces, one entry per model to be tuned.
# Each inner mapping goes from a constructor argument name to the list of
# candidate values tried for it.
param_grids = dict(
    RandomForest=dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    GradientBoosting=dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
    ),
    XGBoost=dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
    ),
)

In [9]:
# Exhaustive search over the RandomForest grid using 5-fold cross-validation on
# the full dataset. The scorer is negated MSE, so the best score is negated
# again below to report a plain MSE.
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(
    rf,
    param_grids["RandomForest"],
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best MSE: 2.467235082770118


In [None]:
# Exhaustive search over the GradientBoosting grid using 5-fold
# cross-validation on the full dataset, scored on negated MSE.
# NOTE(review): the variable is still called `rf` although it holds a
# gradient-boosting estimator here.
rf = GradientBoostingRegressor(random_state=42)
grid = GridSearchCV(
    rf,
    param_grids["GradientBoosting"],
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X, y)

# Negate the best score to turn the negated-MSE scorer back into a plain MSE.
print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}
Best MSE: 2.208404506345526


In [14]:
# Exhaustive search over the XGBoost grid using 5-fold cross-validation on the
# full dataset, scored on negated MSE.
# NOTE(review): the variable is still called `rf` although it holds an
# XGBRegressor here.
rf = XGBRegressor(random_state=42)
grid = GridSearchCV(
    rf,
    param_grids["XGBoost"],
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X, y)

# Negate the best score to turn the negated-MSE scorer back into a plain MSE.
print("Best Params:", grid.best_params_)
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}
Best MSE: 2.174429440504435


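After hyperparameter tuning, XGBoost performs best: it has the lowest 5-fold cross-validated MSE (≈ 2.17, versus ≈ 2.21 for GradientBoosting and ≈ 2.47 for RandomForest).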