In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the cleaned dataset from the working directory; the bare trailing
# expression makes the notebook display the frame's (rows, columns) tuple.
df = pd.read_csv("cleaned_dataset.csv")
df.shape

(2000, 8)

In [3]:
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [4]:
# Columns used as predictors, and the single regression target.
feature_cols = ["X1", "X3", "X5", "X6", "X7", "X8"]
X = df[feature_cols]
y = df["Y1"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate regressors, keyed by the label used in the result tables.
# Estimators with internal randomness are seeded (42, the same seed used for
# the train/test split and the grid-search cells) so that Restart & Run All
# reproduces the same scores; the remaining models keep library defaults.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": RidgeCV(),
    "Lasso": LassoCV(),
    "Knn" : KNeighborsRegressor(),
    "SVM" : SVR(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost" : AdaBoostRegressor(random_state=42),
    "XGBoost" : XGBRegressor(random_state=42)
}

# Per-model metrics on the held-out test split: {label: {"MSE": ..., "R2": ...}}.
results = {}

for name, model in models.items():
    # Fit on the training split only, then score on data the model has not seen.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Only MSE and R2 are reported; the MAE that used to be computed here was
    # never stored or printed, so that dead computation has been removed.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R2": r2}

# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print("Using Train Test split")
print(results_df)

Using Train Test split
                             MSE        R2
LinearRegression        6.788114  0.914741
Ridge                   6.786203  0.914765
Lasso                   6.841158  0.914075
Knn                     9.657288  0.878704
SVM                    11.289046  0.858210
DecisionTreeRegressor   3.390177  0.957419
RandomForest            1.902213  0.976108
GradientBoosting        2.080670  0.973867
AdaBoost                5.736684  0.927947
XGBoost                 1.958622  0.975400


In [6]:
from sklearn.model_selection import cross_val_score,KFold,StratifiedKFold,cross_validate

# Shuffled 5-fold splitter with a fixed seed, shared by every model so that all
# of them are scored on identical folds.
kf=KFold(n_splits=5,shuffle=True,random_state=3)

# Per-model cross-validated metrics: {label: {"MSE mean ": ..., "R2 mean": ...}}.
results = {}

for name, model in models.items():
    # A single cross_validate call evaluates both scorers on the same fitted
    # model in each fold. Two separate cross_val_score calls would fit every
    # model twice per fold and, for estimators without a fixed random_state,
    # report MSE and R2 from different fits.
    cv_scores = cross_validate(
        model, X, y, cv=kf, scoring=("neg_mean_squared_error", "r2")
    )
    # The scorer returns negated MSE (higher is better); flip the sign back.
    mse_scores = -cv_scores["test_neg_mean_squared_error"]
    r2_scores = cv_scores["test_r2"]
    results[name] = {"MSE mean ": mse_scores.mean(),"R2 mean": r2_scores.mean()}

# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print("Using K fold")
print(results_df)

Using K fold
                       MSE mean    R2 mean
LinearRegression        7.146332  0.911550
Ridge                   7.145445  0.911561
Lasso                   7.218666  0.910663
Knn                    10.320367  0.872320
SVM                    12.247289  0.848453
DecisionTreeRegressor   4.009320  0.950527
RandomForest            2.055743  0.973781
GradientBoosting        2.041222  0.974707
AdaBoost                5.342795  0.930979
XGBoost                 1.991800  0.975366


Top three models by mean 5-fold cross-validation score (values from the K-fold table above):

GradientBoosting → MSE = 2.04, R² = 0.975

RandomForest → MSE = 2.06, R² = 0.974

XGBoost → MSE = 1.99, R² = 0.975

In [12]:
from sklearn.model_selection import GridSearchCV

In [8]:
# Hyper-parameter search spaces, keyed by model label. Each inner mapping is a
# GridSearchCV-style grid: parameter name -> list of candidate values.
param_grids = dict(
    RandomForest=dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    GradientBoosting=dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
    ),
    XGBoost=dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
    ),
)

In [9]:
# Exhaustive 5-fold grid search over the RandomForest search space, run on the
# full dataset (X, y) with a seeded base estimator.
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(
    rf,
    param_grids["RandomForest"],
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X, y)

# The scorer is negated MSE, so the sign is flipped to report a plain MSE.
best_mse = -grid.best_score_
print("Best Params:", grid.best_params_)
print("Best MSE:", best_mse)

Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best MSE: 2.686564436832934


In [10]:
# Exhaustive 5-fold grid search over the GradientBoosting search space, run on
# the full dataset (X, y). The estimator gets its own name: it was previously
# bound to `rf`, which silently replaced the RandomForest object with a
# different model type. Note that `grid` is rebound here, so the previous
# search result is no longer reachable through that name.
gbr = GradientBoostingRegressor(random_state=42)
grid = GridSearchCV(
    estimator=gbr,
    param_grid=param_grids["GradientBoosting"],
    # NOTE(review): per the scikit-learn docs an integer cv builds unshuffled
    # folds for regressors, unlike the shuffled `kf` used in the K-fold
    # comparison, so the two sets of scores are not directly comparable.
    cv=5,
    scoring="neg_mean_squared_error"
)
grid.fit(X, y)

print("Best Params:", grid.best_params_)
# The scorer is negated MSE, so the sign is flipped to report a plain MSE.
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best MSE: 2.1843651463740583


In [11]:
# Exhaustive 5-fold grid search over the XGBoost search space, run on the full
# dataset (X, y). The estimator gets its own name: it was previously bound to
# `rf`, a copy-paste leftover that mislabelled an XGBRegressor as a random
# forest. Note that `grid` is rebound here, so the previous search result is
# no longer reachable through that name.
xgb_reg = XGBRegressor(random_state=42)
grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grids["XGBoost"],
    # NOTE(review): per the scikit-learn docs an integer cv builds unshuffled
    # folds for regressors, unlike the shuffled `kf` used in the K-fold
    # comparison, so the two sets of scores are not directly comparable.
    cv=5,
    scoring="neg_mean_squared_error")
grid.fit(X, y)

print("Best Params:", grid.best_params_)
# The scorer is negated MSE, so the sign is flipped to report a plain MSE.
print("Best MSE:", -grid.best_score_)

Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}
Best MSE: 2.0807607500237557


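After hyper-parameter tuning, XGBoost performs best: it has the lowest cross-validated MSE (≈ 2.08), ahead of GradientBoosting (≈ 2.18) and RandomForest (≈ 2.69).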