In [5]:
import warnings
warnings.filterwarnings("ignore")


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Load dataset
data = pd.read_csv('../data/toy_data.csv')

# Split the data
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
model = LinearRegression()

# Randomized Parameter Search (no hyperparameters for linear regression, but you could tune the fit_intercept option)
param_dist = {
    'fit_intercept': [True, False]
}

# Perform random search and cross-validation
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Best Model
best_model = random_search.best_estimator_

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Predictions and evaluation
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Best Linear Regression MSE: {mse}")
print(f"Cross-validation MSE scores: {cv_scores.mean()}")


Best Linear Regression MSE: 0.05207014552743815
Cross-validation MSE scores: -0.052568848851743
