In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [7]:
# Sample DataFrame
data = {
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'price': np.random.rand(100) * 1000  # Random house prices
}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,feature1,feature2,price
0,0.727465,0.591918,914.086135
1,0.742418,0.374677,511.250785
2,0.998761,0.578728,878.187978
3,0.831065,0.774435,29.57822
4,0.553689,0.370773,624.805409


In [8]:
## Saving the data
os.chdir(r"D:\OneDrive - Northeastern University\Jupyter Notebook\Machine Learning Algorithms\Datasets")
df.to_csv("Sample_LR_dataset.csv", index=False)

In [9]:
# Step 1: Prepare the data
X = df[['feature1', 'feature2']]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Step 2: Calculate Baseline Predictions
mean_price = y_train.mean()
median_price = y_train.median()
print(mean_price)
print(median_price)

497.06071277436865
501.81924133936064


In [11]:
mean_baseline_train = np.full_like(y_train, mean_price)
mean_baseline_test = np.full_like(y_test, mean_price)

median_baseline_train = np.full_like(y_train, median_price)
median_baseline_test = np.full_like(y_test, median_price)

print(mean_baseline_train)
#print(mean_baseline_test)
#print(median_baseline_test)
#print(mean_baseline_train) 

[497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071277 497.06071277 497.06071277 497.06071277 497.06071277
 497.06071

In [13]:
# Step 3: Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

lr_predictions_train = model.predict(X_train)
lr_predictions_test = model.predict(X_test)

In [14]:
# Mean Baseline
mse_mean_baseline = mean_squared_error(y_test, mean_baseline_test)
mae_mean_baseline = mean_absolute_error(y_test, mean_baseline_test)

print(mse_mean_baseline)
print(mae_mean_baseline)

65886.3798028449
219.99668510696014


In [15]:
# Linear Regression
mse_lr = mean_squared_error(y_test, lr_predictions_test)
mae_lr = mean_absolute_error(y_test, lr_predictions_test)

print(f"Mean Baseline Model - MSE: {mse_mean_baseline}, MAE: {mae_mean_baseline}")
print(f"Linear Regression Model - MSE: {mse_lr}, MAE: {mae_lr}")

Mean Baseline Model - MSE: 65886.3798028449, MAE: 219.99668510696014
Linear Regression Model - MSE: 70268.70613566335, MAE: 221.3134082348136
