In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Sample DataFrame
data = {
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'price': np.random.rand(100) * 100000  # Random house prices
}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,feature1,feature2,price
0,0.614125,0.102929,17972.217561
1,0.789224,0.105684,56467.745198
2,0.769421,0.624308,25360.445406
3,0.2641,0.691542,88732.193684
4,0.048123,0.841424,94393.679347


In [4]:
# Step 1: Prepare the data
X = df[['feature1', 'feature2']]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Step 2: Calculate Baseline Predictions
mean_price = y_train.mean()
median_price = y_train.median()
print(mean_price)
print(median_price)

51540.07671881725
53607.36929983534


In [8]:
mean_baseline_train = np.full_like(y_train, mean_price)
mean_baseline_test = np.full_like(y_test, mean_price)

median_baseline_train = np.full_like(y_train, median_price)
median_baseline_test = np.full_like(y_test, median_price)

print(mean_baseline_train)
#print(mean_baseline_test)
#print(median_baseline_test)
#print(mean_baseline_train) 

[51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07671882 51540.07671882 51540.07671882
 51540.07671882 51540.07

In [9]:
# Step 3: Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

lr_predictions_train = model.predict(X_train)
lr_predictions_test = model.predict(X_test)

In [11]:
# Mean Baseline
mse_mean_baseline = mean_squared_error(y_test, mean_baseline_test)
mae_mean_baseline = mean_absolute_error(y_test, mean_baseline_test)

print(mse_mean_baseline)
print(mae_mean_baseline)

814942355.63489
24679.122171969077


In [12]:
# Linear Regression
mse_lr = mean_squared_error(y_test, lr_predictions_test)
mae_lr = mean_absolute_error(y_test, lr_predictions_test)

print(f"Mean Baseline Model - MSE: {mse_mean_baseline}, MAE: {mae_mean_baseline}")
print(f"Linear Regression Model - MSE: {mse_lr}, MAE: {mae_lr}")

Mean Baseline Model - MSE: 814942355.63489, MAE: 24679.122171969077
Linear Regression Model - MSE: 842450414.2867721, MAE: 25798.706563160034
