In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib


### Load and Prepare California Housing Dataset

In [None]:
# --- Fetch the California housing data and assemble one DataFrame ---
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    target=housing.target  # median house value, in units of $100,000
)

# --- Normalise column labels ---
# Trim whitespace, lower-case, turn spaces and slashes into underscores,
# and drop parentheses so every label is a plain snake-style identifier.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(r"[ /]", "_", regex=True)
    .str.replace(r"[()]", "", regex=True)
)
df.head()


### Split Data into Training and Validation Sets

In [None]:
# --- Separate the predictors from the label column ---
y = df['target']
X = df.drop(columns='target')

# --- Partition into training and validation sets ---
# 70% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)


### Train Linear Regression Model

In [None]:
# --- Fit a linear regression on the training split ---
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare `lr` on the last line shows the fitted estimator as the cell output.
lr = LinearRegression().fit(X_train, y_train)
lr


### Predict and Evaluate the Model

In [None]:
# --- Generate predictions for the validation split ---
y_pred = lr.predict(X_val)

# --- Score the predictions against the held-out targets ---
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)

# --- Report both metrics ---
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)


### Visualize Predictions vs Actual

In [None]:
# === Plot predicted vs. actual values for the validation split ===
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_val, y_pred, color='blue', alpha=0.4, label='Predictions')

# The reference diagonal is derived from the values actually drawn (y_val and
# y_pred) rather than from the full target column, so it always spans the
# whole scatter — including predictions that fall outside the observed target
# range — and does not depend on data that is not part of this plot.
diag_min = min(y_val.min(), y_pred.min())
diag_max = max(y_val.max(), y_pred.max())
ax.plot(
    [diag_min, diag_max],
    [diag_min, diag_max],
    color='red',
    linestyle='--',
    label='Ideal Fit',
)

ax.set_xlabel('Actual Median House Value')
ax.set_ylabel('Predicted Median House Value')
ax.set_title('Linear Regression: Actual vs Predicted (California Housing)')
ax.legend()
ax.grid(True)
plt.show()


### Save the Trained Model

In [None]:
# --- Persist the fitted estimator to disk ---
# The file is written relative to the current working directory.
joblib.dump(value=lr, filename="linear_regression_california_housing.joblib")
print("Model saved as 'linear_regression_california_housing.joblib'.")
