In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Load the housing dataset
housing = fetch_california_housing()

X = pd.DataFrame(housing.data, columns=housing.feature_names) 
y = pd.Series(housing.target, name='med_house_value')

# display the first 5 rows of the dataset
print(X.head())

# print the feature names and check for missing values
print("Feature Names: ", X.columns)
print("Missing Values: ", X.isnull().sum())

# generate summary statistics (mean, median, max, min, etc)
print(X.describe())

# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# make predictions
y_pred = lin_reg.predict(X_test)
print(y_pred)
coef = pd.Series(lin_reg.coef_, index = X.columns)
print(coef)

# evaluate model performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
Feature Names:  Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')
Missing Values:  MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20

Interpretation Questions:
What does the R² score tell us about model performance?
- The R^2 score tells us how well the model explains the variances in the data. A 'good' score changes depending on what discipline you are looking at data for. Overall, the R^2 score represents how far the data strays from the line of best fit. In this example, the model explains 57.6% of the data.

Which features seem to have the strongest impact on predictions based on the model’s coefficients?
- It seems that the average bedrooms has the strongest impact. I know this because its absolute value is the closest to one. Longitute has the strongest negative impact by that same logic (abs value is the closest to one out of all the negative values).

How well do the predicted values match the actual values?
- The RMSE is 0.75, suggesting an error of 0.75 of 1 unit of measurement. The model is usable, but not terribly accurate. There are some patterns that have been captured, but it shouldn't be relied on to make precise predictions. 

In [3]:
# Selecting three features based on correlation and domain knowledge
selected_features = ['MedInc', 'AveRooms', 'HouseAge']

# Train a new model using only selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

model_selected = LinearRegression()
model_selected.fit(X_train_selected, y_train)

# Make predictions
y_pred_selected = model_selected.predict(X_test_selected)
print(y_pred_selected)

# Evaluate simplified model
mse_selected = mean_squared_error(y_test, y_pred_selected)
rmse_selected = np.sqrt(mse_selected)
r2_selected = r2_score(y_test, y_pred_selected)

print(f"Simplified Model MSE: {mse_selected}")
print(f"Simplified Model RMSE: {rmse_selected}")
print(f"Simplified Model R² Score: {r2_selected}")

[1.06791912 1.50634095 2.32862562 ... 4.33948872 1.71316594 1.77105121]
Simplified Model MSE: 0.6589108649336336
Simplified Model RMSE: 0.8117332473994358
Simplified Model R² Score: 0.49717158850807075


Interpretation Questions:
How does the simplified model compare to the full model?
-  The simplified model shows that what was already not a great model becomes even less accurate, with a lower R^2 and a higher RMSE.

Would you use this simplified model in practice? Why or why not?
- I would not, its ability to make accurate predictions is quite low. Housing data has broad applications in sociology, politics, etc, but using data that is making poor predictions would throw all of those downstream actions off.

In [4]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and apply it to the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split the scaled data
X_train_scaled, X_test_scaled, _, _ = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model on scaled data
lin_reg_scaled = LinearRegression()
lin_reg_scaled.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_scaled = lin_reg_scaled.predict(X_test_scaled)

# Evaluate model performance
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)
rmse_scaled = root_mean_squared_error(y_test, y_pred)

print("\nScaled Data Model:")
print(f"Mean Squared Error: {mse_scaled:.2f}")
print(f"Root Mean Squared Error: {rmse_scaled:.2f}")
print(f"R² Score: {r2_scaled:.2f}")
print("Model Coefficients (Scaled):")
print(pd.Series(lin_reg_scaled.coef_, index=X.columns))


Scaled Data Model:
Mean Squared Error: 0.56
Root Mean Squared Error: 0.75
R² Score: 0.58
Model Coefficients (Scaled):
MedInc        0.852382
HouseAge      0.122382
AveRooms     -0.305116
AveBedrms     0.371132
Population   -0.002298
AveOccup     -0.036624
Latitude     -0.896635
Longitude    -0.868927
dtype: float64


Interpretation Questions:
Compare the metrics before and after scaling. What changed, and why?
- They didn't change; we just scaled the data but didn't adjust it at all

Did the R² score improve? Why or why not?
- 

What role does feature scaling play in linear regression?
- 
