<a href="https://colab.research.google.com/github/alisony755/DS4400/blob/main/HW2/DS4400_HW2_Problem4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem 4

In [62]:
# Import numpy, pandas, and sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [63]:
# Mount Google Drive inside the Colab runtime
# NOTE(review): the CSV below is fetched over HTTPS by file id, so this mount
# does not appear to be needed for the read itself - confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

# Build the direct-download URL for the shared CSV from its Drive file id
file_id = '1_hNHlID38Uf5lphkEyXVKz5_afFb1eve'
file_path = f"https://drive.google.com/uc?id={file_id}&export=download"

# Load the data and discard the columns that are not used, in a single chain
df_house = pd.read_csv(file_path).drop(columns=['id', 'date', 'zipcode'])

# Preview the first five rows
df_house.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


In [64]:
# Separate features and response
# Predictor: sqft_living, selected with a list so the array stays 2-D (n_samples, 1)
X_feature = df_house.loc[:, ['sqft_living']].to_numpy()

# Response: price divided by 1000, which keeps the reported MSE values smaller
y_target = (df_house['price'] / 1000).to_numpy()

In [65]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_feature,
    y_target,
    test_size=0.2,
    random_state=42,
)

In [66]:
# Scale features
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so the test data does not leak in
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4.1 Polynomial feature expansion and closed-form linear regression helpers

In [67]:
def polynomial_features(X, p):
  """ Expands a feature matrix into polynomial features up to the given degree p

  Args:
    X (numpy.ndarray): Original feature matrix of shape (n_samples, 1). A 1-D
                       array of shape (n_samples,) is treated as a single column.
    p (int): Maximum degree of polynomial (must be at least 1)

  Returns:
    numpy.ndarray: New feature matrix of shape (n_samples, p), where
                   columns are [X, X^2, ..., X^p]

  Raises:
    ValueError: If p is less than 1

  """

  # A degree below 1 has no meaningful expansion; fail loudly instead of
  # silently returning the degree-1 features
  if p < 1:
    raise ValueError(f"p must be at least 1, got {p}")

  # Promote a flat vector to a column so the powers are stacked side by side
  # rather than being concatenated end to end
  X = np.asarray(X)
  if X.ndim == 1:
    X = X.reshape(-1, 1)

  # Build every power first and stack once; the result is always a new array,
  # so the caller's X is never aliased
  return np.hstack([X**d for d in range(1, p + 1)])

In [68]:
def add_bias(X):
  """ Prepends an intercept column of 1s to a 2-D feature matrix

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)

  Returns:
    numpy.ndarray: Matrix of shape (n_samples, n_features + 1) whose first
                   column is all 1s

  """

  # One intercept entry per sample, kept 2-D so it joins X along the column axis
  n_rows = X.shape[0]
  intercept_col = np.full((n_rows, 1), 1.0)

  return np.concatenate((intercept_col, X), axis=1)

In [69]:
def closed_form_train(X, y):
  """ Fits linear regression parameters with the closed-form normal equation

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
    y (numpy.ndarray): Response vector of shape (n_samples,) or (n_samples, 1)

  Returns:
    numpy.ndarray: Parameter vector theta with n_features + 1 entries,
                   where theta[0] is the intercept

  """

  # Design matrix: intercept column followed by the supplied features
  design = add_bias(X)

  # theta = pinv(X^T X) X^T y; the pseudo-inverse keeps this defined even when
  # the Gram matrix is singular.
  # NOTE(review): forming X^T X squares the condition number of the design
  # matrix; np.linalg.lstsq is a more stable alternative if high degrees misbehave.
  gram_pinv = np.linalg.pinv(design.T @ design)
  return gram_pinv @ design.T @ y

In [70]:
def predict(X, theta):
  """ Computes model predictions for a feature matrix from trained parameters

  Args:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
    theta (numpy.ndarray): Parameter vector of shape (n_features + 1,),
                           where theta[0] is the intercept

  Returns:
    numpy.ndarray: Predicted response vector of shape (n_samples,)

  """

  # The intercept column lines up with theta[0], so one matrix product
  # yields intercept + weighted features for every sample
  return np.matmul(add_bias(X), theta)

### 4.2 Train and evaluate polynomial regression for degrees 1, 3, and 5

In [71]:
# Fit one closed-form model per polynomial degree and record its metrics
degrees = [1, 3, 5]
results = []

for p in degrees:
    # Expand the scaled feature into powers 1..p for both splits
    X_train_poly = polynomial_features(X_train_scaled, p)
    X_test_poly = polynomial_features(X_test_scaled, p)

    # Estimate the parameters from the training split only
    theta = closed_form_train(X_train_poly, y_train)

    # Predict on both splits with the same parameters
    y_train_pred = predict(X_train_poly, theta)
    y_test_pred = predict(X_test_poly, theta)

    # Mean squared error on each split
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Coefficient of determination on each split
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # One summary row per degree; key order fixes the table's column order
    results += [{
        'Degree': p,
        'Train MSE': train_mse,
        'Test MSE': test_mse,
        'Train R^2': train_r2,
        'Test R^2': test_r2,
    }]

# Collect the rows into a table and print it
results_df = pd.DataFrame(results)
print(results_df)

   Degree     Train MSE      Test MSE  Train R^2  Test R^2
0       1  66319.347785  76484.977062   0.492384  0.494069
1       3  58862.529017  83663.526745   0.549459  0.446585
2       5  58812.805203  85501.107851   0.549840  0.434429
