<a href="https://colab.research.google.com/github/alisony755/DS4400/blob/main/HW2/DS4400_HW2_Problem5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem 5

In [19]:
# Import numpy, pandas, and sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Mount Google Drive in the Colab runtime
# (the CSV below is fetched over an HTTPS download URL, not from the mounted path)
from google.colab import drive
drive.mount('/content/drive')

# Build the direct-download URL for the shared CSV, load it, and drop the
# id, date, and zipcode columns, which are not needed for the regression
file_id = '1_hNHlID38Uf5lphkEyXVKz5_afFb1eve'
file_path = f"https://drive.google.com/uc?id={file_id}&export=download"
df_house = pd.read_csv(file_path).drop(columns=['id', 'date', 'zipcode'])

# Preview the first few rows
df_house.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


In [21]:
# Response: price divided by 1000 to reduce the magnitude of the MSE
y = df_house["price"].values / 1000

# Features: every column except the response
X = df_house.drop(columns=["price"]).values

In [22]:
# Hold out 20% of the rows for testing (80% for training);
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [23]:
# Standardize features: fit the scaler on the training split only,
# then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 5.1

In [24]:
def add_bias(X):
    """Prepend an intercept column of ones to a 2-D feature matrix.

    Args:
      X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)

    Returns:
      numpy.ndarray: Matrix of shape (n_samples, n_features + 1) whose first
        column is all ones, followed by the columns of X
    """
    row_count = X.shape[0]
    intercept_col = np.ones((row_count, 1))

    # Join column-wise, with the ones placed first
    return np.concatenate((intercept_col, X), axis=1)

In [25]:
def gradient_descent(X, y, alpha, num_iters):
    """Train linear regression with batch gradient descent on the MSE cost.

    A bias column is prepended to X with add_bias, theta is initialized to
    zeros, and every iteration applies
        theta <- theta - alpha * (2 / n_samples) * X_b^T (X_b theta - y)

    Args:
      X (numpy.ndarray): Feature matrix of shape (n_samples, n_features)
      y (numpy.ndarray): Target vector of shape (n_samples,); a column or row
        vector such as (n_samples, 1) is flattened to 1-D first
      alpha (float): Learning rate
      num_iters (int): Number of iterations

    Returns:
      numpy.ndarray: Learned parameter vector theta of shape (n_features + 1,),
        with the intercept as its first entry
    """
    # Add bias column to features
    X_b = add_bias(X)

    # Number of samples, and number of parameters (features + bias)
    n_samples, n_params = X_b.shape

    # A 2-D target (e.g. shape (n, 1)) would not line up with the 1-D
    # prediction vector below: the subtraction would broadcast to a 2-D
    # error or fail with an opaque shape error. Flatten it up front.
    y = np.asarray(y)
    if y.ndim > 1:
        y = y.reshape(-1)

    # Initialize parameters to zero
    theta = np.zeros(n_params)

    for _ in range(num_iters):
        # Residuals of the current fit
        error = X_b @ theta - y

        # Gradient of the mean squared error with respect to theta
        gradient = (2 / n_samples) * (X_b.T @ error)

        # Step against the gradient
        theta = theta - alpha * gradient

    return theta

### 5.2

In [26]:
# Learning rates to test
learning_rates = [0.01, 0.1, 0.5]

# Iteration values to test
iterations_list = [10, 50, 100]

# The bias-augmented matrices depend only on the scaled data, not on the
# hyperparameters, so build them once instead of on every grid point
X_train_b = add_bias(X_train_scaled)
X_test_b = add_bias(X_test_scaled)

# One row per (learning rate, iteration count) combination, in the column
# order used for results_df below
results = []

# Loop over each learning rate
for alpha in learning_rates:

    # Loop over each iteration count
    for num_iters in iterations_list:

        # Train model using gradient descent on scaled data
        theta = gradient_descent(X_train_scaled, y_train, alpha, num_iters)

        # Predict training and testing values
        y_train_pred = X_train_b @ theta
        y_test_pred = X_test_b @ theta

        # Compute training and testing MSE using sklearn
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)

        # Compute training and testing R^2 scores
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        # Store experiment results
        results.append([
            alpha,
            num_iters,
            train_mse,
            test_mse,
            train_r2,
            test_r2,
            theta
        ])

# Convert results into DataFrame
results_df = pd.DataFrame(
    results,
    columns=[
        "Learning Rate",
        "Iterations",
        "Train MSE",
        "Test MSE",
        "Train R2",
        "Test R2",
        "Theta"
    ]
)

# Display results table
print(results_df)

   Learning Rate  Iterations      Train MSE       Test MSE       Train R2  \
0           0.01          10   2.595410e+05   2.775394e+05  -9.865578e-01   
1           0.01          50   8.172058e+04   9.051283e+04   3.745010e-01   
2           0.01         100   4.600254e+04   5.304284e+04   6.478911e-01   
3           0.10          10   4.414163e+04   5.102153e+04   6.621347e-01   
4           0.10          50   3.984712e+04   4.599467e+04   6.950054e-01   
5           0.10         100   3.983439e+04   4.599601e+04   6.951029e-01   
6           0.50          10   9.263391e+16   1.001676e+17  -7.090310e+11   
7           0.50          50   8.571168e+65   9.268238e+65  -6.560474e+60   
8           0.50         100  1.383176e+127  1.495666e+127 -1.058699e+122   

         Test R2                                              Theta  
0  -8.358607e-01  [98.37239955647313, 12.686748698459004, 24.242...  
1   4.012780e-01  [341.9292299650289, 3.559407960219976, 33.3436...  
2   6.491335e-01  [