In [1]:
# Problem 2

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load data
train_df = pd.read_csv("train (1).csv")
test_df = pd.read_csv("test (2).csv")

# Drop excluded columns and scale price
DROP_COLS = ["id", "date", "zipcode"]

def preprocess(df):
    df = df.copy()
    df["price"] = df["price"] / 1000.0  # scale
    X = df.drop(columns=["price"] + DROP_COLS, errors="ignore")
    y = df["price"]
    return X, y

X_train_raw, y_train = preprocess(train_df)
X_test_raw, y_test = preprocess(test_df)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Metrics
train_mse = mean_squared_error(y_train, train_pred)
train_r2 = r2_score(y_train, train_pred)

test_mse = mean_squared_error(y_test, test_pred)
test_r2 = r2_score(y_test, test_pred)

# Coefficients table
coef_table = pd.DataFrame({
    "Feature": X_train_raw.columns,
    "Coefficient": model.coef_
}).sort_values(by="Coefficient", key=lambda x: abs(x), ascending=False)

train_mse, train_r2, test_mse, test_r2, coef_table


(31415.747916100863,
 0.7271450489303788,
 58834.673978213985,
 0.6471195893437872,
           Feature  Coefficient
 9           grade    92.511076
 14            lat    78.129852
 12       yr_built   -68.043173
 6      waterfront    64.230911
 3     sqft_living    57.161582
 10     sqft_above    48.439051
 7            view    47.610288
 16  sqft_living15    45.479128
 11  sqft_basement    27.688812
 2       bathrooms    18.456913
 13   yr_renovated    17.341926
 17     sqft_lot15   -12.906560
 1        bedrooms   -12.807339
 8       condition    12.647609
 4        sqft_lot    11.127338
 0      Unnamed: 0     8.456024
 5          floors     8.151038
 15           long    -1.437669)

In [12]:
#Problem 3

def preprocess(df):
    df = df.copy()
    df["price"] = df["price"] / 1000.0  # divide price by 1000
    X = df.drop(columns=["price"] + DROP_COLS, errors="ignore")
    y = df["price"].to_numpy()
    return X, y

X_train_raw, y_train = preprocess(train_df)
X_test_raw,  y_test  = preprocess(test_df)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

feature_names = list(X_train_raw.columns)


pkg_model = LinearRegression()
pkg_model.fit(X_train, y_train)

pkg_train_pred = pkg_model.predict(X_train)
pkg_test_pred  = pkg_model.predict(X_test)

pkg_train_mse = mean_squared_error(y_train, pkg_train_pred)
pkg_train_r2  = r2_score(y_train, pkg_train_pred)
pkg_test_mse  = mean_squared_error(y_test,  pkg_test_pred)
pkg_test_r2   = r2_score(y_test,  pkg_test_pred)


def fit_closed_form(X, y):
    """
    Closed-form linear regression with intercept using matrix operations.
    Returns theta of shape (d+1,), where theta[0] is intercept.
    """
    N = X.shape[0]
    Xb = np.hstack([np.ones((N, 1)), X])         
    theta = np.linalg.pinv(Xb.T @ Xb) @ (Xb.T @ y)  
    return theta

def predict_closed_form(X, theta):
    """Predict using closed-form theta (includes intercept)."""
    N = X.shape[0]
    Xb = np.hstack([np.ones((N, 1)), X])
    return Xb @ theta

theta_cf = fit_closed_form(X_train, y_train)

cf_train_pred = predict_closed_form(X_train, theta_cf)
cf_test_pred  = predict_closed_form(X_test,  theta_cf)

cf_train_mse = mean_squared_error(y_train, cf_train_pred)
cf_train_r2  = r2_score(y_train, cf_train_pred)
cf_test_mse  = mean_squared_error(y_test,  cf_test_pred)
cf_test_r2   = r2_score(y_test,  cf_test_pred)


print("Package (sklearn)")
print("Train MSE:", pkg_train_mse)
print("Train R2 :", pkg_train_r2)
print("Test  MSE:", pkg_test_mse)
print("Test  R2 :", pkg_test_r2)

print("\nClosed-form ")
print("Train MSE:", cf_train_mse)
print("Train R2 :", cf_train_r2)
print("Test  MSE:", cf_test_mse)
print("Test  R2 :", cf_test_r2)


# Closed-form coefficients
coef_table_cf = pd.DataFrame({
    "Feature": feature_names,
    "Closed_form_coef": theta_cf[1:],
    "Sklearn_coef": pkg_model.coef_
}).sort_values(by="Closed_form_coef", key=lambda s: np.abs(s), ascending=False)

print("\nIntercept (closed-form):", theta_cf[0])
print("\nTop 10 coefficients by |value| (closed-form):")
print(coef_table_cf.head(10).to_string(index=False))


print("\nMax absolute difference between sklearn and closed-form predictions (train):",
      np.max(np.abs(pkg_train_pred - cf_train_pred)))
print("Max absolute difference between sklearn and closed-form predictions (test):",
      np.max(np.abs(pkg_test_pred - cf_test_pred)))


Package (sklearn)
Train MSE: 31415.747916100863
Train R2 : 0.7271450489303788
Test  MSE: 58834.673978213985
Test  R2 : 0.6471195893437872

Closed-form 
Train MSE: 31415.747916100863
Train R2 : 0.7271450489303788
Test  MSE: 58834.67397821392
Test  R2 : 0.6471195893437875

Intercept (closed-form): 520.4148340000004

Top 10 coefficients by |value| (closed-form):
      Feature  Closed_form_coef  Sklearn_coef
        grade         92.511076     92.511076
          lat         78.129852     78.129852
     yr_built        -68.043173    -68.043173
   waterfront         64.230911     64.230911
  sqft_living         57.161582     57.161582
   sqft_above         48.439051     48.439051
         view         47.610288     47.610288
sqft_living15         45.479128     45.479128
sqft_basement         27.688812     27.688812
    bathrooms         18.456913     18.456913

Max absolute difference between sklearn and closed-form predictions (train): 3.240074875066057e-12
Max absolute difference between 

In [11]:
#Problem 4

# y: 
y_train = (train_df["price"] / 1000.0).to_numpy()
y_test  = (test_df["price"] / 1000.0).to_numpy()

# x:
x_train_raw = train_df["sqft_living"].to_numpy().reshape(-1, 1)
x_test_raw  = test_df["sqft_living"].to_numpy().reshape(-1, 1)

# scale x
scaler_x = StandardScaler()
x_train = scaler_x.fit_transform(x_train_raw)
x_test  = scaler_x.transform(x_test_raw)

def poly_design(x, p):
    return np.hstack([x**k for k in range(1, p+1)]) 

def fit_closed_form(X, y):
    N = X.shape[0]
    Xb = np.hstack([np.ones((N, 1)), X])
    theta = np.linalg.pinv(Xb.T @ Xb) @ (Xb.T @ y)
    return theta

def predict_closed_form(X, theta):
    N = X.shape[0]
    Xb = np.hstack([np.ones((N, 1)), X])
    return Xb @ theta

rows = []
for p in [1,2,3,4,5]:
    Xp_train = poly_design(x_train, p)
    Xp_test  = poly_design(x_test, p)

    theta = fit_closed_form(Xp_train, y_train)
    pred_train = predict_closed_form(Xp_train, theta)
    pred_test  = predict_closed_form(Xp_test, theta)

    rows.append([p,
                 mean_squared_error(y_train, pred_train), r2_score(y_train, pred_train),
                 mean_squared_error(y_test, pred_test),   r2_score(y_test, pred_test)])

results = pd.DataFrame(rows, columns=["Degree p","Train MSE","Train R2","Test MSE","Test R2"])
print(results)


   Degree p     Train MSE  Train R2       Test MSE   Test R2
0         1  57947.526161  0.496709   88575.978543  0.468736
1         2  54822.665116  0.523849   71791.679479  0.569406
2         3  53785.194716  0.532860   99833.483763  0.401216
3         4  52795.774758  0.541453  250979.274285 -0.505331
4         5  52626.111955  0.542927  570616.914822 -2.422464


In [15]:
def preprocess(df):
    df = df.copy()
    y = (df["price"] / 1000.0).to_numpy()
    X = df.drop(columns=["price"] + DROP_COLS, errors="ignore")
    return X, y

X_train_raw, y_train = preprocess(train_df)
X_test_raw,  y_test  = preprocess(test_df)

# scale X 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

def add_intercept(X):
    return np.hstack([np.ones((X.shape[0], 1)), X])

Xb_train = add_intercept(X_train)
Xb_test  = add_intercept(X_test)

def gd_linear_regression(Xb, y, alpha=0.1, iters=100):
    N, d = Xb.shape
    theta = np.zeros(d)
    for _ in range(iters):
        error = Xb @ theta - y
        grad = (2.0 / N) * (Xb.T @ error)
        theta = theta - alpha * grad
    return theta

def predict(Xb, theta):
    return Xb @ theta

alphas = [0.01, 0.1, 0.5]
iters_list = [10, 50, 100]

rows = []
thetas = {}

for a in alphas:
    for it in iters_list:
        theta = gd_linear_regression(Xb_train, y_train, alpha=a, iters=it)
        yhat_train = predict(Xb_train, theta)
        yhat_test  = predict(Xb_test, theta)

        rows.append([
            a, it,
            mean_squared_error(y_train, yhat_train),
            r2_score(y_train, yhat_train),
            mean_squared_error(y_test, yhat_test),
            r2_score(y_test, yhat_test),
            np.linalg.norm(theta)
        ])
        thetas[(a,it)] = theta

results = pd.DataFrame(rows, columns=["alpha","iters","Train MSE","Train R2","Test MSE","Test R2","||theta||"])
print(results)

# Example
print("\nTheta for alpha=0.1, iters=100:\n", thetas[(0.1,100)])


   alpha  iters      Train MSE       Train R2       Test MSE        Test R2  \
0   0.01     10   2.357311e+05  -1.047393e+00   2.828668e+05  -6.965872e-01   
1   0.01     50   6.969578e+04   3.946717e-01   9.432003e+04   4.342845e-01   
2   0.01    100   3.676495e+04   6.806857e-01   6.127904e+04   6.324587e-01   
3   0.10     10   3.504793e+04   6.955985e-01   6.000379e+04   6.401074e-01   
4   0.10     50   3.142706e+04   7.270468e-01   5.889054e+04   6.467845e-01   
5   0.10    100   3.141602e+04   7.271427e-01   5.883993e+04   6.470881e-01   
6   0.50     10   1.464434e+17  -1.271904e+12   1.632452e+17  -9.791172e+11   
7   0.50     50   1.293867e+67  -1.123761e+62   1.442316e+67  -8.650767e+61   
8   0.50    100  3.504812e+129 -3.044031e+124  3.906928e+129 -2.343309e+124   

      ||theta||  
0  1.221860e+02  
1  3.633916e+02  
2  4.823524e+02  
3  4.953658e+02  
4  5.527760e+02  
5  5.533903e+02  
6  1.676476e+08  
7  1.575823e+33  
8  2.593552e+64  

Theta for alpha=0.1, iters=1

In [16]:

def preprocess(df):
    df = df.copy()
    y = (df["price"]/1000.0).to_numpy()
    X = df.drop(columns=["price"] + DROP_COLS, errors="ignore")
    return X, y

X_train_raw, y_train = preprocess(train_df)
X_test_raw,  y_test  = preprocess(test_df)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

def add_intercept(X):
    return np.hstack([np.ones((X.shape[0],1)), X])

Xb_train = add_intercept(X_train)
Xb_test  = add_intercept(X_test)

def ridge_gd(Xb, y, alpha=0.1, iters=200, lam=1.0):
    N, d = Xb.shape
    theta = np.zeros(d)
    for _ in range(iters):
        err = Xb @ theta - y
        grad = (2.0/N) * (Xb.T @ err)

        reg = 2.0 * lam * theta
        reg[0] = 0.0   

        theta = theta - alpha * (grad + reg)
    return theta

def eval_model(Xb, y, theta):
    pred = Xb @ theta
    return mean_squared_error(y, pred), r2_score(y, pred)

lams = [0.0, 0.1, 1.0, 10.0, 100.0]
iters = 200

rows = []
for lam in lams:
    alpha = 0.1/(1+2*lam) 
    theta = ridge_gd(Xb_train, y_train, alpha=alpha, iters=iters, lam=lam)

    tr_mse, tr_r2 = eval_model(Xb_train, y_train, theta)
    te_mse, te_r2 = eval_model(Xb_test,  y_test,  theta)

    rows.append([lam, alpha, iters, tr_mse, tr_r2, te_mse, te_r2])

ridge_results = pd.DataFrame(rows, columns=["lambda","alpha_used","iters","Train MSE","Train R2","Test MSE","Test R2"])
print(ridge_results)


   lambda  alpha_used  iters      Train MSE  Train R2       Test MSE   Test R2
0     0.0    0.100000    200   31415.748191  0.727145   58834.860066  0.647118
1     0.1    0.083333    200   31658.736387  0.725035   59432.008064  0.643537
2     1.0    0.033333    200   38464.838480  0.665922   69591.805851  0.582600
3    10.0    0.004762    200   84245.358654  0.268304  130185.316090  0.219171
4   100.0    0.000498    200  291177.836202 -1.528965  359654.885496 -1.157149
