Import and load data sets

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Relative paths: resolved against the kernel's current working directory.
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"

# Read both splits with pandas' default CSV settings.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,ID,Gender,Age,Occupation,Sleep_Duration,Sleep_Quality,Activity_Level,Stress_Level,BMI_Category,Blood_Pressure,Heart_Rate,Daily_Steps,Sleep_Disorder
0,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
1,193,Male,43,Salesperson,6.5,6,45,7,Overweight,130/85,72,6000,Insomnia
2,76,Male,33,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
3,85,Male,35,Software Engineer,7.5,8,60,5,Normal Weight,120/80,70,8000,
4,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


Details of the data set

In [8]:
# Column names of the training split, as a plain Python list.
list(train_df.columns)

['ID',
 'Gender',
 'Age',
 'Occupation',
 'Sleep_Duration',
 'Sleep_Quality',
 'Activity_Level',
 'Stress_Level',
 'BMI_Category',
 'Blood_Pressure',
 'Heart_Rate',
 'Daily_Steps',
 'Sleep_Disorder']

Problem 2(a)

In [9]:
def preprocess_drop_non_numeric_and_id(df: pd.DataFrame, target_col: str):
  """Return a reduced copy of ``df`` together with the names of the dropped columns.

  The "ID" column (when present) and every non-numeric column other than
  ``target_col`` are removed. ``target_col`` is kept whatever its dtype is.
  The input frame itself is not modified.

  Returns:
    (frame, removed): the reduced DataFrame and the list of dropped column
    names -- "ID" first (if it existed), then the non-numeric columns in
    their original order.
  """
  id_cols = ["ID"] if "ID" in df.columns else []
  without_id = df.drop(columns=id_cols)

  # Only feature columns are candidates for removal; the target is always kept.
  text_cols = []
  for name in without_id.columns:
    is_feature = name != target_col
    if is_feature and not pd.api.types.is_numeric_dtype(without_id[name]):
      text_cols.append(name)

  return without_id.drop(columns=text_cols), id_cols + text_cols

TARGET_COL = "Sleep_Quality"

# Apply the same preprocessing to both splits. Only the training call's list of
# removed columns is kept for display; the test call's list is discarded into `_`.
(train_num, removed_cols), (test_num, _) = (
  preprocess_drop_non_numeric_and_id(frame, TARGET_COL) for frame in (train_df, test_df)
)

removed_cols

['ID',
 'Gender',
 'Occupation',
 'BMI_Category',
 'Blood_Pressure',
 'Sleep_Disorder']

2(b)

In [10]:
def split_X_y(df: pd.DataFrame, target_col: str):
  """Separate a frame into a float feature matrix and a float target vector.

  Returns:
    (X, y): ``X`` holds every column except ``target_col`` (original column
    order), ``y`` holds ``target_col``; both are NumPy arrays of dtype float.
  """
  features = df.copy()
  # `pop` removes the target from the working copy and hands it back as a Series.
  target = features.pop(target_col)
  return features.to_numpy(dtype=float), target.to_numpy(dtype=float)

X_train, y_train = split_X_y(train_num, TARGET_COL)
X_test, y_test = split_X_y(test_num, TARGET_COL)

# Ordinary least squares on the numeric features only.
lr = LinearRegression().fit(X_train, y_train)
train_pred = lr.predict(X_train)

# Label each coefficient with its feature name (same column order as X_train).
intercept = float(lr.intercept_)
coefs = pd.Series(lr.coef_, index=train_num.columns.drop(TARGET_COL))

train_mse, train_r2 = mean_squared_error(y_train, train_pred), r2_score(y_train, train_pred)

intercept, coefs, train_mse, train_r2

(4.614076015572389,
 Age               0.013840
 Sleep_Duration    0.663838
 Activity_Level   -0.000594
 Stress_Level     -0.322086
 Heart_Rate       -0.021018
 Daily_Steps       0.000092
 dtype: float64,
 0.13100264319632637,
 0.9064110852787419)

2(c)

In [11]:
# Evaluate the model fitted in 2(b) on the held-out split.
test_pred = lr.predict(X_test)
test_mse, test_r2 = mean_squared_error(y_test, test_pred), r2_score(y_test, test_pred)

test_mse, test_r2

(0.12919025461695258, 0.9127041171912103)

2(d)

In [18]:
CAT_COLS = ["Gender", "Occupation"]

def add_two_categoricals_onehot(train_raw: pd.DataFrame, test_raw: pd.DataFrame, target_col: str, cat_cols: list[str]):
  """Build train/test frames of numeric features plus one-hot encoded categoricals.

  Args:
    train_raw, test_raw: raw splits; both must contain ``cat_cols`` and ``target_col``.
    target_col: name of the target column; it is placed last in both outputs.
    cat_cols: categorical columns to one-hot encode (values are cast to ``str`` first).

  Returns:
    (train_frame, test_frame) with identical columns, ordered as
    numeric features, one-hot columns, target.
  """
  # Encode each split directly from the raw frame: only `cat_cols` are read, so no
  # intermediate copy (or "ID" drop) is needed for this step.
  tr_cat = pd.get_dummies(train_raw[cat_cols].astype(str), drop_first=False)
  te_cat = pd.get_dummies(test_raw[cat_cols].astype(str), drop_first=False)

  # Outer alignment: a category seen in only one split becomes an all-zero column
  # in the other, so both design matrices end up with the same width and order.
  tr_cat, te_cat = tr_cat.align(te_cat, join="outer", axis=1, fill_value=0)

  # Numeric part: same preprocessing as the baseline model (drops "ID" and text columns).
  tr_num, _ = preprocess_drop_non_numeric_and_id(train_raw, target_col)
  te_num, _ = preprocess_drop_non_numeric_and_id(test_raw, target_col)

  tr_final = pd.concat([tr_num.drop(columns=[target_col]), tr_cat, tr_num[[target_col]]], axis=1)
  te_final = pd.concat([te_num.drop(columns=[target_col]), te_cat, te_num[[target_col]]], axis=1)

  return tr_final, te_final

train_enc, test_enc = add_two_categoricals_onehot(train_df, test_df, TARGET_COL, CAT_COLS)

X_train2, y_train2 = split_X_y(train_enc, TARGET_COL)
X_test2, y_test2 = split_X_y(test_enc, TARGET_COL)

# Same estimator as 2(b), now with the one-hot columns included.
lr2 = LinearRegression().fit(X_train2, y_train2)
pred_train2, pred_test2 = lr2.predict(X_train2), lr2.predict(X_test2)

train_mse2, train_r2_2 = mean_squared_error(y_train2, pred_train2), r2_score(y_train2, pred_train2)
test_mse2, test_r2_2 = mean_squared_error(y_test2, pred_test2), r2_score(y_test2, pred_test2)

train_mse2, train_r2_2, test_mse2, test_r2_2


(0.06497758618036169,
 0.9535797017262199,
 0.07438596173481302,
 0.9497362380972562)

2(e)

In [19]:
# Coefficients of the 2(d) model, labelled by feature name.
coefs2 = pd.Series(lr2.coef_, index=train_enc.columns.drop(TARGET_COL))
# Ten largest coefficients by absolute value.
coefs2.abs().sort_values(ascending=False).iloc[:10]

Occupation_Sales Representative    0.975234
Occupation_Accountant              0.618205
Occupation_Software Engineer       0.575148
Occupation_Scientist               0.511589
Occupation_Lawyer                  0.426314
Occupation_Salesperson             0.395816
Stress_Level                       0.391997
Sleep_Duration                     0.357944
Occupation_Teacher                 0.313042
Occupation_Doctor                  0.238636
dtype: float64

Problem 3(a)

In [20]:
def fit_closed_form(X: np.ndarray, y: np.ndarray, add_intercept: bool = True):
  """Fit linear regression by solving the least-squares problem directly.

  The returned weights minimise ``||Xb @ w - y||^2``, i.e. they satisfy the normal
  equations ``Xb.T @ Xb @ w = Xb.T @ y`` (minimum-norm solution when ``Xb`` is
  rank-deficient). The system is solved on ``Xb`` itself rather than on
  ``Xb.T @ Xb``: forming the Gram matrix squares the condition number, which loses
  precision for badly scaled features such as raw polynomial powers.

  Args:
    X: feature matrix of shape (n, d); a 1-D array is treated as a single feature.
    y: targets with n entries (any shape that flattens to length n).
    add_intercept: when True, a leading column of ones is added and the first
      weight is the intercept.

  Returns:
    Weight column vector of shape (d + 1, 1), or (d, 1) without intercept.

  Raises:
    ValueError: if X and y do not have the same number of rows.
  """
  X = np.asarray(X, dtype=float)
  if X.ndim == 1:
    # A single feature passed as a flat vector.
    X = X.reshape(-1, 1)
  y = np.asarray(y, dtype=float).reshape(-1, 1)
  if X.shape[0] != y.shape[0]:
    raise ValueError(f"X has {X.shape[0]} rows but y has {y.shape[0]} entries")

  if add_intercept:
    Xb = np.column_stack([np.ones((X.shape[0], 1)), X])
  else:
    Xb = X

  # SVD-based least squares; the other return values (residuals, rank, singular values) are unused.
  w, *_ = np.linalg.lstsq(Xb, y, rcond=None)
  return w

def predict_closed_form(X: np.ndarray, w: np.ndarray, add_intercept: bool = True):
  """Evaluate a linear model on ``X`` and return the predictions as a flat array.

  Args:
    X: feature matrix, converted to a float array.
    w: weight vector; its first entry is the intercept when ``add_intercept`` is True.
    add_intercept: prepend a column of ones to ``X`` before multiplying by ``w``.
  """
  features = np.asarray(X, dtype=float)
  if add_intercept:
    bias = np.ones((features.shape[0], 1))
    features = np.column_stack([bias, features])
  return (features @ w).reshape(-1)

# Fit on the numeric training features, then predict on both splits with the same weights.
w_cf = fit_closed_form(X_train, y_train, add_intercept=True)
pred_train_cf, pred_test_cf = (
  predict_closed_form(split, w_cf, add_intercept=True) for split in (X_train, X_test)
)

# First five weights (intercept first), flattened for display.
w_cf[:5].reshape(-1)

array([ 4.61407601e+00,  1.38404261e-02,  6.63838357e-01, -5.94043438e-04,
       -3.22086472e-01])

In [21]:
# Train/test metrics for the closed-form model, using the same sklearn metric functions as Problem 2.
train_mse_cf, train_r2_cf = mean_squared_error(y_train, pred_train_cf), r2_score(y_train, pred_train_cf)
test_mse_cf, test_r2_cf = mean_squared_error(y_test, pred_test_cf), r2_score(y_test, pred_test_cf)

train_mse_cf, train_r2_cf, test_mse_cf, test_r2_cf

(0.13100264319632648,
 0.9064110852787418,
 0.12919025463765643,
 0.9127041171772203)

3(b)

In [22]:
# One row per model, so the sklearn and closed-form metrics can be read side by side.
compare_df = pd.DataFrame(
  [
    ("sklearn LinearRegression", train_mse, train_r2, test_mse, test_r2),
    ("closed-form", train_mse_cf, train_r2_cf, test_mse_cf, test_r2_cf),
  ],
  columns=["Model", "Train MSE", "Train R2", "Test MSE", "Test R2"],
)
compare_df

Unnamed: 0,Model,Train MSE,Train R2,Test MSE,Test R2
0,sklearn LinearRegression,0.131003,0.906411,0.12919,0.912704
1,closed-form,0.131003,0.906411,0.12919,0.912704


Problem 4(a)

In [24]:
def build_poly_features(x: np.ndarray, degree: int):
  """Expand a single feature into the columns ``[x, x^2, ..., x^degree]``.

  ``x`` is flattened into one column first, so the result has shape
  (len(x), degree). No constant (bias) column is included.
  """
  column = np.asarray(x, dtype=float).reshape(-1, 1)
  powers = []
  for exponent in range(1, degree + 1):
    powers.append(column ** exponent)
  return np.concatenate(powers, axis=1)

def fit_poly_closed_form(x: np.ndarray, y: np.ndarray, degree: int):
  """Fit a degree-``degree`` polynomial in ``x`` with the closed-form solver.

  Returns the weights produced by ``fit_closed_form`` on the polynomial
  features of ``x``, with an intercept added.
  """
  return fit_closed_form(build_poly_features(x, degree), y, add_intercept=True)

def predict_poly_closed_form(x: np.ndarray, w: np.ndarray, degree: int):
  """Predict with polynomial weights ``w``.

  ``degree`` must be the degree used when ``w`` was fitted so that the
  feature columns line up with the weights.
  """
  return predict_closed_form(build_poly_features(x, degree), w, add_intercept=True)

4(b)

In [25]:
X_COL = "Sleep_Duration"

# Fail early with a readable message if the predictor column is missing.
if X_COL not in train_df.columns:
  raise KeyError(f"Column not found: {X_COL}. Available columns: {list(train_df.columns)}")

# Single predictor and target, as float vectors, for each split.
x_train_sd, y_train_sd = (train_df[col].to_numpy(dtype=float) for col in (X_COL, TARGET_COL))
x_test_sd, y_test_sd = (test_df[col].to_numpy(dtype=float) for col in (X_COL, TARGET_COL))

degrees = [1, 2, 3, 5]

# One fit per degree; each result row holds the train and test metrics for that degree.
rows = []
for p in degrees:
  w_p = fit_poly_closed_form(x_train_sd, y_train_sd, degree=p)
  pred_tr = predict_poly_closed_form(x_train_sd, w_p, degree=p)
  pred_te = predict_poly_closed_form(x_test_sd, w_p, degree=p)
  rows.append({
    "p": p,
    "Train MSE": mean_squared_error(y_train_sd, pred_tr),
    "Train R2": r2_score(y_train_sd, pred_tr),
    "Test MSE": mean_squared_error(y_test_sd, pred_te),
    "Test R2": r2_score(y_test_sd, pred_te),
  })

poly_results = pd.DataFrame(rows).sort_values(by="p")
poly_results

Unnamed: 0,p,Train MSE,Train R2,Test MSE,Test R2
0,1,0.304066,0.782774,0.357803,0.758227
1,2,0.299796,0.785824,0.351232,0.762667
2,3,0.29225,0.791215,0.331388,0.776076
3,5,0.290183,0.792692,0.327831,0.778479


4(c)

In [26]:
# Every degree from 1 to 5, MSE only, to follow the error trend across degrees.
degrees_full = [1, 2, 3, 4, 5]
rows_full = []
for p in degrees_full:
  w_p = fit_poly_closed_form(x_train_sd, y_train_sd, degree=p)
  pred_tr = predict_poly_closed_form(x_train_sd, w_p, degree=p)
  pred_te = predict_poly_closed_form(x_test_sd, w_p, degree=p)
  rows_full.append({
    "p": p,
    "Train MSE": mean_squared_error(y_train_sd, pred_tr),
    "Test MSE": mean_squared_error(y_test_sd, pred_te),
  })

mse_by_degree = pd.DataFrame(rows_full).sort_values(by="p")
mse_by_degree


Unnamed: 0,p,Train MSE,Test MSE
0,1,0.304066,0.357803
1,2,0.299796,0.351232
2,3,0.29225,0.331388
3,4,0.291129,0.329395
4,5,0.290183,0.327831


Problem 5 helpers

In [7]:
# Problem 5 reuses `preprocess_drop_non_numeric_and_id`, `split_X_y` and `TARGET_COL`
# exactly as defined for Problem 2; they are deliberately NOT redefined here.
# A second `preprocess_drop_non_numeric_and_id` that returns only the DataFrame would
# shadow the (frame, removed_columns) version, and `add_two_categoricals_onehot`
# unpacks that tuple -- re-running the 2(d) cell after such a redefinition breaks.
train_num, _ = preprocess_drop_non_numeric_and_id(train_df, TARGET_COL)
test_num, _ = preprocess_drop_non_numeric_and_id(test_df, TARGET_COL)

X_train, y_train = split_X_y(train_num, TARGET_COL)
X_test, y_test = split_X_y(test_num, TARGET_COL)

# Explicit sanity check (a bare `X_train.shape, X_test.shape` in the middle of a
# cell is evaluated but never displayed).
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

def add_intercept(X: np.ndarray):
  """Return ``X`` as a float array with a leading column of ones (the bias term)."""
  features = np.asarray(X, dtype=float)
  ones = np.ones((features.shape[0], 1))
  return np.column_stack((ones, features))

def fit_standardizer(X: np.ndarray):
  """Compute per-column mean and standard deviation for standardization.

  The standard deviation is NumPy's default (population, ddof=0). Columns whose
  deviation is exactly zero get 1.0 instead, so a later division leaves them finite.

  Returns:
    (mu, sigma): arrays with one entry per column of ``X``.
  """
  data = np.asarray(X, dtype=float)
  col_mean = data.mean(axis=0)
  col_std = data.std(axis=0)
  safe_std = np.where(col_std == 0, 1.0, col_std)
  return col_mean, safe_std

def transform_standardizer(X: np.ndarray, mu: np.ndarray, sigma: np.ndarray):
  """Standardize ``X`` with the given statistics: ``(X - mu) / sigma``, broadcast by NumPy."""
  centered = np.asarray(X, dtype=float) - mu
  return centered / sigma

def predict_linear(Xb: np.ndarray, w: np.ndarray):
  """Return ``Xb @ w`` flattened to 1-D; ``Xb`` must already include the intercept column."""
  product = Xb @ w
  return product.reshape(-1)

def mse(y_true: np.ndarray, y_pred: np.ndarray):
  """Mean squared error between two equally sized inputs, returned as a Python float.

  Both inputs are flattened first, so column vectors and flat vectors can be mixed.
  """
  truth = np.asarray(y_true, dtype=float).reshape(-1)
  guess = np.asarray(y_pred, dtype=float).reshape(-1)
  errors = truth - guess
  return float(np.mean(errors ** 2))

def r2(y_true: np.ndarray, y_pred: np.ndarray):
  """Coefficient of determination: ``1 - SS_res / SS_tot``, as a Python float.

  Both inputs are flattened before comparison.

  When ``y_true`` is constant (``SS_tot == 0``) the ratio is undefined. In that case
  the function returns 1.0 for perfect predictions and 0.0 otherwise, the same
  finite values scikit-learn's ``r2_score`` reports by default; this keeps it
  comparable with the sklearn metrics used earlier in the notebook. Empty input
  returns 0.0.
  """
  y_true = np.asarray(y_true, dtype=float).reshape(-1)
  y_pred = np.asarray(y_pred, dtype=float).reshape(-1)
  if y_true.size == 0:
    # Nothing to score; also avoids taking the mean of an empty array below.
    return 0.0
  ss_res = float(np.sum((y_true - y_pred) ** 2))
  ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
  if ss_tot == 0.0:
    # Constant target: a perfect fit scores 1.0, anything else 0.0.
    return 1.0 if ss_res == 0.0 else 0.0
  return 1.0 - ss_res / ss_tot

def gradient_descent_linear_regression(Xb: np.ndarray, y: np.ndarray, alpha: float, num_iters: int, w0=None):
  """Minimise mean squared error with fixed-step batch gradient descent.

  Args:
    Xb: design matrix of shape (n, d); no intercept column is added here.
    y: targets with n entries.
    alpha: learning rate (step size).
    num_iters: number of update steps to run.
    w0: optional starting weights with d entries; zeros when omitted.

  Returns:
    (w, history): final weights of shape (d, 1) and a list with one MSE per
    iteration. Each entry is measured BEFORE that iteration's update, so
    ``history[-1]`` belongs to the weights one step before the returned ``w``.
  """
  design = np.asarray(Xb, dtype=float)
  target = np.asarray(y, dtype=float).reshape(-1, 1)
  n_rows, n_cols = design.shape
  if w0 is None:
    w = np.zeros((n_cols, 1))
  else:
    w = np.asarray(w0, dtype=float).reshape(n_cols, 1)

  history = []
  for _ in range(num_iters):
    residual = design @ w - target
    history.append(float(np.mean(residual ** 2)))
    # Gradient of mean((Xb w - y)^2) with respect to w.
    grad = (2.0 / n_rows) * (design.T @ residual)
    w = w - alpha * grad
  return w, history

Problem 5(a)

In [9]:
alpha_try, iters_try = 0.1, 100

# Gradient descent on the raw (unscaled) features, to check whether it stays finite.
Xb_train_raw = add_intercept(X_train)
w_raw, hist_raw = gradient_descent_linear_regression(Xb_train_raw, y_train, alpha_try, iters_try)

# Flag the run as broken if the last recorded loss or any weight is NaN/inf.
raw_last = hist_raw[-1]
raw_has_nan = (not np.isfinite(raw_last)) or (np.any(~np.isfinite(w_raw)))
raw_last, raw_has_nan

  history.append(float(np.mean((yhat - y) ** 2)))
  yhat = Xb @ w
  w = w - alpha * grad


(nan, True)

In [10]:
# Standardize with statistics from the training split only, then reuse them for test.
mu, sigma = fit_standardizer(X_train)
X_train_norm, X_test_norm = (transform_standardizer(split, mu, sigma) for split in (X_train, X_test))

Xb_train_norm, Xb_test_norm = add_intercept(X_train_norm), add_intercept(X_test_norm)

# Same learning rate and iteration count as the raw-feature run above.
w_norm, hist_norm = gradient_descent_linear_regression(Xb_train_norm, y_train, alpha_try, iters_try)

train_pred_norm = predict_linear(Xb_train_norm, w_norm)
test_pred_norm = predict_linear(Xb_test_norm, w_norm)

(
  hist_norm[-1],
  mse(y_train, train_pred_norm),
  r2(y_train, train_pred_norm),
  mse(y_test, test_pred_norm),
  r2(y_test, test_pred_norm),
)

(0.13130580569648728,
 0.13129585157531815,
 0.9062016158107412,
 0.12829102622635433,
 0.9133117399289836)

Problem 5(b)

In [11]:
alphas = [0.01, 0.1, 0.5]
iters_list = [10, 50, 100]

# Grid over learning rate x iteration count; every run starts from zero weights
# on the standardized design matrices.
rows = []
for a in alphas:
  for iters in iters_list:
    w, hist = gradient_descent_linear_regression(Xb_train_norm, y_train, a, iters)
    yhat_tr = predict_linear(Xb_train_norm, w)
    yhat_te = predict_linear(Xb_test_norm, w)
    rows.append(
      {
        "alpha": a,
        "iterations": iters,
        "Train MSE": mse(y_train, yhat_tr),
        "Train R2": r2(y_train, yhat_tr),
        "Test MSE": mse(y_test, yhat_te),
        "Test R2": r2(y_test, yhat_te),
        # Loss recorded during the final iteration, i.e. before its weight update.
        "Train MSE (last step)": float(hist[-1]),
        # True when neither the last loss nor any weight is NaN/inf.
        "Finite": bool(np.isfinite(hist[-1]) and np.all(np.isfinite(w))),
      }
    )

gd_results = pd.DataFrame(rows).sort_values(by=["alpha", "iterations"])
gd_results

Unnamed: 0,alpha,iterations,Train MSE,Train R2,Test MSE,Test R2,Train MSE (last step),Finite
0,0.01,10,36.86246,-25.33472,34.77731,-22.49957,38.40583,True
1,0.01,50,7.357906,-4.256523,7.201585,-3.866224,7.655951,True
2,0.01,100,1.095132,0.2176324,1.072739,0.2751334,1.134612,True
3,0.1,10,0.7656346,0.453027,0.7475378,0.494877,1.118761,True
4,0.1,50,0.1325647,0.9052952,0.1276342,0.9137556,0.132618,True
5,0.1,100,0.1312959,0.9062016,0.128291,0.9133117,0.1313058,True
6,0.5,10,8683.347,-6202.425,9328.417,-6302.358,3560.724,True
7,0.5,50,2.660853e+19,-1.900926e+19,2.858224e+19,-1.931347e+19,1.091094e+19,True
8,0.5,100,6.0666120000000005e+38,-4.334017e+38,6.516609e+38,-4.403375e+38,2.4876410000000003e+38,True


Problem 5(c)

In [12]:
# Row with the smallest test MSE, as a plain dict.
# NOTE(review): the configuration is selected on the test split, so its test score is
# an optimistic estimate -- confirm this is what 5(c) asks for.
best_by_test_mse = gd_results.loc[gd_results["Test MSE"].idxmin(), :].to_dict()
best_by_test_mse

{'alpha': 0.1,
 'iterations': 50,
 'Train MSE': 0.13256465940033338,
 'Train R2': 0.9052951734334294,
 'Test MSE': 0.12763418280729785,
 'Test R2': 0.9137555793370233,
 'Train MSE (last step)': 0.13261804262775762,
 'Finite': True}

In [13]:
# Same grid results, reduced to the columns needed to compare trends across settings.
trend = gd_results.loc[:, ["alpha", "iterations", "Train MSE", "Test MSE", "Train R2", "Test R2", "Finite"]]
trend

Unnamed: 0,alpha,iterations,Train MSE,Test MSE,Train R2,Test R2,Finite
0,0.01,10,36.86246,34.77731,-25.33472,-22.49957,True
1,0.01,50,7.357906,7.201585,-4.256523,-3.866224,True
2,0.01,100,1.095132,1.072739,0.2176324,0.2751334,True
3,0.1,10,0.7656346,0.7475378,0.453027,0.494877,True
4,0.1,50,0.1325647,0.1276342,0.9052952,0.9137556,True
5,0.1,100,0.1312959,0.128291,0.9062016,0.9133117,True
6,0.5,10,8683.347,9328.417,-6202.425,-6302.358,True
7,0.5,50,2.660853e+19,2.858224e+19,-1.900926e+19,-1.931347e+19,True
8,0.5,100,6.0666120000000005e+38,6.516609e+38,-4.334017e+38,-4.403375e+38,True
