Import libraries and load the data sets

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Locations of the two CSV splits, given relative to the working directory.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"

# Read both splits in one statement so they are always loaded together.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,ID,Gender,Age,Occupation,Sleep_Duration,Sleep_Quality,Activity_Level,Stress_Level,BMI_Category,Blood_Pressure,Heart_Rate,Daily_Steps,Sleep_Disorder
0,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
1,193,Male,43,Salesperson,6.5,6,45,7,Overweight,130/85,72,6000,Insomnia
2,76,Male,33,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
3,85,Male,35,Software Engineer,7.5,8,60,5,Normal Weight,120/80,70,8000,
4,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


Details of the data set

In [8]:
# Column names of the training split as a plain Python list.
list(train_df.columns)

['ID',
 'Gender',
 'Age',
 'Occupation',
 'Sleep_Duration',
 'Sleep_Quality',
 'Activity_Level',
 'Stress_Level',
 'BMI_Category',
 'Blood_Pressure',
 'Heart_Rate',
 'Daily_Steps',
 'Sleep_Disorder']

Problem 2(a)

In [9]:
def preprocess_drop_non_numeric_and_id(df: pd.DataFrame, target_col: str):
  """Return ``df`` without its ``ID`` column and without non-numeric feature columns.

  The input frame is not modified. ``target_col`` is never dropped for being
  non-numeric (it is only removed if it is itself named ``"ID"``).

  Args:
    df: Raw data frame.
    target_col: Name of the column exempt from the numeric-dtype filter.

  Returns:
    A ``(filtered_frame, removed)`` tuple. ``removed`` lists the dropped column
    names: ``"ID"`` first (when present), then the non-numeric columns in their
    original column order.
  """
  id_cols = ["ID"] if "ID" in df.columns else []

  # Every remaining column that is neither the target nor numeric is dropped.
  text_cols = [
    name
    for name in df.columns
    if name not in id_cols
    and name != target_col
    and not pd.api.types.is_numeric_dtype(df[name])
  ]

  dropped = id_cols + text_cols
  return df.drop(columns=dropped), dropped

# Column that every regression model in this notebook predicts.
TARGET_COL = "Sleep_Quality"

# Apply the same numeric-only filter to both splits; only the list of columns
# dropped from the training split is kept for display.
train_num, removed_cols = preprocess_drop_non_numeric_and_id(df=train_df, target_col=TARGET_COL)
test_num, _ = preprocess_drop_non_numeric_and_id(df=test_df, target_col=TARGET_COL)

removed_cols

['ID',
 'Gender',
 'Occupation',
 'BMI_Category',
 'Blood_Pressure',
 'Sleep_Disorder']

2(b)

In [10]:
def split_X_y(df: pd.DataFrame, target_col: str):
  """Separate a frame into a float feature matrix and a float target vector.

  Args:
    df: Frame holding the features and the target column.
    target_col: Name of the column to use as the target.

  Returns:
    ``(X, y)`` where ``X`` holds every column except ``target_col`` (in the
    frame's column order) and ``y`` holds ``target_col``, both as float arrays.
  """
  target = df[target_col].to_numpy(dtype=float)
  features = df.drop(columns=[target_col]).to_numpy(dtype=float)
  return features, target

# Build the design matrices. split_X_y returns positional arrays, so the test
# split is first put into the training column order; a differently ordered test
# file would otherwise silently pair each coefficient with the wrong feature
# (and a missing column now fails loudly with a KeyError).
X_train, y_train = split_X_y(train_num, TARGET_COL)
X_test, y_test = split_X_y(test_num[train_num.columns], TARGET_COL)

# Ordinary least squares on the numeric-only features.
lr = LinearRegression()
lr.fit(X_train, y_train)

train_pred = lr.predict(X_train)

# Label each fitted weight with the feature it belongs to.
intercept = float(lr.intercept_)
coefs = pd.Series(lr.coef_, index=train_num.drop(columns=[TARGET_COL]).columns)

# In-sample fit quality.
train_mse = mean_squared_error(y_train, train_pred)
train_r2 = r2_score(y_train, train_pred)

intercept, coefs, train_mse, train_r2

(4.614076015572389,
 Age               0.013840
 Sleep_Duration    0.663838
 Activity_Level   -0.000594
 Stress_Level     -0.322086
 Heart_Rate       -0.021018
 Daily_Steps       0.000092
 dtype: float64,
 0.13100264319632637,
 0.9064110852787419)

2(c)

In [11]:
# Score the numeric-only model on the held-out split.
test_pred = lr.predict(X_test)
test_mse, test_r2 = mean_squared_error(y_test, test_pred), r2_score(y_test, test_pred)

test_mse, test_r2

(0.12919025461695258, 0.9127041171912103)

2(d)

In [18]:
# Categorical columns that get one-hot encoded and added to the numeric features.
CAT_COLS = ["Gender", "Occupation"]

def add_two_categoricals_onehot(train_raw: pd.DataFrame, test_raw: pd.DataFrame, target_col: str, cat_cols: list[str]):
  """One-hot encode ``cat_cols`` and append them to the numeric features of both splits.

  Despite the name, any number of categorical columns may be passed. The
  indicator columns are defined by the training split only: a level that occurs
  only in the test split gets no column, and a training level absent from the
  test split becomes an all-zero test column. Both returned frames therefore
  share the same columns in the same order. No level is dropped
  (``drop_first=False``), so each variable's indicators sum to one per row.

  Args:
    train_raw: Raw training frame (categorical columns still present).
    test_raw: Raw test frame with the same layout.
    target_col: Name of the target column; it is placed last in both outputs.
    cat_cols: Names of the categorical columns to encode.

  Returns:
    ``(train_frame, test_frame)`` laid out as numeric features, then indicator
    columns, then the target column.
  """
  cat_cols = list(cat_cols)

  # Values are cast to str before encoding, so each distinct string becomes a level.
  tr_cat = pd.get_dummies(train_raw[cat_cols].astype(str), drop_first=False)
  te_cat = pd.get_dummies(test_raw[cat_cols].astype(str), drop_first=False)

  # Fix the schema from the training data alone so the test split cannot add
  # columns to the training matrix.
  te_cat = te_cat.reindex(columns=tr_cat.columns, fill_value=0)

  def assemble(raw: pd.DataFrame, indicators: pd.DataFrame) -> pd.DataFrame:
    # Numeric features first, indicator columns next, target last.
    numeric, _ = preprocess_drop_non_numeric_and_id(raw, target_col)
    return pd.concat([numeric.drop(columns=[target_col]), indicators, numeric[[target_col]]], axis=1)

  return assemble(train_raw, tr_cat), assemble(test_raw, te_cat)

# Numeric features plus the one-hot encoded categorical columns.
train_enc, test_enc = add_two_categoricals_onehot(train_df, test_df, TARGET_COL, CAT_COLS)

X_train2, y_train2 = split_X_y(train_enc, TARGET_COL)
X_test2, y_test2 = split_X_y(test_enc, TARGET_COL)

# fit() returns the estimator itself, so construction and fitting can be chained.
lr2 = LinearRegression().fit(X_train2, y_train2)

pred_train2, pred_test2 = lr2.predict(X_train2), lr2.predict(X_test2)

# Training-split metrics.
train_mse2, train_r2_2 = mean_squared_error(y_train2, pred_train2), r2_score(y_train2, pred_train2)

# Test-split metrics.
test_mse2, test_r2_2 = mean_squared_error(y_test2, pred_test2), r2_score(y_test2, pred_test2)

train_mse2, train_r2_2, test_mse2, test_r2_2


(0.06497758618036169,
 0.9535797017262199,
 0.07438596173481302,
 0.9497362380972562)

2(e)

In [19]:
# Pair each weight of the one-hot model with its feature name.
coefs2 = pd.Series(
  data=lr2.coef_,
  index=train_enc.columns.drop(TARGET_COL),
)

# Ten largest weights by absolute value. The features are not rescaled, so this
# compares 0/1 indicator columns with numeric columns on their raw scales.
coefs2.abs().sort_values(ascending=False).iloc[:10]

Occupation_Sales Representative    0.975234
Occupation_Accountant              0.618205
Occupation_Software Engineer       0.575148
Occupation_Scientist               0.511589
Occupation_Lawyer                  0.426314
Occupation_Salesperson             0.395816
Stress_Level                       0.391997
Sleep_Duration                     0.357944
Occupation_Teacher                 0.313042
Occupation_Doctor                  0.238636
dtype: float64

Problem 3(a)

In [20]:
def fit_closed_form(X: np.ndarray, y: np.ndarray, add_intercept: bool = True):
  """Fit ordinary least squares with the closed-form pseudo-inverse solution.

  Computes ``w = pinv(Xb) @ y``, the minimum-norm minimiser of
  ``||Xb @ w - y||^2``. This equals the normal-equation form
  ``pinv(Xb.T @ Xb) @ Xb.T @ y`` mathematically, but forming ``Xb.T @ Xb``
  squares the condition number of the problem. That loses accuracy on badly
  scaled inputs such as unscaled high-degree polynomial features.

  Args:
    X: Feature matrix of shape ``(n_samples, n_features)``; a 1-D array is
      treated as a single feature column.
    y: Targets with ``n_samples`` entries.
    add_intercept: When true, a leading column of ones is added so that the
      first weight is the intercept.

  Returns:
    Weight column vector of shape ``(n_features + 1, 1)`` with an intercept,
    or ``(n_features, 1)`` without one.

  Raises:
    ValueError: If ``X`` and ``y`` have different numbers of samples.
  """
  X = np.asarray(X, dtype=float)
  if X.ndim == 1:
    # A flat vector is one feature observed n times, not one n-feature sample.
    X = X.reshape(-1, 1)
  y = np.asarray(y, dtype=float).reshape(-1, 1)
  if X.shape[0] != y.shape[0]:
    raise ValueError(f"X has {X.shape[0]} samples but y has {y.shape[0]}")
  if add_intercept:
    Xb = np.column_stack([np.ones((X.shape[0], 1)), X])
  else:
    Xb = X
  # Pseudo-inverse of the design matrix itself; Xb.T @ Xb is never formed.
  w = np.linalg.pinv(Xb) @ y
  return w

def predict_closed_form(X: np.ndarray, w: np.ndarray, add_intercept: bool = True):
  """Predict targets from a feature matrix and a learned weight vector.

  Args:
    X: Feature matrix of shape ``(n_samples, n_features)``; a 1-D array is
      treated as a single feature column.
    w: Weights, with the intercept first when ``add_intercept`` is true.
    add_intercept: Must match the setting used when ``w`` was fitted; when
      true, a leading column of ones is added to ``X``.

  Returns:
    Flat array of predictions, one per row of ``X``.
  """
  X = np.asarray(X, dtype=float)
  if X.ndim == 1:
    # Treat a flat vector as one feature column so the no-intercept path
    # multiplies an (n, 1) matrix instead of failing on an (n,) vector.
    X = X.reshape(-1, 1)
  if add_intercept:
    Xb = np.column_stack([np.ones((X.shape[0], 1)), X])
  else:
    Xb = X
  yhat = Xb @ w
  return yhat.reshape(-1)

# Fit once on the numeric-only training matrix, then predict both splits
# with the same weights.
w_cf = fit_closed_form(X_train, y_train, add_intercept=True)
pred_train_cf, pred_test_cf = (
  predict_closed_form(features, w_cf, add_intercept=True) for features in (X_train, X_test)
)

# First five weights as a flat array; the leading entry is the intercept.
w_cf[:5].ravel()

array([ 4.61407601e+00,  1.38404261e-02,  6.63838357e-01, -5.94043438e-04,
       -3.22086472e-01])

In [21]:
# MSE and R^2 of the closed-form predictions on the training split ...
train_mse_cf, train_r2_cf = (
  metric(y_train, pred_train_cf) for metric in (mean_squared_error, r2_score)
)

# ... and on the test split.
test_mse_cf, test_r2_cf = (
  metric(y_test, pred_test_cf) for metric in (mean_squared_error, r2_score)
)

train_mse_cf, train_r2_cf, test_mse_cf, test_r2_cf

(0.13100264319632648,
 0.9064110852787418,
 0.12919025463765643,
 0.9127041171772203)

3(b)

In [22]:
# One row per model, so the scikit-learn and closed-form metrics sit side by side.
compare_df = pd.DataFrame(
  [
    ("sklearn LinearRegression", train_mse, train_r2, test_mse, test_r2),
    ("closed-form", train_mse_cf, train_r2_cf, test_mse_cf, test_r2_cf),
  ],
  columns=["Model", "Train MSE", "Train R2", "Test MSE", "Test R2"],
)
compare_df

Unnamed: 0,Model,Train MSE,Train R2,Test MSE,Test R2
0,sklearn LinearRegression,0.131003,0.906411,0.12919,0.912704
1,closed-form,0.131003,0.906411,0.12919,0.912704


Problem 4(a)

In [24]:
def build_poly_features(x: np.ndarray, degree: int):
  """Expand a single feature into its powers ``x, x^2, ..., x^degree``.

  Args:
    x: Values of the feature; flattened to one column before expansion.
    degree: Highest power to include.

  Returns:
    Array of shape ``(n_samples, degree)`` whose k-th column (1-based) is
    ``x ** k``. No constant column is included.
  """
  column = np.asarray(x, dtype=float).reshape(-1, 1)
  powers = []
  for exponent in range(1, degree + 1):
    powers.append(column ** exponent)
  # Place the power columns side by side.
  return np.concatenate(powers, axis=1)

def fit_poly_closed_form(x: np.ndarray, y: np.ndarray, degree: int):
  """Fit a degree-``degree`` polynomial in one feature with the closed-form solver.

  Args:
    x: Values of the single input feature.
    y: Target values.
    degree: Highest power of ``x`` used as a feature.

  Returns:
    The weights returned by ``fit_closed_form`` for the expanded features,
    fitted with an intercept.
  """
  return fit_closed_form(build_poly_features(x, degree), y, add_intercept=True)

def predict_poly_closed_form(x: np.ndarray, w: np.ndarray, degree: int):
  """Evaluate a fitted polynomial model on new values of the feature.

  Args:
    x: Values of the single input feature.
    w: Weights produced by ``fit_poly_closed_form`` for the same ``degree``.
    degree: Highest power of ``x`` used as a feature.

  Returns:
    The predictions returned by ``predict_closed_form`` for the expanded
    features, computed with an intercept.
  """
  return predict_closed_form(build_poly_features(x, degree), w, add_intercept=True)

4(b)

In [25]:
# Single input feature used for the polynomial regression.
X_COL = "Sleep_Duration"

if X_COL not in train_df.columns:
  raise KeyError(f"Column not found: {X_COL}. Available columns: {list(train_df.columns)}")

# Feature and target of each split as float arrays.
x_train_sd, y_train_sd = (train_df[col].to_numpy(dtype=float) for col in (X_COL, TARGET_COL))
x_test_sd, y_test_sd = (test_df[col].to_numpy(dtype=float) for col in (X_COL, TARGET_COL))

degrees = [1, 2, 3, 5]

# Fit one polynomial per degree on the training split and score both splits.
rows = []
for p in degrees:
  w_p = fit_poly_closed_form(x_train_sd, y_train_sd, degree=p)
  pred_tr = predict_poly_closed_form(x_train_sd, w_p, degree=p)
  pred_te = predict_poly_closed_form(x_test_sd, w_p, degree=p)

  row = {"p": p}
  row["Train MSE"] = mean_squared_error(y_train_sd, pred_tr)
  row["Train R2"] = r2_score(y_train_sd, pred_tr)
  row["Test MSE"] = mean_squared_error(y_test_sd, pred_te)
  row["Test R2"] = r2_score(y_test_sd, pred_te)
  rows.append(row)

poly_results = pd.DataFrame(rows).sort_values("p")
poly_results

Unnamed: 0,p,Train MSE,Train R2,Test MSE,Test R2
0,1,0.304066,0.782774,0.357803,0.758227
1,2,0.299796,0.785824,0.351232,0.762667
2,3,0.29225,0.791215,0.331388,0.776076
3,5,0.290183,0.792692,0.327831,0.778479


4(c)

In [26]:
# Every degree from 1 to 5, to show how train and test MSE change with model complexity.
degrees_full = [1, 2, 3, 4, 5]

rows_full = []
for p in degrees_full:
  w_p = fit_poly_closed_form(x_train_sd, y_train_sd, degree=p)
  pred_tr = predict_poly_closed_form(x_train_sd, w_p, degree=p)
  pred_te = predict_poly_closed_form(x_test_sd, w_p, degree=p)

  row = {"p": p}
  row["Train MSE"] = mean_squared_error(y_train_sd, pred_tr)
  row["Test MSE"] = mean_squared_error(y_test_sd, pred_te)
  rows_full.append(row)

mse_by_degree = pd.DataFrame(rows_full).sort_values("p")
mse_by_degree


Unnamed: 0,p,Train MSE,Test MSE
0,1,0.304066,0.357803
1,2,0.299796,0.351232
2,3,0.29225,0.331388
3,4,0.291129,0.329395
4,5,0.290183,0.327831
