# HW4 – Assignment 2

This notebook trains logistic regression models on the `smarket.csv` data set and focuses on the Lag1/Lag2-only specification required in Assignment 2.

In [None]:

from pathlib import Path
import csv
import math
from typing import List, Sequence, Tuple

# Location of the Smarket CSV, relative to the current working directory.
DATA_PATH = Path("smarket.csv")
# Predictor columns parsed from every CSV row: the five lags, then Volume.
FEATURE_NAMES = ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]

def load_smarket(path: Path) -> Tuple[List[dict], List[int], List[int]]:
    """Read the Smarket CSV at *path* into three parallel per-row lists.

    Returns a ``(rows, labels, years)`` tuple. For each CSV record:

    * ``rows`` holds a dict mapping every name in ``FEATURE_NAMES`` to the
      float value of that column;
    * ``labels`` holds 1 when the ``Direction`` column, stripped and
      lower-cased, equals ``"up"``, and 0 for any other value;
    * ``years`` holds the ``Year`` column converted to ``int``.
    """
    feature_dicts: List[dict] = []
    directions: List[int] = []
    year_column: List[int] = []
    with path.open(newline="") as handle:
        for record in csv.DictReader(handle):
            year_column.append(int(record["Year"]))
            feature_dicts.append({name: float(record[name]) for name in FEATURE_NAMES})
            went_up = record["Direction"].strip().lower() == "up"
            directions.append(int(went_up))
    return feature_dicts, directions, year_column

# Load the whole data set once at module level; split_by_year reads these
# three parallel lists (feature dicts, 0/1 labels, years) as globals.
rows, labels, years = load_smarket(DATA_PATH)

def split_by_year(selected_features: Sequence[str]):
    """Split the module-level data into a pre-2005 training set and a 2005 test set.

    For every loaded row, the values of *selected_features* are pulled out
    of the row's feature dict (in the order given) to form a feature vector.
    Rows whose year is below 2005 go to the training split, rows from 2005
    go to the test split, and rows from any later year are placed in neither.

    Returns ``(train_X, train_y, test_X, test_y)``.
    """
    train_X: List[List[float]] = []
    train_y: List[int] = []
    test_X: List[List[float]] = []
    test_y: List[int] = []
    for record, target, yr in zip(rows, labels, years):
        sample = [record[name] for name in selected_features]
        if yr == 2005:
            test_X.append(sample)
            test_y.append(target)
        elif yr < 2005:
            train_X.append(sample)
            train_y.append(target)
    return train_X, train_y, test_X, test_y

def compute_scaler(dataset: Sequence[Sequence[float]]):
    """Compute per-column means and population standard deviations.

    The number of columns is taken from the first row. The variance divides
    by the number of rows (not rows - 1). A column whose variance is not
    positive gets a standard deviation of 1.0 so that later division by it
    is safe.

    Returns ``(means, stds)``, two lists with one entry per column.
    """
    width = len(dataset[0])
    count = len(dataset)
    means: List[float] = []
    stds: List[float] = []
    for col in range(width):
        centre = sum(sample[col] for sample in dataset) / count
        variance = sum((sample[col] - centre) ** 2 for sample in dataset) / count
        if variance > 0:
            spread = variance ** 0.5
        else:
            # Constant column: fall back to unit scale.
            spread = 1.0
        means.append(centre)
        stds.append(spread)
    return means, stds

def apply_scaler(dataset: Sequence[Sequence[float]], means: Sequence[float], stds: Sequence[float]):
    """Standardize every row of *dataset* column by column.

    Each value becomes ``(value - means[i]) / stds[i]`` where ``i`` is the
    value's position in its row. Returns a new list of lists; the input is
    left untouched.
    """
    def standardize(sample: Sequence[float]) -> List[float]:
        return [(value - means[idx]) / stds[idx] for idx, value in enumerate(sample)]

    return [standardize(sample) for sample in dataset]

def add_bias(dataset: Sequence[Sequence[float]]):
    """Return a new list of rows, each prefixed with a constant 1.0 entry."""
    with_intercept = []
    for sample in dataset:
        with_intercept.append([1.0, *sample])
    return with_intercept

def sigmoid(z: float) -> float:
    """Return the logistic function ``1 / (1 + e**-z)`` of *z*.

    The formula is chosen by the sign of *z* so that ``math.exp`` is only
    ever called with a non-positive argument and therefore cannot overflow.
    """
    if z < 0:
        # Equivalent form e**z / (1 + e**z); e**z is at most 1 here.
        growth = math.exp(z)
        return growth / (1.0 + growth)
    return 1.0 / (1.0 + math.exp(-z))

def solve_linear_system(matrix: List[List[float]], vector: List[float]):
    """Solve ``matrix @ x = vector`` by Gauss-Jordan elimination with partial pivoting.

    The system size is ``len(vector)``. Work is done on an augmented copy,
    so neither *matrix* nor *vector* is modified.

    Returns:
        The solution as a list with one entry per unknown.

    Raises:
        ValueError: if, for some column, the largest available pivot has an
            absolute value below 1e-12.
    """
    size = len(vector)
    width = size + 1
    # Augmented system [A | b], built from copies of the coefficient rows.
    system = [coeffs[:] + [vector[k]] for k, coeffs in enumerate(matrix)]
    for lead in range(size):
        # Partial pivoting: pick the row at or below the diagonal with the
        # largest magnitude in this column.
        best = max(range(lead, size), key=lambda r: abs(system[r][lead]))
        if abs(system[best][lead]) < 1e-12:
            raise ValueError("Singular matrix encountered in Newton step")
        # Swapping a row with itself is a no-op, so no guard is needed.
        system[lead], system[best] = system[best], system[lead]
        scale = system[lead][lead]
        pivot_row = [entry / scale for entry in system[lead]]
        system[lead] = pivot_row
        # Clear this column in every other row (above and below the pivot).
        for r in range(size):
            if r == lead:
                continue
            multiplier = system[r][lead]
            if multiplier == 0:
                continue
            current = system[r]
            system[r] = [current[c] - multiplier * pivot_row[c] for c in range(width)]
    # After full reduction the last column holds the solution.
    return [solved[-1] for solved in system]

def train_logistic(features: Sequence[Sequence[float]], labels: Sequence[int], *, l2: float = 1e-3,
                   max_iter: int = 30, tol: float = 1e-9):
    """Fit logistic-regression weights with Newton's method and an L2 penalty.

    Weights start at zero. Every iteration accumulates the gradient and
    Hessian of the summed log-loss over all rows, adds the ridge term *l2*
    to every coordinate except index 0 (left unpenalized; in this notebook
    that is the constant column inserted by ``add_bias``), solves
    ``hessian @ step = gradient`` and subtracts ``step`` from the weights.

    Iteration stops once the largest absolute component of the step drops
    below *tol*, or after *max_iter* iterations. No error is raised if the
    tolerance was not reached.

    Returns:
        The list of weights, one per feature column.

    Raises:
        ValueError: propagated from ``solve_linear_system`` when it rejects
            the Hessian as singular.
    """
    dim = len(features[0])
    weights = [0.0] * dim
    for _iteration in range(max_iter):
        grad = [0.0] * dim
        hess = [[0.0] * dim for _ in range(dim)]
        for sample, target in zip(features, labels):
            score = sum(w * x for w, x in zip(weights, sample))
            prob = sigmoid(score)
            residual = prob - target
            # Per-sample curvature p * (1 - p) weighting the outer product.
            curvature = prob * (1.0 - prob)
            for a in range(dim):
                grad[a] += residual * sample[a]
            for a in range(dim):
                hess_row = hess[a]
                for b in range(dim):
                    hess_row[b] += curvature * sample[a] * sample[b]
        # Ridge penalty on everything but the first coordinate.
        for k in range(1, dim):
            grad[k] += l2 * weights[k]
            hess[k][k] += l2
        step = solve_linear_system(hess, grad)
        weights = [w - delta for w, delta in zip(weights, step)]
        if max(abs(delta) for delta in step) < tol:
            break
    return weights

def predict_probabilities(features: Sequence[Sequence[float]], weights: Sequence[float]):
    """Return, for each row, the sigmoid of the dot product of *weights* and that row."""
    probabilities = []
    for sample in features:
        score = sum(w * x for w, x in zip(weights, sample))
        probabilities.append(sigmoid(score))
    return probabilities

def accuracy_from_probs(probs: Sequence[float], labels: Sequence[int], threshold: float = 0.5) -> float:
    """Return the fraction of *labels* matched by thresholded *probs*.

    A probability at or above *threshold* is predicted as class 1, anything
    below it as class 0. The result is the number of predictions equal to
    their label divided by ``len(labels)``.

    Raises:
        ValueError: if *probs* and *labels* differ in length. Pairing them
            with ``zip`` would otherwise drop the surplus items silently
            while still dividing by ``len(labels)``, giving a wrong score.
        ZeroDivisionError: if both sequences are empty (unchanged from the
            previous behaviour).
    """
    if len(probs) != len(labels):
        raise ValueError(
            f"probs and labels must have the same length "
            f"(got {len(probs)} and {len(labels)})"
        )
    hits = sum(
        1
        for prob, target in zip(probs, labels)
        if (1 if prob >= threshold else 0) == target
    )
    return hits / len(labels)


In [None]:

def _standardize_with_bias(train_rows, test_rows):
    """Scale both splits with training-set statistics, then prepend the bias column.

    Returns ``(means, stds, scaled_train, scaled_test)``.
    """
    means, stds = compute_scaler(train_rows)
    scaled_train = add_bias(apply_scaler(train_rows, means, stds))
    scaled_test = add_bias(apply_scaler(test_rows, means, stds))
    return means, stds, scaled_train, scaled_test


# Assignment 1 baseline: Lag1-Lag5 plus Volume
X_train_all, y_train_all, X_test_all, y_test_all = split_by_year(FEATURE_NAMES)
(
    train_means_all,
    train_stds_all,
    X_train_all_scaled,
    X_test_all_scaled,
) = _standardize_with_bias(X_train_all, X_test_all)
weights_all = train_logistic(X_train_all_scaled, y_train_all)
probs_all = predict_probabilities(X_test_all_scaled, weights_all)
acc_all = accuracy_from_probs(probs_all, y_test_all)
print(f"Assignment 1 – LR with Lag1-Lag5 & Volume accuracy: {acc_all:.3f}")

# Assignment 2 model: Lag1 and Lag2 only
lag12_features = ["Lag1", "Lag2"]
X_train_12, y_train_12, X_test_12, y_test_12 = split_by_year(lag12_features)
(
    train_means_12,
    train_stds_12,
    X_train_12_scaled,
    X_test_12_scaled,
) = _standardize_with_bias(X_train_12, X_test_12)
weights_12 = train_logistic(X_train_12_scaled, y_train_12)
probs_12 = predict_probabilities(X_test_12_scaled, weights_12)
acc_12 = accuracy_from_probs(probs_12, y_test_12)
print(f"Assignment 2 – LR with Lag1 & Lag2 accuracy: {acc_12:.3f}")


On the held-out 2005 data, the Lag1/Lag2-only logistic regression attains higher accuracy than the Assignment 1 model with all six predictors, suggesting that the additional lags and volume add noise for the 2005 test data rather than helpful signal.