# HW 4 – Assignment 1

This notebook follows the instructions from **HW 4** to train a logistic regression model on the `smarket.csv` dataset. The goal is to use the historical market indicators `Lag1`-`Lag5` along with `Volume` to predict whether the market direction (`Direction`) will be **Up** or **Down** on the following day. Training is performed on records with `Year < 2005` and the 2005 data is reserved for testing.


In [None]:
import csv
import math
from pathlib import Path


In [None]:
# Relative path to the input CSV; it is resolved against the current working
# directory when the file is opened.
DATA_PATH = Path('smarket.csv')


def load_smarket(path: Path):
    """Parse the CSV file at ``path`` into three parallel lists.

    Every data row contributes one entry to each returned list:

    * ``X`` -- six floats, in the order ``Lag1`` .. ``Lag5`` then ``Volume``.
    * ``y`` -- ``1`` when the ``Direction`` cell equals "up" (ignoring case
      and surrounding whitespace), ``0`` for any other value.
    * ``years`` -- the ``Year`` cell converted to ``int``.

    Returns:
        The tuple ``(X, y, years)``.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a numeric cell cannot be converted.
    """
    feature_columns = [f'Lag{i}' for i in range(1, 6)] + ['Volume']
    X = []
    y = []
    years = []
    # newline='' leaves line-ending handling to the csv module.
    with path.open(newline='') as handle:
        for record in csv.DictReader(handle):
            years.append(int(record['Year']))
            X.append([float(record[column]) for column in feature_columns])
            is_up = record['Direction'].strip().lower() == 'up'
            y.append(1 if is_up else 0)
    return X, y, years


# Load every row once, then split by calendar year.
X, y, years = load_smarket(DATA_PATH)
# Boolean selectors aligned with X / y: rows before 2005 are used for
# training, rows from exactly 2005 for testing. Rows with a year after 2005
# (if any) would fall into neither split.
train_mask = [year < 2005 for year in years]
test_mask = [year == 2005 for year in years]

# Apply each mask to both the feature rows and the labels so they stay aligned.
X_train = [x for x, keep in zip(X, train_mask) if keep]
y_train = [label for label, keep in zip(y, train_mask) if keep]
X_test = [x for x, keep in zip(X, test_mask) if keep]
y_test = [label for label, keep in zip(y, test_mask) if keep]

# Quick sanity check on the split sizes.
print(f"Loaded {len(X)} total observations: {len(X_train)} for training and {len(X_test)} for testing.")


In [None]:
def compute_standardization(dataset):
    """Compute the per-column mean and standard deviation of ``dataset``.

    The number of columns is taken from the first row. The standard
    deviation is the population form (squared deviations divided by the
    number of rows). A column whose variance is not positive gets a standard
    deviation of ``1.0`` instead, so dividing by it leaves the centered
    values unchanged.

    Args:
        dataset: Non-empty sequence of numeric rows.

    Returns:
        ``(means, stds)`` -- two lists with one entry per column.

    Raises:
        IndexError: If ``dataset`` is empty or a row is shorter than the
            first row.
    """
    row_count = len(dataset)
    column_count = len(dataset[0])
    means = []
    stds = []
    for col in range(column_count):
        column = [sample[col] for sample in dataset]
        mu = sum(column) / row_count
        spread = sum((value - mu) ** 2 for value in column) / row_count
        means.append(mu)
        if spread > 0:
            stds.append(spread ** 0.5)
        else:
            stds.append(1.0)
    return means, stds


def apply_standardization(dataset, means, stds):
    """Return a standardized copy of ``dataset``.

    Each value is transformed to ``(value - means[j]) / stds[j]``, where
    ``j`` is the value's column index. The input rows are not modified.

    Args:
        dataset: Sequence of numeric rows.
        means: Per-column offsets to subtract.
        stds: Per-column divisors.

    Returns:
        A new list of new row lists, in the same order as ``dataset``.
    """
    return [
        [(value - means[col]) / stds[col] for col, value in enumerate(sample)]
        for sample in dataset
    ]


# Estimate the scaling statistics from the training rows only, then apply
# those same means/stds to both splits so the test rows are scaled with the
# training statistics.
feature_means, feature_stds = compute_standardization(X_train)
X_train_scaled = apply_standardization(X_train, feature_means, feature_stds)
X_test_scaled = apply_standardization(X_test, feature_means, feature_stds)


In [None]:
def sigmoid(z: float) -> float:
    """Return the logistic function ``1 / (1 + e**-z)`` of ``z``.

    The exponential is only ever taken of a non-positive number, so it
    cannot overflow for large ``|z|``; the result is then assembled with
    the formula that matches the sign of ``z``.
    """
    decay = math.exp(-abs(z))  # e**-|z|, always within (0, 1]
    return 1.0 / (1.0 + decay) if z >= 0 else decay / (1.0 + decay)


def train_logistic_regression(features, labels, lr=0.1, epochs=2000):
    """Fit logistic-regression weights with full-batch gradient descent.

    Weights start at zero. Each epoch sums ``(prediction - target)`` times
    the inputs over all samples and then moves every weight by
    ``lr / n_samples`` times that sum, i.e. one step along the mean gradient.

    Args:
        features: Non-empty sequence of numeric rows; the feature count is
            taken from the first row.
        labels: Sequence of numeric targets, one per row of ``features``
            (0/1 in this notebook).
        lr: Learning rate.
        epochs: Number of full passes over the data.

    Returns:
        A list of ``n_features + 1`` floats: the intercept at index 0
        followed by one coefficient per feature.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
        IndexError: If ``features`` is empty.
    """
    n_features = len(features[0])
    n_samples = len(features)
    # zip() would silently drop the unmatched tail while the step below is
    # still divided by len(features); reject the mismatch instead.
    if len(labels) != n_samples:
        raise ValueError(
            f"features and labels must have the same length "
            f"(got {n_samples} and {len(labels)})"
        )

    weights = [0.0] * (n_features + 1)
    step = lr / n_samples  # constant across epochs
    for _ in range(epochs):
        gradient = [0.0] * (n_features + 1)
        for row, target in zip(features, labels):
            # Linear score: intercept plus the weighted feature values.
            z = weights[0]
            for w, value in zip(weights[1:], row):
                z += w * value
            error = sigmoid(z) - target
            gradient[0] += error
            for j in range(n_features):
                gradient[j + 1] += error * row[j]
        # Update only after the whole batch has been accumulated.
        for j in range(n_features + 1):
            weights[j] -= step * gradient[j]
    return weights


def predict_classes(features, weights):
    """Predict a 0/1 class for every row of ``features``.

    A row's score is ``weights[0]`` plus the sum of each remaining weight
    times the matching feature value. The row is labelled ``1`` when the
    sigmoid of that score is at least 0.5, otherwise ``0``.

    Args:
        features: Iterable of numeric rows.
        weights: Intercept followed by one coefficient per feature.

    Returns:
        A list of ints (0 or 1), one per row, in input order.
    """
    def classify(sample):
        logit = weights[0]
        for coef, x in zip(weights[1:], sample):
            logit += coef * x
        return 1 if sigmoid(logit) >= 0.5 else 0

    return [classify(sample) for sample in features]


# Fit on the standardized training rows using the default learning rate and
# epoch count, then classify the standardized 2005 rows.
weights = train_logistic_regression(X_train_scaled, y_train)
predictions = predict_classes(X_test_scaled, weights)
# Accuracy is the fraction of test rows whose predicted class equals the label.
# NOTE(review): this divides by len(y_test), so an empty test split would
# raise ZeroDivisionError.
accuracy = sum(1 for pred, true in zip(predictions, y_test) if pred == true) / len(y_test)
print(f"Classification accuracy on 2005 test data: {accuracy:.3f}")
