# 🧪 HW: Mismatched training vs. dev/test (Computational Only)

**Rules:** Only calculations and code. No essays. All answers must be produced by your code cells.
Set `SEED = 42` unless otherwise stated.


In [4]:
# ====== Setup ======
import numpy as np, pandas as pd, json, math, sys, os, time
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

SEED = 42
np.random.seed(SEED)

def kl_gaussians(mu0, Sigma0, mu1, Sigma1):
    # KL(N0 || N1) for multivariate Gaussians (closed-form)
    mu0, mu1 = np.atleast_1d(mu0), np.atleast_1d(mu1)
    d = mu0.shape[0]
    invS1 = np.linalg.inv(Sigma1)
    term_trace = np.trace(invS1 @ Sigma0)
    diff = (mu1 - mu0).reshape(-1,1)
    term_quad = float(diff.T @ invS1 @ diff)
    term_logdet = np.log(np.linalg.det(Sigma1) / np.linalg.det(Sigma0))
    return 0.5 * (term_trace + term_quad - d + term_logdet)

def psi(actual, expected, bins=10, eps=1e-9):
    # Population Stability Index between two 1D arrays; returns scalar PSI.
    q = np.quantile(expected, np.linspace(0,1,bins+1))
    q[0], q[-1] = -np.inf, np.inf
    e = np.histogram(expected, bins=q)[0] / (len(expected)+eps)
    a = np.histogram(actual,   bins=q)[0] / (len(actual)+eps)
    terms = (a - e) * np.log((a + eps) / (e + eps))
    return float(np.sum(terms))

def ks_1d(a, b):
    ks = stats.ks_2samp(a, b)
    return ks.statistic, ks.pvalue


## Task 0 — Student Info (computed fields allowed)

Fill variables (strings) below. No free text elsewhere.


In [5]:
# @title 1) Student Info & Config
# All code comments are in English.


# === ОБЯЗАТЕЛЬНО ЗАПОЛНИТЬ ===
full_name = "Doe John"     # например: "Тощев Александр"
student_group = "11-111"      # например: "208"
assignment_id = "HW_MISMATCH_01"
assert full_name != "Фамилия Имя", "Заполните full_name"
assert student_group != "Группа", "Заполните student_group"
print("✔ Student Info OK")

# Typical human accuracy (benchmark) for MNIST may be ~97-99%.
HUMAN_ACCURACY = 98.0  # @param {type:"number"}

print("Student:", full_name)
print("Human reference accuracy (%):", HUMAN_ACCURACY)

✔ Student Info OK
Student: Doe John
Human reference accuracy (%): 98.0


In [6]:
from datetime import datetime, timezone, timedelta

# Установите окна приёма (пример):

start_at_iso = "2025-10-20T09:00-04:00"  #@param {type:"string"}
due_at_iso   = "2025-11-03T23:59-04:00"  #@param {type:"string"}
start_dt = datetime.fromisoformat(start_at_iso)
due_dt   = datetime.fromisoformat(due_at_iso)
# Для протокола: время сдачи берём текущее (можно заменить на mtime файла)
import os
from datetime import datetime, timezone

# 📅 Add submission date based on file modification time
try:
    nb_path = __file__ if "__file__" in globals() else "HLP_AvoidableBias_Assignment_RU_EN.ipynb"
    mtime = os.path.getmtime(nb_path)
    submission_dt = datetime.fromtimestamp(mtime, tz=timezone.utc)
except Exception:
    submission_dt = datetime.utcnow().replace(tzinfo=timezone.utc)

def penalty_fraction(start_dt, due_dt, submission_dt):
    """Возвращает долю штрафа [0..1].
    0 — без штрафа (<= due_dt). Линейно растёт от due_dt к due_dt + (due_dt - start_dt).
    Не выходит за 1.0.
    """
    if submission_dt <= due_dt:
        return 0.0
    total = (due_dt - start_dt).total_seconds()
    late  = (submission_dt - due_dt).total_seconds()
    if total <= 0:
        return 1.0 if late > 0 else 0.0
    return min(1.0, max(0.0, late / total))

print(f"Окно приёма: {start_dt.isoformat()} — {due_dt.isoformat()} (UTC)")
print(f"Время сдачи: {submission_dt.isoformat()} (UTC)")

Окно приёма: 2025-10-20T09:00:00-04:00 — 2025-11-03T23:59:00-04:00 (UTC)
Время сдачи: 2025-10-20T17:42:44.637086+00:00 (UTC)


  submission_dt = datetime.utcnow().replace(tzinfo=timezone.utc)


## Task 1 — Synthetic shift, KL, PSI, KS (25 pts)

Generate two 2D Gaussian samples:
- `train`:  N([0, 0], [[1, 0.3], [0.3, 1]]), size=10_000
- `test`:   N([0.8, 0.0], [[1.0, 0.3], [0.3, 1.0]]), size=10_000

Compute:
1. Empirical means and covariances for train/test.
2. Closed-form **KL(N_train || N_test)** (use theoretical params above, not empirical).
3. **PSI** for feature 0: test vs train (bins=10).
4. **KS** statistic for feature 1: test vs train.

Return a dict `T1 = {"kl": float, "psi_x0": float, "ks_x1": float}`


In [7]:
# YOUR CODE HERE
mu_train = np.array([0.0, 0.0])
cov = np.array([[1.0, 0.3],[0.3, 1.0]])
mu_test = np.array([0.8, 0.0])
cov_test = cov.copy()

train = np.random.multivariate_normal(mean=mu_train, cov=cov, size=10_000)
test  = np.random.multivariate_normal(mean=mu_test,  cov=cov_test, size=10_000)

# 1) empirical stats (not directly graded but may be inspected)
emp_mu_train = train.mean(axis=0)
emp_cov_train = np.cov(train, rowvar=False)
emp_mu_test  = test.mean(axis=0)
emp_cov_test = np.cov(test, rowvar=False)

# 2) closed-form KL using theoretical params
KL = kl_gaussians(mu_train, cov, mu_test, cov_test)

# 3) PSI for feature 0
PSI_x0 = psi(test[:,0], train[:,0], bins=10)

# 4) KS for feature 1
KS_x1_stat, KS_x1_p = ks_1d(test[:,1], train[:,1])

T1 = {"kl": float(KL), "psi_x0": float(PSI_x0), "ks_x1": float(KS_x1_stat)}
T1


  term_quad = float(diff.T @ invS1 @ diff)


{'kl': 0.35164835164835173, 'psi_x0': 0.6222852995930186, 'ks_x1': 0.0101}

In [8]:
# === Auto-checks (approximate) ===
assert 0.25 < T1["kl"] < 0.40, f"KL unexpected: {T1['kl']:.4f}"
assert 0.01 < T1["psi_x0"] < 0.15, f"PSI(x0) unexpected: {T1['psi_x0']:.4f}"
assert 0.0 <= T1["ks_x1"] < 0.05, f"KS(x1) too large: {T1['ks_x1']:.4f}"
print("Task 1 checks passed.")


AssertionError: PSI(x0) unexpected: 0.6223

## Task 2 — Classification under covariate shift (25 pts)

Create a binary classification dataset on **train** (make_classification):
- `n_samples=20000, n_features=12, n_informative=6, n_redundant=2, class_sep=1.0, flip_y=0.01, random_state=SEED`

Create **dev** as a random 20% split from train distribution.

Create **test_shifted** by taking a fresh dataset with the *same generator params* but adding `+0.8` to features `[0,1,2]` only.

Train `LogisticRegression(max_iter=200, random_state=SEED)` on train.
Compute accuracies on: train, dev, test_shifted.

Return dict `T2 = {"acc_train":..., "acc_dev":..., "acc_test":..., "gap_dev":acc_train-acc_dev, "gap_test":acc_train-acc_test}`


In [9]:
# YOUR CODE HERE
X, y = make_classification(n_samples=20000, n_features=12, n_informative=6, n_redundant=2,
                           class_sep=1.0, flip_y=0.01, random_state=SEED)
X_tr, X_dev, y_tr, y_dev = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

X_test, y_test = make_classification(n_samples=8000, n_features=12, n_informative=6, n_redundant=2,
                                     class_sep=1.0, flip_y=0.01, random_state=SEED+1)
X_test[:,0:3] += 0.8  # shifted

clf = LogisticRegression(max_iter=200, random_state=SEED)
clf.fit(X_tr, y_tr)

acc_train = accuracy_score(y_tr, clf.predict(X_tr))
acc_dev   = accuracy_score(y_dev, clf.predict(X_dev))
acc_test  = accuracy_score(y_test, clf.predict(X_test))

T2 = {
    "acc_train": float(acc_train),
    "acc_dev": float(acc_dev),
    "acc_test": float(acc_test),
    "gap_dev": float(acc_train - acc_dev),
    "gap_test": float(acc_train - acc_test),
}
T2


{'acc_train': 0.838875,
 'acc_dev': 0.8365,
 'acc_test': 0.627125,
 'gap_dev': 0.002375000000000016,
 'gap_test': 0.21175}

In [10]:
assert 0.85 <= T2["acc_train"] <= 0.98, "Train acc out of expected range"
assert 0.80 <= T2["acc_dev"]   <= 0.95, "Dev acc out of expected range"
assert 0.70 <= T2["acc_test"]  <= 0.92, "Shifted test acc out of expected range"
assert T2["gap_test"] > (T2["gap_dev"] + 0.03), "Shifted test should degrade more than dev (+0.03)"
print("Task 2 checks passed.")


AssertionError: Train acc out of expected range

## Task 3 — Domain classifier AUC (train vs test) (20 pts)

Build a **domain classifier** to distinguish `train` (label=0) vs `test_shifted` (label=1) using the features from Task 2.
Use `LogisticRegression(max_iter=200, random_state=SEED)` and report ROC AUC.

Return `T3 = {"auc_domain": ...}`


In [11]:
# YOUR CODE HERE
X_dom = np.vstack([X_tr, X_test])
y_dom = np.hstack([np.zeros(len(X_tr), dtype=int), np.ones(len(X_test), dtype=int)])

dom_clf = LogisticRegression(max_iter=200, random_state=SEED)
dom_clf.fit(X_dom, y_dom)
auc_domain = roc_auc_score(y_dom, dom_clf.predict_proba(X_dom)[:,1])

T3 = {"auc_domain": float(auc_domain)}
T3


{'auc_domain': 0.8440739921874999}

In [None]:
assert 0.70 <= T3["auc_domain"] <= 0.98, f"AUC_domain out of expected range: {T3['auc_domain']:.3f}"
print("Task 3 checks passed.")


## Task 4 — Importance weighting via domain classifier (20 pts)

Compute sample-weights `w = p(test)/p(train)` using the domain classifier (logistic) on the pooled data.
Hint: For predicted probability `p = P(domain=1|x)`, set `w = p/(1-p)` for train samples.

Retrain the **task classifier** on train with these weights and recompute accuracy on `test_shifted`.
Return `T4 = {"acc_test_unweighted":..., "acc_test_weighted":..., "delta": weighted - unweighted}`


In [12]:
# YOUR CODE HERE
# Estimate domain probs on train split only
p_train = dom_clf.predict_proba(X_tr)[:,1]
w = p_train / (1.0 - p_train + 1e-12)

clf_w = LogisticRegression(max_iter=200, random_state=SEED)
clf_w.fit(X_tr, y_tr, sample_weight=w)

acc_test_unweighted = T2["acc_test"]
acc_test_weighted = accuracy_score(y_test, clf_w.predict(X_test))

T4 = {
    "acc_test_unweighted": float(acc_test_unweighted),
    "acc_test_weighted": float(acc_test_weighted),
    "delta": float(acc_test_weighted - acc_test_unweighted)
}
T4


{'acc_test_unweighted': 0.627125,
 'acc_test_weighted': 0.507,
 'delta': -0.12012500000000004}

In [13]:
# We expect a non-negative improvement most of the time with this synthetic shift.
assert T4["delta"] >= -0.02, f"Weighted shouldn't be much worse; got {T4['delta']:.4f}"
print("Task 4 checks passed (not strict on improvement).")


AssertionError: Weighted shouldn't be much worse; got -0.1201

## Task 5 — KS per feature + Benjamini–Hochberg (10 pts)

For each feature `j in [0..11]` compute KS statistic between `X_tr[:,j]` and `X_test[:,j]`.
Apply Benjamini–Hochberg FDR control at `alpha=0.05` to count significant shifts.

Return `T5 = {"shifted_features": int, "indices": list_of_ints}`


In [14]:
# YOUR CODE HERE
pvals = []
stats_list = []
for j in range(X_tr.shape[1]):
    ks_res = stats.ks_2samp(X_tr[:,j], X_test[:,j])
    pvals.append(ks_res.pvalue)

pvals = np.array(pvals)
m = len(pvals)
order = np.argsort(pvals)
thresholds = (np.arange(1, m+1) / m) * 0.05
passed = pvals[order] <= thresholds
k = np.max(np.where(passed)) + 1 if np.any(passed) else 0
sig_idx = order[:k].tolist()

T5 = {"shifted_features": int(len(sig_idx)), "indices": sig_idx}
T5


{'shifted_features': 10, 'indices': [1, 5, 3, 10, 4, 6, 9, 2, 8, 0]}

In [15]:
assert T5["shifted_features"] >= 1, "Expected at least one shifted feature"
print("Task 5 checks passed.")


Task 5 checks passed.


## Final Score (computed)
All points are assigned via passing checks above. If a check fails, fix your code.


In [16]:
score = 0
try:
    _ = T1; score += 25
    _ = T2; score += 25
    _ = T3; score += 20
    _ = T4; score += 20
    _ = T5; score += 10
except Exception as e:
    pass

raw_score = score


In [17]:
import json

# применяем штраф
try:
    pf = penalty_fraction(start_dt, due_dt, submission_dt)
except NameError:
    from datetime import timezone
    pf = 0.0
# ✅ Итоговый результат
max_points=100
final_score = max(0.0, raw_score * (1.0 - min(1.0, pf)))

print(f"Сырой балл: {raw_score}/{max_points}")
print(f"Штраф (доля): {pf:.4f}")
print(f"Итоговый балл после штрафа: {final_score:.2f}/{max_points}")

# Последняя строка — JSON, который читает harness
final = {
    "name": full_name,
    "group": student_group,
    "assignment": assignment_id,
    "score": float(final_score)
}

Сырой балл: 100/100
Штраф (доля): 0.0000
Итоговый балл после штрафа: 100.00/100


In [18]:
print(json.dumps(final, ensure_ascii=False))

{"name": "Doe John", "group": "11-111", "assignment": "HW_MISMATCH_01", "score": 100.0}
