In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.datasets import fetch_openml, load_iris

np.random.seed(42)


def generate_highly_correlated_data(n_samples=500, n_features=7, rho=0.9, noise_std=1.0):
    """Draw a synthetic linear-regression problem with equicorrelated features.

    Every pair of distinct features has covariance ``rho`` and every feature
    has unit variance. The target is a fixed linear combination of the
    features plus Gaussian noise. Sampling uses NumPy's global random state.

    Args:
        n_samples: Number of rows to draw.
        n_features: Number of feature columns.
        rho: Off-diagonal entry of the feature covariance matrix.
        noise_std: Standard deviation of the additive noise on the target.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n_samples, n_features)`` and
        ``y`` of shape ``(n_samples,)``.
    """
    # Covariance matrix: 1.0 on the diagonal, rho everywhere else.
    on_diagonal = np.eye(n_features, dtype=bool)
    covariance = np.where(on_diagonal, 1.0, rho)

    mean = np.zeros(n_features)
    features = np.random.multivariate_normal(mean, covariance, size=n_samples)

    # Ground-truth coefficients are evenly spaced from 1 to n_features / 2.
    coefficients = np.linspace(1, n_features * 0.5, n_features)
    noise = np.random.normal(0, noise_std, size=n_samples)
    target = np.dot(features, coefficients) + noise
    return features, target

def _ridge_objective(X, y, w, b, alpha):
    """Return (residuals, ridge cost) for weights ``w`` and intercept ``b``."""
    error = (X.dot(w) + b) - y
    n = X.shape[0]
    cost = (1/(2*n))*np.sum(error**2) + (alpha/2)*np.sum(w**2)
    return error, cost


def ridge_gradient_descent(X, y, lr, alpha, n_iter=5000):
    """Fit ridge regression by full-batch gradient descent.

    Minimises ``(1 / (2n)) * sum((Xw + b - y)^2) + (alpha / 2) * sum(w^2)``;
    the intercept ``b`` is not penalised. Weights and intercept start at zero.

    Args:
        X: 2-D feature array of shape ``(n, d)``.
        y: Target array of length ``n``.
        lr: Gradient-descent step size.
        alpha: L2 penalty strength.
        n_iter: Maximum number of iterations. With ``n_iter <= 0`` no update
            is made and the zero-initialised model is returned.

    Returns:
        Tuple ``(w, b, cost)``. ``cost`` is the objective evaluated at the
        start of the last iteration performed (i.e. before that iteration's
        parameter update), or at the initial point if no iteration ran.
        If the cost becomes non-finite or exceeds ``1e10`` the run is treated
        as diverged and ``(None, None, np.inf)`` is returned.
    """
    n, d = X.shape
    w = np.zeros(d)
    b = 0.0
    prev_cost = np.inf

    # Seed the cost at the starting point so the return value is defined even
    # when the loop body never executes (n_iter <= 0).
    _, cost = _ridge_objective(X, y, w, b, alpha)

    for _ in range(n_iter):
        error, cost = _ridge_objective(X, y, w, b, alpha)

        # Bail out early on divergence (step size too large for this problem).
        if not np.isfinite(cost) or cost > 1e10:
            return None, None, np.inf

        grad_w = (1/n)*(X.T.dot(error)) + alpha*w
        grad_b = (1/n)*np.sum(error)

        w -= lr * grad_w
        b -= lr * grad_b

        # Stop once the objective has effectively stopped changing.
        if abs(prev_cost - cost) < 1e-8:
            break
        prev_cost = cost

    return w, b, cost


# Q1 data: synthetic correlated features, 80/20 split, standardised using
# statistics from the training portion only.
X, y = generate_highly_correlated_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)


# Hyper-parameter grid for the gradient-descent ridge solver.
learning_rates = [1e-4, 1e-3, 1e-2, 0.05]
alphas = [1e-10, 1e-5, 1e-3, 0.1, 1, 10]

# One record per (lr, alpha) pair that did not diverge.
results = []
for lr in learning_rates:
    for alpha in alphas:
        w, b, cost = ridge_gradient_descent(X_train_s, y_train, lr, alpha)
        if cost != np.inf:
            # Score the fitted model on the held-out split.
            preds = np.dot(X_test_s, w) + b
            r2 = r2_score(y_test, preds)
            mse = mean_squared_error(y_test, preds)
            results.append(
                {'lr': lr, 'alpha': alpha, 'cost': cost, 'R2': r2, 'MSE': mse}
            )

df = pd.DataFrame(results)
# The "best" row is the one with the highest test-set R2.
best = df.loc[df['R2'].idxmax()]

print("\nQ1: Ridge Regression (Gradient Descent)")
print(df)
print("\nBest parameters:")
print(best)



# Q3: compare cross-validated Ridge and Lasso on the Boston housing data
# fetched from OpenML.
boston = fetch_openml(name='Boston', version=1, as_frame=True)


Xb = boston.data.copy()
yb = boston.target.astype(float)

# Categorical / object columns are converted to numbers; values that cannot be
# parsed become NaN and are imputed with the column mean just below.
for col in Xb.columns:
    if Xb[col].dtype.name == "category" or Xb[col].dtype == object:
        Xb[col] = pd.to_numeric(Xb[col], errors='coerce')

Xb = Xb.fillna(Xb.mean())

Xb_train, Xb_test, yb_train, yb_test = train_test_split(Xb, yb, test_size=0.2, random_state=42)

# Standardise with statistics computed on the training split only.
scaler_b = StandardScaler().fit(Xb_train)
Xb_train_s = scaler_b.transform(Xb_train)
Xb_test_s = scaler_b.transform(Xb_test)

# 50 candidate penalties, log-spaced between 1e-3 and 1e3.
alphas_cv = np.logspace(-3, 3, 50)

# The per-sample CV values are never read here, so they are not stored. The
# old `store_cv_values` flag was deprecated in scikit-learn 1.5 and removed in
# 1.7 (renamed `store_cv_results`); omitting it works on every version.
ridgecv = RidgeCV(alphas=alphas_cv).fit(Xb_train_s, yb_train)
lassocv = LassoCV(alphas=alphas_cv, cv=5, max_iter=10000).fit(Xb_train_s, yb_train)

ridge_r2 = r2_score(yb_test, ridgecv.predict(Xb_test_s))
lasso_r2 = r2_score(yb_test, lassocv.predict(Xb_test_s))

print("Q3: RidgeCV & LassoCV (Boston Dataset)")
print(f"RidgeCV -> Best alpha: {ridgecv.alpha_:.5f}, Test R²: {ridge_r2:.4f}")
print(f"LassoCV -> Best alpha: {lassocv.alpha_:.5f}, Test R²: {lasso_r2:.4f}")

# Report whichever model scored higher on the test split (ties go to Lasso).
if ridge_r2 > lasso_r2:
    print("\nRidge Regression performs better — it's more stable when features are correlated.")
else:
    print("\nLasso Regression performs better — it performs feature selection by shrinking some weights to 0.")



print("\nQ4: Multiclass Logistic Regression (One-vs-Rest)")

# Load Iris, hold out 20% for testing, and standardise both splits with the
# statistics of the training split.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), elementwise.

    Uses a piecewise form so that large-magnitude inputs never overflow:
    only ``exp(-|z|)`` (which lies in (0, 1]) is ever computed.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        A NumPy float scalar for scalar input, otherwise a float array with
        the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal.
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d results so scalar input still yields a scalar.
    return out[()] if out.ndim == 0 else out

def train_logistic(X, y, lr=0.1, n_iter=1000):
    """Train a binary logistic-regression model with batch gradient descent.

    Runs exactly ``n_iter`` full-batch updates starting from zero weights and
    a zero intercept; there is no early stopping and no regularisation.

    Args:
        X: 2-D feature array of shape ``(n, d)``.
        y: Array of ``n`` binary targets (0 or 1).
        lr: Step size.
        n_iter: Number of gradient steps.

    Returns:
        Tuple ``(w, b)``: weight vector of length ``d`` and intercept.
    """
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    for _ in range(n_iter):
        # Residual between predicted probability and label drives both updates.
        residual = sigmoid(X.dot(w) + b) - y
        w = w - lr * (X.T.dot(residual) / n_samples)
        b = b - lr * np.mean(residual)
    return w, b

# One-vs-rest: fit one binary classifier per class, where the positive label
# marks membership of that class.
classes = np.unique(y_train)
weights = {}
for c in classes:
    y_bin = (y_train == c).astype(float)
    w, b = train_logistic(X_train, y_bin)
    weights[c] = (w, b)

def predict_ovr(X, weights):
    """Predict class labels with a one-vs-rest set of linear classifiers.

    Args:
        X: 2-D feature array of shape ``(n, d)``.
        weights: Mapping ``label -> (w, b)`` holding one binary classifier
            per class.

    Returns:
        1-D array of length ``n`` containing, for each row, the label (dict
        key) of the classifier with the highest score. Ties go to the
        classifier that appears first in ``weights``.
    """
    labels = np.array(list(weights.keys()))
    # The sigmoid is strictly increasing, so ranking the raw logits picks the
    # same winner while avoiding artificial ties when probabilities saturate
    # at 0.0 or 1.0 in floating point.
    scores = np.column_stack([X.dot(w) + b for w, b in weights.values()])
    # Map the winning column back to its label instead of returning the bare
    # column position, which is only correct when labels are 0..k-1.
    return labels[np.argmax(scores, axis=1)]

# Accuracy is the fraction of test rows whose predicted class matches the label.
y_pred = predict_ovr(X_test, weights)
accuracy = (y_pred == y_test).mean()
print(f"Accuracy (One-vs-Rest Logistic Regression): {accuracy:.3f}")



Q1: Ridge Regression (Gradient Descent)
        lr         alpha       cost        R2        MSE
0   0.0001  1.000000e-10   1.660622  0.984561   3.086058
1   0.0001  1.000000e-05   1.660802  0.984561   3.086074
2   0.0001  1.000000e-03   1.678582  0.984553   3.087597
3   0.0001  1.000000e-01   3.430182  0.983637   3.270813
4   0.0001  1.000000e+00  17.258362  0.964194   7.157198
5   0.0001  1.000000e+01  72.467379  0.624297  75.099020
6   0.0010  1.000000e-10   0.630606  0.994664   1.066649
7   0.0010  1.000000e-05   0.630794  0.994664   1.066652
8   0.0010  1.000000e-03   0.649376  0.994662   1.066985
9   0.0010  1.000000e-01   2.465418  0.994277   1.143880
10  0.0010  1.000000e+00  16.473348  0.975803   4.836793
11  0.0010  1.000000e+01  71.750016  0.620788  75.800572
12  0.0100  1.000000e-10   0.533332  0.995546   0.890295
13  0.0100  1.000000e-05   0.533541  0.995546   0.890293
14  0.0100  1.000000e-03   0.554161  0.995547   0.890156
15  0.0100  1.000000e-01   2.448603  0.994780  

