In [1]:
"""
Step-by-step Lasso on a 3×2 toy set
-----------------------------------
 Data (already centred enough for a demo):

 sample   x1   x2     y
   1      1    2      5
   2      0    1      2
   3     -1   -1     -2

 We use λ = 1.0 so you can clearly see the shrinkage.
"""

'\nStep-by-step Lasso on a 3×2 toy set\n-----------------------------------\n Data (already centred enough for a demo):\n\n sample   x1   x2     y\n   1      1    2      5\n   2      0    1      2\n   3     -1   -1     -2\n\n We use λ = 1.0 so you can clearly see the shrinkage.\n'

Lasso stands for Least Absolute Shrinkage and Selection Operator.
When Should You Use Lasso?
When you have many features and only some are truly important.

When you want to reduce overfitting and simplify your model.

When you want to automatically select features without manual work.


In [2]:
import numpy as np

In [5]:
# Toy design matrix: one row per sample, one column per feature (x1, x2).
x = np.array(
    [
        [1.0, 2.0],
        [0.0, 1.0],
        [-1.0, -1.0],
    ]
)
# Target value for each of the three samples.
y = np.array([5.0, 2.0, -2.0])

In [8]:
# n = number of samples (rows of x), p = number of features (columns of x).
n, p = x.shape
print((n, p))

(3, 2)


In [11]:
lam = 1                      # L1 penalty strength λ
w, b = np.zeros(p), 0.0      # all weights and the intercept start at zero
# Per-feature mean of squared entries, (1/n) Σ_i x_ij²;
# every coordinate update below divides by this value.
col_len = np.square(x).sum(axis=0) / n


In [14]:
def S(x, l):
    """Soft-thresholding operator: shrink ``x`` toward zero by ``l``.

    Computes ``sign(x) * max(|x| - l, 0)``. Values with ``|x| <= l`` become 0;
    larger values keep their sign and lose ``l`` in magnitude.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to shrink. Inside this function the name shadows the global
        data matrix ``x``.
    l : float
        Threshold (the Lasso penalty λ in this notebook).

    Returns
    -------
    NumPy scalar for scalar input, ndarray of the same shape for array input.
    """
    # np.maximum / np.abs work element-wise, so arrays and lists are accepted
    # too; the built-in max() only handles a single scalar here.
    return np.sign(x) * np.maximum(np.abs(x) - l, 0.0)
print("Initial state  →  w = [0, 0],  b = 0")
print("="*50)


Initial state  →  w = [0, 0],  b = 0


In [None]:
# Intercept step: shift b by the mean of the current residuals.
resid = y - (x @ w + b)
b = b + np.mean(resid)
print(f"[1] After bias update: b = {b:5.2f}")

[1] After bias update: b =  1.67


In [None]:
j = 0  # coordinate being updated: first feature (x1)
# Partial residual: the target minus the model's prediction with feature j's
# own contribution taken back out ("muting" feature j).
r_j = y - (x @ w - w[j] * x[:, j] + b)
# Average alignment between feature j and that partial residual.
corr = x[:, j] @ r_j / n
# Soft-threshold by lam, then rescale by the feature's mean squared value.
w[j] = S(corr, lam) / col_len[j]
print(f"[1] After w1 update:  corr={corr:5.2f},  w1 = {w[j]:5.2f}")


[1] After w1 update:  corr= 2.33,  w1 =  2.00


In [18]:
j = 1  # coordinate being updated: second feature (x2)
# Partial residual with feature j's own contribution removed; w[0] already
# holds the value set by the preceding w1 update.
r_j = y - (x @ w - w[j] * x[:, j] + b)
corr = x[:, j] @ r_j / n
# Soft-threshold by lam, then rescale by the feature's mean squared value.
w[j] = S(corr, lam) / col_len[j]
print(f"[1] After w2 update:  corr={corr:5.2f},  w2 = {w[j]:5.2f}")

# Summary of the state after the first full pass over both coordinates.
print(f"[1] End of sweep #1 → w = {w.round(2)},  b = {b:5.2f}")
print("=" * 50)

[1] After w2 update:  corr= 1.56,  w2 =  0.28
[1] End of sweep #1 → w = [2.   0.28],  b =  1.67


In [20]:
# ==================================================================
# 2)  SECOND SWEEP — repeat the same three steps to watch the values drift
# ==================================================================
# 2-a. Intercept: shift b by the mean residual under the sweep-1 weights.
resid = y - (x @ w + b)
b = b + np.mean(resid)
print(f"[2] After bias update: b = {b:5.2f}")


[2] After bias update: b =  1.48


In [21]:
j = 0  # 2-b. coordinate being updated: first feature (x1)
# Partial residual with feature j's current contribution removed.
r_j = y - (x @ w - w[j] * x[:, j] + b)
corr = x[:, j] @ r_j / n
# Soft-threshold by lam, then rescale by the feature's mean squared value.
w[j] = S(corr, lam) / col_len[j]
print(f"[2] After w1 update:  corr={corr:5.2f},  w1 = {w[j]:5.2f}")


[2] After w1 update:  corr= 2.06,  w1 =  1.58


In [23]:
j = 1  # 2-c. coordinate being updated: second feature (x2)
# Partial residual with feature j's current contribution removed.
r_j = y - (x @ w - w[j] * x[:, j] + b)
corr = x[:, j] @ r_j / n
# Soft-threshold by lam, then rescale by the feature's mean squared value.
w[j] = S(corr, lam) / col_len[j]
print(f"[2] After w2 update:  corr={corr:5.2f},  w2 = {w[j]:5.2f}")

# Summary of the state after the second full pass.
print(f"[2] End of sweep #2 → w = {w.round(2)},  b = {b:5.2f}")
print("=" * 50)

# ==================================================================
# 3)  Final quick check: predictions and mean residual of the current fit
# ==================================================================
# NOTE: b was last refreshed at the start of sweep 2, before w1 and w2 were
# updated, so the mean residual printed here is not exactly zero after only
# two sweeps (the recorded run shows about -0.18). It is zero only right
# after a bias update, i.e. once the weights have stopped changing.
y_hat = x @ w + b
print("Predictions:", y_hat.round(2))
print("Residual mean (should be ~0):", np.mean(y - y_hat))

[2] After w2 update:  corr= 2.10,  w2 =  0.55
[2] End of sweep #2 → w = [1.58 0.55],  b =  1.48
Predictions: [ 4.16  2.03 -0.65]
Residual mean (should be ~0): -0.1800411522633747


In [None]:
"""
X = np.array([[ 1,  2],
              [ 0,  1],
              [-1, -1]], dtype=float)

w = np.array([2.00, 0.28])   # after our first sweep
j = 0                        # look at feature x₁

X[:, j]        # → array([ 1.,  0., -1.])
w[j]           # → 2.00  (scalar)
X[:, j] * w[j] # → array([ 2.,  0., -2.])
"""

In [None]:
"""
X @ w               # full prediction from all features
- X[:, j] * w[j]    # remove feature j's share
"""

In [None]:
"""
In coordinate-descent Lasso we update one weight at a time.
To know how much this feature should change, we first ‘mute’ its current contribution, look at the leftover error (the partial residual), see how well the feature lines up with that error, then apply soft-thresholding. That single step gives the closed-form optimum for that coordinate.
"""

In [None]:
"""
Analogy you can use
Imagine the model is a choir: each feature is a singer, its weight is their volume.

Mute one singer → see what notes are missing (partial residual).

How well could that singer fill the gap? → correlation.

Charge an entry fee λ → if they can’t sing loud enough to justify the fee, keep them silent (weight = 0).

Repeat for every singer until the song matches the target audience’s ears (low error) with the smallest possible choir (sparse model).
"""

In [None]:
# Ridge: When you want to keep all features (just make them smaller).

# Lasso: When you want some features to drop out entirely (become zero).