# Linear Algebra for AI/ML - Part 3: Norms, Products & Projections

This notebook continues with the dimension of a vector space, norms, and dot, inner, and outer products. Projection matrices are covered in Part 4.

**Prerequisites:** Complete Parts 1 and 2 first.

In [1]:
"""
Setup: Import Required Libraries
"""
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from mpl_toolkits.mplot3d import Axes3D

# Print arrays with four decimals and without scientific notation for small values.
np.set_printoptions(suppress=True, precision=4)
print("NumPy version:", np.__version__)

NumPy version: 2.2.6


## 21. Dimension of Vector Space

The dimension is the number of vectors in any basis of the vector space.

**ML Application:** Feature dimensionality, model capacity, curse of dimensionality.

In [2]:
"""
Dimension of Vector Space

Definition:
The dimension of a vector space V is the number of vectors in any basis of V.

Key Facts:
- All bases of V have the same number of vectors
- dim(ℝⁿ) = n
- dim(span({v₁, ..., vₖ})) ≤ k
- dim(V) = rank of any matrix whose columns span V

ML Applications:
- Number of features in dataset
- Intrinsic dimensionality (manifold learning)
- Effective rank in PCA
"""

print("=" * 60, "Dimension of Vector Spaces", "=" * 60, sep="\n")

# Example 1: the standard real coordinate spaces and their dimensions
print("\nExample 1: Standard Vector Spaces", "-" * 60, sep="\n")

# (label, dimension) pairs; the last entry is symbolic, hence the string "n"
spaces = [
    ("‚Ñù¬π (real line)", 1),
    ("‚Ñù¬≤ (plane)", 2),
    ("‚Ñù¬≥ (3D space)", 3),
    ("‚Ñù‚Åø (n-dimensional)", "n"),
]

for space, dim in spaces:
    print("dim(" + space + ") = " + str(dim))

# Example 2: subspaces of R^3 of dimension 1, 2 and 3
print("\n" + "=" * 60, "Example 2: Dimension of Subspaces", "=" * 60, sep="\n")

# Case 1: one non-zero vector spans a line through the origin
v = np.array([1, 2, 3])
print("\nCase 1: Line through origin in ‚Ñù¬≥")
print("Spanned by v =", v)
print("Subspace: {Œªv : Œª ‚àà ‚Ñù}")
print("Dimension: 1 (need 1 vector to span)")

# Case 2: two independent vectors span a plane through the origin
v1, v2 = np.array([1, 0, 0]), np.array([0, 1, 0])
print("\nCase 2: Plane through origin in ‚Ñù¬≥")
print(f"Spanned by v‚ÇÅ = {v1}, v‚ÇÇ = {v2}")
print("Subspace: {Œª‚ÇÅv‚ÇÅ + Œª‚ÇÇv‚ÇÇ : Œª‚ÇÅ, Œª‚ÇÇ ‚àà ‚Ñù}")
print("Dimension: 2 (need 2 independent vectors)")

# Case 3: the three standard basis vectors span the whole space
v1, v2, v3 = (np.array(e) for e in ([1, 0, 0], [0, 1, 0], [0, 0, 1]))
print("\nCase 3: All of ‚Ñù¬≥")
print("Spanned by standard basis")
print("Dimension: 3 (need 3 independent vectors)")

# Example 3: the dimension of a span equals the rank of the matrix
# whose columns are the spanning vectors
print("\n" + "=" * 60, "Example 3: Computing Dimension via Rank", "=" * 60, sep="\n")

# Four vectors in R^3; v4 is 2 * v1 by construction
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
v3 = np.array([7, 8, 9])
v4 = np.array([2, 4, 6])

# Place the vectors side by side as the columns of A (shape 3 x 4)
A = np.column_stack((v1, v2, v3, v4))
rank = np.linalg.matrix_rank(A)

print("\nVectors (as columns of A):")
print(f"v‚ÇÅ = {v1}")
print(f"v‚ÇÇ = {v2}")
print(f"v‚ÇÉ = {v3}")
print(f"v‚ÇÑ = {v4}")

print("\nMatrix A:")
print(A)

print(f"\nRank of A: {rank}")
print(f"Number of vectors: {A.shape[1]}")
print(f"\nDimension of span{{v‚ÇÅ, v‚ÇÇ, v‚ÇÉ, v‚ÇÑ}} = {rank}")
print("\nInterpretation:")
print(f"‚Üí Only {rank} vectors are linearly independent")
print(f"‚Üí The span is {rank}-dimensional")
print(f"‚Üí Can express as span of {rank} basis vectors")

# Point out one of the dependencies explicitly
print("\nNote: v‚ÇÑ = 2√óv‚ÇÅ (dependent)")
print(f"Verify: 2 √ó {v1} = {2 * v1}")

# Example 4: a subspace can have lower dimension than the space containing it
print("\n" + "=" * 60, "Example 4: Subspace Dimension < Ambient Dimension", "=" * 60, sep="\n")

# Two independent vectors in R^4, stacked as the columns of A (shape 4 x 2)
v1 = np.array([1, 0, 0, 0])
v2 = np.array([0, 1, 0, 0])
A = np.column_stack((v1, v2))

print("\nVectors in ‚Ñù‚Å¥:")
print(f"v‚ÇÅ = {v1}")
print(f"v‚ÇÇ = {v2}")

print("\nAmbient space: ‚Ñù‚Å¥ (dimension 4)")
print("Subspace dimension:", np.linalg.matrix_rank(A))
print("\n‚Üí 2D subspace embedded in 4D space")
print("‚Üí Like a flat sheet in a room")

# ML Application: intrinsic dimensionality of noisy, linearly generated data
print("\n" + "=" * 60, "ML Application: Intrinsic Dimensionality", "=" * 60, sep="\n")

# Fixed seed so the synthetic data (and the printed numbers) are reproducible
np.random.seed(42)
n_samples = 200

# Two independent standard-normal latent factors
z1 = np.random.randn(n_samples)
z2 = np.random.randn(n_samples)

# Five observed features: linear mixes of the two factors plus noise of std 0.1.
# The noise draws stay in list order so the seeded data is unchanged.
X = np.column_stack([
    z1 + 0.1*np.random.randn(n_samples),              # Feature 1 ~ z1
    z2 + 0.1*np.random.randn(n_samples),              # Feature 2 ~ z2
    z1 + z2 + 0.1*np.random.randn(n_samples),         # Feature 3 ~ z1 + z2
    2*z1 - z2 + 0.1*np.random.randn(n_samples),       # Feature 4 ~ 2*z1 - z2
    0.5*z1 + 1.5*z2 + 0.1*np.random.randn(n_samples)  # Feature 5 ~ 0.5*z1 + 1.5*z2
])

print(f"\nObserved data shape: {X.shape}")
print(f"Nominal dimension (# features): {X.shape[1]}")

# Singular values of the data matrix, returned in descending order
U, s, Vt = np.linalg.svd(X, full_matrices=False)

print("\nSingular values:")
print(s)

# Count the significant singular values. The cutoff is RELATIVE to the largest
# singular value: additive noise makes every singular value strictly positive,
# so an absolute cutoff near machine precision would count the noise directions
# too and report the nominal dimension instead of the intrinsic one.
rel_tol = 0.1
threshold = rel_tol * s[0]
effective_rank = int(np.sum(s > threshold))

print(f"\nEffective rank (œÉ > {threshold:.4f}): {effective_rank}")
print("\nInterpretation:")
print(f"‚Üí Data lives in a {effective_rank}-dimensional subspace")
print(f"‚Üí Intrinsic dimension ‚âà {effective_rank} (much less than {X.shape[1]})")
print(f"‚Üí Can reduce from {X.shape[1]}D to {effective_rank}D with minimal loss")

# Cumulative share of squared singular values. X is not mean-centred here;
# its columns are built from zero-mean draws, so this approximates variance.
variance_explained = np.cumsum(s**2) / np.sum(s**2)
print("\nVariance explained by components:")
for i, var in enumerate(variance_explained):
    print(f"  First {i+1} components: {var*100:.2f}%")

print(f"\n‚Üí First 2 components capture {variance_explained[1]*100:.2f}% of variance")
print("‚Üí Confirms true intrinsic dimension is 2")

Dimension of Vector Spaces

Example 1: Standard Vector Spaces
------------------------------------------------------------
dim(‚Ñù¬π (real line)) = 1
dim(‚Ñù¬≤ (plane)) = 2
dim(‚Ñù¬≥ (3D space)) = 3
dim(‚Ñù‚Åø (n-dimensional)) = n

Example 2: Dimension of Subspaces

Case 1: Line through origin in ‚Ñù¬≥
Spanned by v = [1 2 3]
Subspace: {Œªv : Œª ‚àà ‚Ñù}
Dimension: 1 (need 1 vector to span)

Case 2: Plane through origin in ‚Ñù¬≥
Spanned by v‚ÇÅ = [1 0 0], v‚ÇÇ = [0 1 0]
Subspace: {Œª‚ÇÅv‚ÇÅ + Œª‚ÇÇv‚ÇÇ : Œª‚ÇÅ, Œª‚ÇÇ ‚àà ‚Ñù}
Dimension: 2 (need 2 independent vectors)

Case 3: All of ‚Ñù¬≥
Spanned by standard basis
Dimension: 3 (need 3 independent vectors)

Example 3: Computing Dimension via Rank

Vectors (as columns of A):
v‚ÇÅ = [1 2 3]
v‚ÇÇ = [4 5 6]
v‚ÇÉ = [7 8 9]
v‚ÇÑ = [2 4 6]

Matrix A:
[[1 4 7 2]
 [2 5 8 4]
 [3 6 9 6]]

Rank of A: 2
Number of vectors: 4

Dimension of span{v‚ÇÅ, v‚ÇÇ, v‚ÇÉ, v‚ÇÑ} = 2

Interpretation:
‚Üí Only 2 vectors are linearly independent
‚Üí The span is 2-di

## 22. Norms

A norm measures the "size" or "length" of a vector.

**Properties:**
1. Positivity: $\|x\| \ge 0$, with equality only if $x = 0$
2. Scaling: $\|\alpha x\| = |\alpha|\,\|x\|$
3. Triangle inequality: $\|x + y\| \le \|x\| + \|y\|$

**ML Application:** Regularization, distance metrics, gradient clipping, loss functions.

In [3]:
"""
Vector and Matrix Norms

VECTOR NORMS:
- L1 (Manhattan): ||x||₁ = Σ|xᵢ|
- L2 (Euclidean): ||x||₂ = √(Σxᵢ²)
- L∞ (Max): ||x||∞ = max|xᵢ|
- Lp (general): ||x||p = (Σ|xᵢ|ᵖ)^(1/p)

MATRIX NORMS:
- Frobenius: ||A||_F = √(Σᵢⱼ aᵢⱼ²)
- Spectral: ||A||₂ = largest singular value

ML Applications:
- L1: Lasso regression (sparsity)
- L2: Ridge regression (weight decay)
- Frobenius: Matrix regularization
- Distance metrics in clustering
"""

print("=" * 60, "Vector Norms", "=" * 60, sep="\n")

# Example vector used by every norm computation in this section
x = np.array([3, -4, 0, 2])

print(f"\nVector x = {x}")


def _paren_if_negative(value):
    """Format ``value``, wrapping it in parentheses when it is negative.

    Used where a printed term is followed by a power sign, so a negative
    entry is shown as "(-4)^2" rather than the ambiguous "-4^2".
    """
    return f"({value})" if value < 0 else f"{value}"


# L1 norm (Manhattan / taxicab): sum of absolute values
print("\n" + "-" * 60, "L1 Norm (Manhattan Distance)", "-" * 60, sep="\n")

l1_manual = np.sum(np.abs(x))
l1_numpy = np.linalg.norm(x, ord=1)

print("\nFormula: ||x||‚ÇÅ = |x‚ÇÅ| + |x‚ÇÇ| + |x‚ÇÉ| + |x‚ÇÑ|")
print(f"       = |{x[0]}| + |{x[1]}| + |{x[2]}| + |{x[3]}|")
print(f"       = {np.abs(x[0])} + {np.abs(x[1])} + {np.abs(x[2])} + {np.abs(x[3])}")
print(f"       = {l1_manual}")

print(f"\nNumPy: {l1_numpy}")

print("\nInterpretation:")
print("‚Üí Sum of absolute values")
print("‚Üí Distance in a grid (Manhattan blocks)")
print("‚Üí ML: Lasso regularization encourages sparsity")

# L2 norm (Euclidean): square root of the sum of squares
print("\n" + "-" * 60, "L2 Norm (Euclidean Distance)", "-" * 60, sep="\n")

l2_manual = np.sqrt(np.sum(x**2))
l2_numpy = np.linalg.norm(x, ord=2)
l2_default = np.linalg.norm(x)  # with no ord, a 1-D input gets the 2-norm

# Expansion "x1^2 + x2^2 + ..." with negative entries parenthesised,
# so the step reads (-4)^2 = 16 and not -4^2 = -16.
l2_squared_terms = " + ".join(f"{_paren_if_negative(xi)}¬≤" for xi in x)

print("\nFormula: ||x||‚ÇÇ = ‚àö(x‚ÇÅ¬≤ + x‚ÇÇ¬≤ + x‚ÇÉ¬≤ + x‚ÇÑ¬≤)")
print(f"       = ‚àö({l2_squared_terms})")
print(f"       = ‚àö({x[0]**2} + {x[1]**2} + {x[2]**2} + {x[3]**2})")
print(f"       = ‚àö{np.sum(x**2)}")
print(f"       = {l2_manual:.4f}")

print(f"\nNumPy: {l2_numpy:.4f}")

print("\nInterpretation:")
print("‚Üí Straight-line distance")
print("‚Üí Most common norm")
print("‚Üí ML: Ridge regularization, MSE loss")

# L-infinity norm (max / Chebyshev): largest absolute entry
print("\n" + "-" * 60, "L‚àû Norm (Maximum/Chebyshev)", "-" * 60, sep="\n")

linf_manual = np.max(np.abs(x))
linf_numpy = np.linalg.norm(x, ord=np.inf)

print("\nFormula: ||x||‚àû = max(|x‚ÇÅ|, |x‚ÇÇ|, |x‚ÇÉ|, |x‚ÇÑ|)")
print(f"       = max(|{x[0]}|, |{x[1]}|, |{x[2]}|, |{x[3]}|)")
print(f"       = max({np.abs(x[0])}, {np.abs(x[1])}, {np.abs(x[2])}, {np.abs(x[3])})")
print(f"       = {linf_manual}")

print(f"\nNumPy: {linf_numpy}")

print("\nInterpretation:")
print("‚Üí Maximum absolute value")
print("‚Üí Largest single coordinate")
print("‚Üí ML: Adversarial robustness, gradient clipping")

# Side-by-side comparison of the three norms of x
print("\n" + "=" * 60, "Comparison of Norms", "=" * 60, sep="\n")

print(f"\nFor x = {x}:")
print(f"  ||x||‚ÇÅ  = {l1_numpy:.4f}")
print(f"  ||x||‚ÇÇ  = {l2_numpy:.4f}")
print(f"  ||x||‚àû  = {linf_numpy:.4f}")

print("\nGeneral relationship: ||x||‚àû ‚â§ ||x||‚ÇÇ ‚â§ ||x||‚ÇÅ")
print(f"Verify: {linf_numpy:.4f} ‚â§ {l2_numpy:.4f} ‚â§ {l1_numpy:.4f} ‚úì")

# Norm properties, demonstrated with the L2 norm
print("\n" + "=" * 60, "Verifying Norm Properties", "=" * 60, sep="\n")

# Second vector and a scalar for the checks; x is defined earlier in this cell
y = np.array([1, -2, 3, -1])
alpha = 2.5

print(f"\nVectors: x = {x}, y = {y}")
print(f"Scalar: Œ± = {alpha}")

# Property 1: the norm is non-negative and is zero for the zero vector
norm_x = np.linalg.norm(x)
zero = np.zeros(4)
norm_zero = np.linalg.norm(zero)

print("\n1. Positivity: ||x|| ‚â• 0")
print(f"   ||x|| = {norm_x:.4f} ‚â• 0 ‚úì")
print(f"   ||0|| = {norm_zero:.4f} = 0 ‚úì")

# Property 2: scaling the vector scales the norm by |alpha|
norm_alpha_x = np.linalg.norm(alpha * x)
scaled_norm = np.abs(alpha) * np.linalg.norm(x)

print("\n2. Scaling: ||Œ±x|| = |Œ±| ¬∑ ||x||")
print(f"   ||{alpha}x|| = {norm_alpha_x:.4f}")
print(f"   |{alpha}| ¬∑ ||x|| = {scaled_norm:.4f}")
print(f"   Equal? {np.isclose(norm_alpha_x, scaled_norm)} ‚úì")

# Property 3: the norm of a sum never exceeds the sum of the norms
norm_sum = np.linalg.norm(x + y)
sum_norms = np.linalg.norm(x) + np.linalg.norm(y)

print("\n3. Triangle Inequality: ||x + y|| ‚â§ ||x|| + ||y||")
print(f"   ||x + y|| = {norm_sum:.4f}")
print(f"   ||x|| + ||y|| = {sum_norms:.4f}")
print(f"   {norm_sum:.4f} ‚â§ {sum_norms:.4f}? {norm_sum <= sum_norms} ‚úì")

# Matrix norms: Frobenius and spectral
print("\n" + "=" * 60, "Matrix Norms", "=" * 60, sep="\n")

A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

print("\nMatrix A:")
print(A)

# Frobenius norm: square root of the sum of all squared entries
print("\nFrobenius Norm (like L2 for matrices):", "-" * 60, sep="\n")

frob_manual = np.sqrt(np.sum(A**2))
frob_numpy = np.linalg.norm(A, ord="fro")

print("\nFormula: ||A||_F = ‚àö(Œ£·µ¢‚±º a·µ¢‚±º¬≤)")
print(f"       = ‚àö({np.sum(A**2)})")
print(f"       = {frob_manual:.4f}")

print(f"\nNumPy: {frob_numpy:.4f}")

print("\nInterpretation:")
print("‚Üí Square root of sum of all squared elements")
print("‚Üí ML: Matrix regularization in neural networks")

# Spectral norm: compared against the first (largest) singular value from the SVD
print("\nSpectral Norm (largest singular value):", "-" * 60, sep="\n")

spectral = np.linalg.norm(A, ord=2)
U, s, Vt = np.linalg.svd(A)
largest_sv = s[0]

print(f"\nSpectral norm: {spectral:.4f}")
print(f"Largest singular value: {largest_sv:.4f}")
print(f"Match? {np.isclose(spectral, largest_sv)} ‚úì")

print("\nInterpretation:")
print("‚Üí Maximum 'stretching' of the matrix")
print("‚Üí ML: Lipschitz constraints, GAN training")

# ML Application: L1 and L2 penalties on a dense and a sparse weight vector
print("\n" + "=" * 60, "ML Application: L1 vs L2 Regularization", "=" * 60, sep="\n")

# Two example weight vectors: every entry non-zero vs. only two non-zero entries
weights_dense = np.array([0.5, 0.3, 0.4, 0.2, 0.6])
weights_sparse = np.array([0.8, 0.0, 0.0, 0.0, 0.9])

print(f"\nDense weights: {weights_dense}")
print(f"Sparse weights: {weights_sparse}")

# L1 penalty (Lasso): sum of absolute weights
l1_dense = np.linalg.norm(weights_dense, ord=1)
l1_sparse = np.linalg.norm(weights_sparse, ord=1)

print("\nL1 Penalty (Lasso):")
print(f"  Dense:  ||w||‚ÇÅ = {l1_dense:.4f}")
print(f"  Sparse: ||w||‚ÇÅ = {l1_sparse:.4f}")
print("  ‚Üí L1 penalty similar for both")

# L2 penalty (Ridge): Euclidean length of the weight vector
l2_dense = np.linalg.norm(weights_dense, ord=2)
l2_sparse = np.linalg.norm(weights_sparse, ord=2)

print("\nL2 Penalty (Ridge):")
print(f"  Dense:  ||w||‚ÇÇ = {l2_dense:.4f}")
print(f"  Sparse: ||w||‚ÇÇ = {l2_sparse:.4f}")
print("  ‚Üí L2 penalty LARGER for sparse (penalizes large values)")

print("\nKey Insight:")
print("‚Üí L1 promotes sparsity (many zeros)")
print("‚Üí L2 promotes small weights (spread out)")
print("‚Üí L1: Feature selection")
print("‚Üí L2: Weight decay")

Vector Norms

Vector x = [ 3 -4  0  2]

------------------------------------------------------------
L1 Norm (Manhattan Distance)
------------------------------------------------------------

Formula: ||x||‚ÇÅ = |x‚ÇÅ| + |x‚ÇÇ| + |x‚ÇÉ| + |x‚ÇÑ|
       = |3| + |-4| + |0| + |2|
       = 3 + 4 + 0 + 2
       = 9

NumPy: 9.0

Interpretation:
‚Üí Sum of absolute values
‚Üí Distance in a grid (Manhattan blocks)
‚Üí ML: Lasso regularization encourages sparsity

------------------------------------------------------------
L2 Norm (Euclidean Distance)
------------------------------------------------------------

Formula: ||x||‚ÇÇ = ‚àö(x‚ÇÅ¬≤ + x‚ÇÇ¬≤ + x‚ÇÉ¬≤ + x‚ÇÑ¬≤)
       = ‚àö(3¬≤ + -4¬≤ + 0¬≤ + 2¬≤)
       = ‚àö(9 + 16 + 0 + 4)
       = ‚àö29
       = 5.3852

NumPy: 5.3852

Interpretation:
‚Üí Straight-line distance
‚Üí Most common norm
‚Üí ML: Ridge regularization, MSE loss

------------------------------------------------------------
L‚àû Norm (Maximum/Chebyshev)
---------------------

## 23. Inner, Outer, and Dot Products

Different ways to combine vectors.

- **Dot Product:** Scalar result measuring alignment
- **Inner Product:** Generalized dot product (can have weights)
- **Outer Product:** Matrix result (all pairwise products)

**ML Application:** Attention mechanisms, similarity metrics, neural network layers.

In [None]:
"""
Inner, Outer, and Dot Products

DOT PRODUCT (standard inner product):
- u · v = Σ uᵢvᵢ
- Result: scalar
- Measures: alignment/similarity

INNER PRODUCT (weighted):
- ⟨u, v⟩_M = uᵀMv
- M: positive definite matrix
- Generalizes dot product

OUTER PRODUCT:
- u ⊗ v = uvᵀ
- Result: matrix
- Each element: uᵢvⱼ

ML Applications:
- Dot: cosine similarity, attention scores
- Inner: Mahalanobis distance, kernel methods
- Outer: rank-1 updates, covariance matrices
"""

print("=" * 60, "Dot Product", "=" * 60, sep="\n")

u = np.array([1, 2, 3])
v = np.array([4, 5, 6])

print(f"\nu = {u}")
print(f"v = {v}")

# Three equivalent ways to obtain the dot product
dot_manual = sum(ui * vi for ui, vi in zip(u, v))  # term-by-term sum
dot_numpy = np.dot(u, v)
dot_at = u @ v  # matrix-multiplication operator

print("\nFormula: u ¬∑ v = u‚ÇÅv‚ÇÅ + u‚ÇÇv‚ÇÇ + u‚ÇÉv‚ÇÉ")
print("       = " + " + ".join(f"{ui}√ó{vi}" for ui, vi in zip(u, v)))
print("       = " + " + ".join(str(ui * vi) for ui, vi in zip(u, v)))
print(f"       = {dot_manual}")

print(f"\nNumPy (np.dot): {dot_numpy}")
print(f"NumPy (@ operator): {dot_at}")

# Geometric interpretation: u . v = ||u|| ||v|| cos(theta)
print("\n" + "-" * 60, "Geometric Interpretation", "-" * 60, sep="\n")

norm_u = np.linalg.norm(u)
norm_v = np.linalg.norm(v)
cos_theta = dot_numpy / (norm_u * norm_v)
theta_rad = np.arccos(cos_theta)
theta_deg = np.degrees(theta_rad)

print("\nFormula: u ¬∑ v = ||u|| ||v|| cos(Œ∏)")
print(f"\n||u|| = {norm_u:.4f}")
print(f"||v|| = {norm_v:.4f}")
print("\ncos(Œ∏) = (u ¬∑ v) / (||u|| ||v||)")
print(f"       = {dot_numpy} / ({norm_u:.4f} √ó {norm_v:.4f})")
print(f"       = {cos_theta:.4f}")

print(f"\nAngle Œ∏ = {theta_deg:.2f}¬∞")

print("\nInterpretation:")
print("‚Üí Measures how much vectors point in same direction")
print("‚Üí Positive: vectors point similar direction")
print("‚Üí Zero: vectors are perpendicular")
print("‚Üí Negative: vectors point opposite directions")

# Special cases: orthogonal, parallel and anti-parallel vectors
print("\n" + "=" * 60, "Special Cases", "=" * 60, sep="\n")

# Case 1: the two standard basis vectors of the plane
a = np.array([1, 0])
b = np.array([0, 1])
dot_ab = a @ b

print("\nCase 1: Orthogonal Vectors (perpendicular)")
print(f"a = {a}")
print(f"b = {b}")
print(f"a ¬∑ b = {dot_ab}")
print("‚Üí Dot product = 0 ‚Üí vectors are ORTHOGONAL ‚úì")

# Case 2: b is a positive multiple of a
a = np.array([1, 2, 3])
b = 2 * a
dot_ab = a @ b
norm_prod = np.linalg.norm(a) * np.linalg.norm(b)

print("\nCase 2: Parallel Vectors (same direction)")
print(f"a = {a}")
print(f"b = {b} = 2a")
print(f"a ¬∑ b = {dot_ab:.4f}")
print(f"||a|| ||b|| = {norm_prod:.4f}")
print("‚Üí a ¬∑ b = ||a|| ||b|| ‚Üí cos(Œ∏) = 1 ‚Üí Œ∏ = 0¬∞ ‚úì")

# Case 3: b is the negation of a
a = np.array([1, 2, 3])
b = -a
dot_ab = a @ b
norm_prod = np.linalg.norm(a) * np.linalg.norm(b)

print("\nCase 3: Anti-parallel Vectors (opposite direction)")
print(f"a = {a}")
print(f"b = {b} = -a")
print(f"a ¬∑ b = {dot_ab:.4f}")
print(f"-||a|| ||b|| = {-norm_prod:.4f}")
print("‚Üí a ¬∑ b = -||a|| ||b|| ‚Üí cos(Œ∏) = -1 ‚Üí Œ∏ = 180¬∞ ‚úì")

# Outer product: a 3-vector and a 2-vector give a 3 x 2 matrix
print("\n" + "=" * 60, "Outer Product", "=" * 60, sep="\n")

u = np.array([1, 2, 3])
v = np.array([4, 5])
outer = np.outer(u, v)

print(f"\nu = {u} (shape: {u.shape})")
print(f"v = {v} (shape: {v.shape})")

print("\nOuter product u ‚äó v:")
print(outer)
print(f"Shape: {outer.shape}")

# Symbolic layout, then the same layout with the numbers substituted
print("\nElement-wise breakdown:")
print("u ‚äó v = [[u‚ÇÅv‚ÇÅ, u‚ÇÅv‚ÇÇ],")
print("         [u‚ÇÇv‚ÇÅ, u‚ÇÇv‚ÇÇ],")
print("         [u‚ÇÉv‚ÇÅ, u‚ÇÉv‚ÇÇ]]")
print()
print(f"      = [[{u[0]}√ó{v[0]}, {u[0]}√ó{v[1]}],")
print(f"         [{u[1]}√ó{v[0]}, {u[1]}√ó{v[1]}],")
print(f"         [{u[2]}√ó{v[0]}, {u[2]}√ó{v[1]}]]")
print()
print(f"      = {outer}")

print("\nInterpretation:")
print("‚Üí Creates a matrix from two vectors")
print("‚Üí Each element is u·µ¢v‚±º")
print("‚Üí Rank-1 matrix (all rows are multiples of each other)")

# Numerical confirmation of the rank
rank = np.linalg.matrix_rank(outer)
print(f"\nRank of outer product: {rank}")
print("‚Üí Always rank-1 (except for zero vectors) ‚úì")

# Weighted inner product <u, v>_M = u^T M v with a diagonal weight matrix
print("\n" + "=" * 60, "Inner Product (Weighted)", "=" * 60, sep="\n")

u = np.array([1, 2, 3])
v = np.array([4, 5, 6])

# Diagonal matrix with positive entries, so it is positive definite
M = np.array([[2, 0, 0], [0, 3, 0], [0, 0, 1]])

print(f"\nu = {u}")
print(f"v = {v}")
print("\nWeight matrix M:")
print(M)

# Unweighted dot product for reference, then the M-weighted version
standard_dot = np.dot(u, v)
weighted_inner = u @ M @ v
Mv = M @ v  # intermediate product, shown in the step-by-step printout

print(f"\nStandard dot product: u ¬∑ v = {standard_dot}")
print(f"Weighted inner product: ‚ü®u, v‚ü©_M = {weighted_inner}")

print("\nComputation:")
print("‚ü®u, v‚ü©_M = u·µÄMv")
print("         = u ¬∑ (Mv)")
print(f"         = {u} ¬∑ {Mv}")
print(f"         = {weighted_inner}")

print("\nInterpretation:")
print("‚Üí M gives different weights to different dimensions")
print("‚Üí When M = I, reduces to standard dot product")
print("‚Üí ML: Mahalanobis distance uses covariance as M")

# ML Application: cosine similarity between word-count vectors
print("\n" + "=" * 60, "ML Application: Cosine Similarity", "=" * 60, sep="\n")

# One word-count vector per document:
# doc2 is labelled similar to doc1, doc3 is labelled different
doc1, doc2, doc3 = (
    np.array(counts)
    for counts in ([2, 3, 1, 0], [1, 2, 2, 1], [0, 0, 3, 4])
)

def cosine_similarity(a, b):
    """Compute the cosine similarity cos(theta) = (a . b) / (||a|| ||b||).

    Args:
        a: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. If either vector has
        zero norm the angle is undefined; 0.0 is returned in that case instead
        of the NaN (and RuntimeWarning) that dividing 0 by 0 would produce.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # No direction to compare against: treat as "no similarity"
        return 0.0
    return np.dot(a, b) / norm_product

# Pairwise similarities between the three documents defined earlier in this cell
sim_12, sim_13, sim_23 = (
    cosine_similarity(doc1, doc2),
    cosine_similarity(doc1, doc3),
    cosine_similarity(doc2, doc3),
)

print("\nDocument vectors (word counts):")
print(f"Doc 1: {doc1}")
print(f"Doc 2: {doc2}")
print(f"Doc 3: {doc3}")

print("\nCosine Similarities:")
print(f"Doc 1 vs Doc 2: {sim_12:.4f}")
print(f"Doc 1 vs Doc 3: {sim_13:.4f}")
print(f"Doc 2 vs Doc 3: {sim_23:.4f}")

print("\nInterpretation:")
print("‚Üí Values range from -1 to 1")
print("‚Üí 1: identical direction (very similar)")
print("‚Üí 0: orthogonal (unrelated)")
print("‚Üí -1: opposite direction (very different)")

print(f"\n‚Üí Doc 1 and Doc 2 are more similar ({sim_12:.4f})")
print(f"‚Üí Doc 1 and Doc 3 are less similar ({sim_13:.4f})")

# ML Application: dot-product attention scores for one query and two keys
print("\n" + "=" * 60, "ML Application: Attention Scores (Simplified)", "=" * 60, sep="\n")

# One query vector and two candidate keys
query = np.array([1.0, 0.5, 0.2])
key1 = np.array([0.9, 0.6, 0.1])  # labelled similar to the query
key2 = np.array([0.1, 0.2, 0.9])  # labelled different from the query

print(f"\nQuery: {query}")
print(f"Key 1: {key1} (similar to query)")
print(f"Key 2: {key2} (different from query)")

# Raw score of each key is its dot product with the query
score1 = np.dot(query, key1)
score2 = np.dot(query, key2)

print("\nAttention scores (before softmax):")
print(f"Score 1 (Query ¬∑ Key1): {score1:.4f}")
print(f"Score 2 (Query ¬∑ Key2): {score2:.4f}")

# Softmax: exponentiate the scores and normalise them to sum to one
scores = np.array([score1, score2])
attention_weights = np.exp(scores) / np.exp(scores).sum()

print("\nAttention weights (after softmax):")
print(f"Weight 1: {attention_weights[0]:.4f}")
print(f"Weight 2: {attention_weights[1]:.4f}")
print(f"Sum: {np.sum(attention_weights):.4f}")

print("\nInterpretation:")
print("‚Üí Higher dot product = higher attention")
print(f"‚Üí Query attends more to Key1 ({attention_weights[0]*100:.1f}%)")
print("‚Üí Used in Transformers, BERT, GPT, etc.")

# 📊 Summary Table: Linear Algebra Concepts (Part 3)

| Concept | Definition | Key Property | ML Application |
|---------|------------|--------------|----------------|
| **Dimension** | Number of vectors in any basis of a space | All bases have same size; dimension = rank | Feature dimensionality, model capacity, intrinsic data dimension |
| **Norms** | Measures "size" or "length" of vectors/matrices | L₁ (Manhattan), L₂ (Euclidean), L∞ (Max), Frobenius (matrix) | Regularization (Lasso/Ridge), distance metrics, gradient clipping |
| **Dot Product** | Scalar: u·v = Σuᵢvᵢ = \|\|u\|\| \|\|v\|\| cos(θ) | Measures alignment (cosine similarity) | Attention mechanisms, similarity scores, neural activations |
| **Outer Product** | Matrix: u⊗v = uvᵀ (each element uᵢvⱼ) | Always rank-1 (except zero vectors) | Rank-1 updates, covariance estimation |
| **Inner Product** | Generalized: ⟨u,v⟩_M = uᵀMv (M positive definite) | Reduces to dot product when M=I | Mahalanobis distance, kernel methods |

## Part 3 Progress

**Completed:**
- Dimension of Vector Space (21)
- Norms (22) - L1, L2, L∞, Frobenius, Spectral
- Inner, Outer, Dot Products (23)

**Next Part 4 will cover:**
- Projection Matrices
- Orthogonality & Orthonormal Basis
- Eigenvalues & Eigenvectors
