In [1]:
import numpy as np

In [35]:
# Vector
v = np.array([1, 2, 3])

# Matrices: B is A with every entry doubled
A = np.array([
    [1, 2, 3],
    [2, 3, 5],
    [2, 3, 7],
])
B = A * 2
print("Addition", A + B)

Addition [[ 3  6  9]
 [ 6  9 15]
 [ 6  9 21]]


In [36]:
# Matrix product of A and B (rows of A combined with columns of B)
dot_product = A @ B
dot_product

array([[ 22,  34,  68],
       [ 36,  56, 112],
       [ 44,  68, 140]])

In [37]:
# 5x5 identity matrix: ones on the main diagonal, zeros elsewhere
I = np.eye(5, dtype=float)
I

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [38]:
# 5x3 matrix filled with zeros
Z = np.zeros(shape=(5, 3))
Z

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [39]:
# Square matrix with 1, 2, 3 on the main diagonal and zeros elsewhere
D = np.diag([1, 2, 3], k=0)
D

array([[1, 0, 0],
       [0, 2, 0],
       [0, 0, 3]])

In [40]:
# Determinant of A, returned as a floating-point scalar
determinant = np.linalg.det(A)
determinant

np.float64(-2.0)

In [41]:
# Inverse of A (it exists because det(A) computed above is non-zero)
inverse = np.linalg.inv(A)
inverse

array([[-3. ,  2.5, -0.5],
       [ 2. , -0.5, -0.5],
       [ 0. , -0.5,  0.5]])

In [42]:
# If det(A) = 0, A is singular and therefore not invertible

In [43]:
# Sanity check: multiplying the inverse by A should give the identity matrix
np.dot(inverse, A)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### eigenvalues and eigenvectors

If A.v = lambda.v for a non-zero vector v, then

**v** is an **eigenvector** and **lambda** is the **eigenvalue**

#### geometric interpretation
- Eigenvectors point in the direction where the matrix transformation stretches or compresses vectors
- Eigenvalues indicate the factor of stretching or compression
- Eigenvalues are found by solving the characteristic equation det(A - lambda*I) = 0

In [44]:
# Eigen-decomposition of A; the results are stored but not displayed in this cell
eigenvalues, eigenvectors = np.linalg.eig(A)


### Matrix Decomposition
Breaking a matrix into simpler components to analyze or solve problems

#### Singular Value Decomposition (svd)
**A=U.S.V^T**
- **U**: Left singular vectors
- **S**: Diagonal matrix of singular values (non-negative)
- **V^T**: Right singular vectors

In [46]:
# Decompose A into its SVD factors; only U is printed here (S and Vt are printed in the next cells)
U, S, Vt = np.linalg.svd(A)
print(U)

[[-0.34933571 -0.45862471 -0.81708502]
 [-0.57732763 -0.58148154  0.57321203]
 [-0.73800906  0.67196919 -0.06164445]]


In [47]:
# Singular values of A; S is a 1-D array, not a diagonal matrix
print(S)

[10.64313952  0.81911642  0.2294112 ]


In [48]:
# Vt, the third factor returned by np.linalg.svd(A)
print(Vt)

[[-0.27999342 -0.43640145 -0.85507746]
 [-0.3389621  -0.78839402  0.51336104]
 [ 0.89816946 -0.43357657 -0.07282159]]


In [51]:
# Rebuild A from its SVD factors.
# S holds the singular values as a 1-D array, so it has to be expanded into a
# diagonal matrix before it can be multiplied with U and Vt.

# Option 1: write the singular values onto the diagonal of a zero matrix
Sigma = np.zeros((3, 3))
np.fill_diagonal(Sigma, S)

# Option 2: build the diagonal matrix directly (this one is used below)
diag = np.diag(S)

# Same left-to-right evaluation order as U @ diag @ Vt
reconstructed = np.matmul(np.matmul(U, diag), Vt)
reconstructed


array([[1., 2., 3.],
       [2., 3., 5.],
       [2., 3., 7.]])

### Derivatives

In [52]:
import sympy as sp

In [56]:
# Symbolic variable and the function f(x) = x^2
x = sp.symbols("x")
f = x**2

# First derivative of f with respect to x
derivative = f.diff(x)
derivative

2*x

### Partial derivatives

**Gradient:** Vector of all partial derivatives, indicating the direction of the steepest ascent

In [57]:
# Two symbolic variables and the function f(x, y) = x^2 + y^2
x, y = sp.symbols("x y")
f = x**2 + y**2

# Partial derivative of f with respect to each variable (the gradient components)
grad_x, grad_y = (sp.diff(f, var) for var in (x, y))

print(grad_x)
print(grad_y)

2*x
2*y


#### Gradient Descent Optimization Algorithm
- Iterative optimization algorithm used to minimize a function
- Updates parameters in the direction of the negative gradient to find the minimum

##### Update Rule: theta = theta - alpha*gradf(theta)
- theta: parameters of the model
- alpha: Learning rate (step size)

In [109]:
# gradient descent function
def gradient_descent(X, y, theta, learning_rate, iteration):
    """Minimise the mean squared error of a linear model by batch gradient descent.

    Every step applies the update rule
    ``theta = theta - learning_rate * gradient`` where
    ``gradient = (1/m) * X.T @ (X @ theta - y)`` and ``m = len(y)``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``np.dot(X, theta)`` must be compatible with ``y``.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    theta : array-like
        Initial parameter values. This argument is never modified.
    learning_rate : float
        Step size (alpha) applied to the gradient.
    iteration : int
        Number of update steps to perform.

    Returns
    -------
    numpy.ndarray
        A new array holding the parameters after ``iteration`` updates.
    """
    m = len(y)

    # Work on a private copy: the in-place update below would otherwise
    # overwrite the caller's initial theta (np.array copies by default).
    theta = np.array(theta)
    # An integer array cannot receive float updates in place, so promote it.
    if not np.issubdtype(theta.dtype, np.inexact):
        theta = theta.astype(float)

    for _ in range(iteration):
        predictions = np.dot(X, theta)
        errors = predictions - y
        gradients = (1/m) * np.dot(X.T, errors)
        theta -= learning_rate * gradients
    return theta

In [110]:
# Toy data set: the first column of X is all ones, the second holds the feature
X = np.array([
    [1, 1],
    [1, 2],
    [1, 3],
])
y = np.array([2, 2.5, 3])

# Starting parameters and hyper-parameters
theta = np.array([0.1, 0.1])
learning_rate = 0.1
iteration = 1000

# Run the algorithm and show the fitted parameters
optimized_theta = gradient_descent(
    X, y, theta, learning_rate=learning_rate, iteration=iteration
)
optimized_theta

array([1.49999426, 0.50000253])

### Integrals

- Compute the area under a curve, representing accumulation

In [111]:
# Area under f(x) = x^2 between x = 0 and x = 2
f = x**2
definite_integral = f.integrate((x, 0, 2))
definite_integral

8/3

In [112]:
# Antiderivative of f with respect to x (no integration bounds)
indefinite_integral = f.integrate(x)
indefinite_integral

x**3/3

### Optimization Concepts
- Local Minima, Global Minima

#### Convex Func
- f(lambda*x_1 + (1-lambda)x_2) <= lambda*f(x_1) + (1 - lambda)f(x_2) for all lambda element of [0,1]
- Ensures that any local minimum is also a global minimum

#### Non-Convex Functions
- Most neural network loss functions

### Stochastic Gradient Descent (SGD) and Its Variants
- It is an optimization algorithm that uses random subsets of the data to compute gradients and update parameters

##### Why?
- Faster convergence for large data sets compared to batch gradient descent

##### Variants of SGD
- Mini Batch SGD, updates parameters using small batches instead of single examples
- Momentum, adds a fraction of the previous update to current update to accelerate the convergence
- Adam Optimizer, combines momentum with adaptive learning rates for faster convergence