In [9]:
import time
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn

In [2]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

mps


In [3]:
randint = torch.randint(-100, 100, (6,))
randint

tensor([ 51,   7,  -8,  78, -13, -12])

In [4]:
tensor = torch.tensor([[0.1, 1.2], [2.2, 3], [4.9, 5.2]])
tensor

tensor([[0.1000, 1.2000],
        [2.2000, 3.0000],
        [4.9000, 5.2000]])

In [5]:
zeros = torch.zeros(2,3)
zeros

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [6]:
ones = torch.ones(3,4)
ones

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [7]:
input = torch.empty(2, 3)
input

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [8]:
arange = torch.arange(5)
arange

tensor([0, 1, 2, 3, 4])

In [9]:
linspace = torch.linspace(3, 10, steps = 5)
linspace

tensor([ 3.0000,  4.7500,  6.5000,  8.2500, 10.0000])

In [10]:
logspace = torch.logspace(start = -10, end = 10, steps = 5)
logspace

tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])

In [11]:
eye = torch.eye(5)  #Identity matrix
eye

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

In [12]:
a = torch.empty((2,3), dtype=torch.int64)
empty_like = torch.empty_like(a)
empty_like

tensor([[0, 0, 0],
        [0, 0, 0]])

## CPU vs GPU in PyTorch

In [13]:
start_time = time.time()
    #Matrix Operations
zeros = torch.zeros(1,1)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.4f}")

0.0001


In [15]:
torch_rand1 = torch.rand(100, 100, 100, 100).to(device)
torch_rand2 = torch.rand(100, 100, 100, 100).to(device)
np_rand1 = torch.rand(100, 100, 100, 100)
np_rand2 = torch.rand(100, 100, 100, 100)

start_time = time.time()

rand = (torch_rand1 @ torch_rand2)

end_time = time.time()

gpu_elapse = end_time - start_time
print(f"{gpu_elapse:.8f}")

start_time = time.time()

rand = np.multiply(np_rand1, np_rand2)

end_time = time.time()

cpu_elapse = end_time - start_time
print(f"{cpu_elapse:.8f}")


0.15798974
0.08695292


  rand = np.multiply(np_rand1, np_rand2)


### Some interesting PyTorch Functions

In [19]:
#Probability tensor
probabilities = torch.tensor([0.1, 0.9])
# 10% or 0.1 corresponds to probability of getting 0
# 90% or 0.9 corresponds to probability of getting 1

samples = torch.multinomial(probabilities, num_samples = 10, replacement = True)
print(samples)

tensor([1, 1, 1, 1, 0, 1, 1, 1, 1, 1])


In [21]:
tensor = torch.tensor([1, 2, 3, 4])
out = torch.cat((tensor, torch.tensor([1, 2])), dim = 0)
out

tensor([1, 2, 3, 4, 1, 2])

In [22]:
# Lower triangular Matrix
out = torch.tril(torch.ones(5,5))
out

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [24]:
# Upper Triangular Matrix
out = torch.triu(torch.ones(5,5))
out

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [25]:
out = torch.zeros(5, 5).masked_fill(torch.tril(torch.ones(5,5)) == 0, float('-inf'))
out

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

Exponential of the above matrix gives us lower triangular matrix

In [26]:
torch.exp(out)

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [32]:
input = torch.zeros(1, 2, 3)
out = input.transpose(0, 2)
out.shape

torch.Size([3, 2, 1])

In [30]:
input = torch.zeros(2, 3, 4, 5, 6)
out = input.transpose(1, 3)
out.shape

torch.Size([2, 5, 4, 3, 6])

In [33]:
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
tensor3 = torch.tensor([7, 8, 9])

#Stacking tensors along a new dimension
stacked_tensor = torch.stack([tensor1, tensor2, tensor3])
stacked_tensor

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [35]:
print(tensor1.shape)
print(stacked_tensor.shape)

torch.Size([3])
torch.Size([3, 3])


In [36]:
sample = torch.tensor([10., 10., 10.])
linear = nn.Linear(3, 3, bias = False)
print(linear(sample))

tensor([-9.0176,  5.7571, -9.1507], grad_fn=<SqueezeBackward4>)


**Documentation:** nn.Linear applies an affine linear transformation to the incoming data:  
\\[
y = xA^T + b
\\]

In [37]:
import torch.nn.functional as F

#Create a tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])

# Applying Softmax function
softmax_output = F.softmax(tensor1, dim = 0)

print(softmax_output)

tensor([0.0900, 0.2447, 0.6652])


# 🌐 Embedding Vectors

**Embedding vectors** are **dense, trainable representations** of categorical variables. They are primarily used to convert **discrete inputs**—such as words, item IDs, or user IDs—into **continuous vectors** that neural networks can process efficiently.

In **Natural Language Processing** (NLP), each word in a vocabulary is mapped to a fixed-size vector of real numbers. These vectors are stored in an embedding matrix, and each row corresponds to the vector for a specific token (like a word or character). The values in this matrix are learned during training, allowing the model to capture semantic relationships between inputs.

---

## 🧠 Why Use Embeddings?

- Categorical data like words or product IDs can't be directly processed by neural networks.
- One-hot encoding is inefficient and high-dimensional.
- Embeddings reduce dimensionality and **capture relationships** between categories (e.g., similar words have similar embeddings).

---

Rule of Thumb: ```embedding = nn.Embedding(vocab_size, embedding_dim)```. embedding_dim is a hyperparameter.

## 🔧 PyTorch Example

You can create and use embeddings in PyTorch using `nn.Embedding`. Here's a simple example:

```python
import torch
import torch.nn as nn

# Create an embedding layer
embedding = nn.Embedding(num_embeddings=10, embedding_dim=5)

# Input tensor with categorical indices
input = torch.tensor([1, 2, 3])

# Get the embedding vectors
output = embedding(input)
print(output.shape)  # Output shape: (3, 5)




In [3]:
#Initialize an embedding layer
vocab_size = 1000
embedding_dim = 100
embedding = nn.Embedding(vocab_size, embedding_dim)

#Create some input indices
input_indices = torch.LongTensor([1, 5, 3, 2])

#Applying the embedding layer
embedded_output = embedding(input_indices)



The output will be a tensor of shape (4, 100) where 4 is the number of inputs and 100 is the dimensionality of the input vectors.

In [4]:
print(embedded_output.shape)

torch.Size([4, 100])


# Dot Product and Matrix Multiplication

In [5]:
def DotProd(A, B):
    res = 0
    for i in range(len(A)):
        res += A[i] * B[i]
    return res

In [6]:
DotProd([1, 2, 3], [4, 5, 6])

32

In [9]:
# Matrix Multiplication
a = torch.tensor([[1,2], [3, 4], [5, 6]])
b = torch.tensor([[7, 8, 9], [10, 11, 12]])
print(a @ b)

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])


In [11]:
int_64 = torch.randint(1, (3, 2))
#type int64
float_32 = torch.rand(2, 3)
#type float32
print(int_64.dtype, float_32.dtype)
result = torch.matmul(int_64, float_32)
print(result)

torch.int64 torch.float32


RuntimeError: expected m1 and m2 to have the same dtype, but got: long long != float

The above error can be avoided through this:

In [13]:
int_64 = torch.randint(2, (3, 2)).float()
#type int64
float_32 = torch.rand(2, 3)
#type float32
print(int_64.dtype, float_32.dtype)
result = torch.matmul(int_64, float_32)
print(result)

torch.float32 torch.float32
tensor([[0.9278, 0.4474, 0.8127],
        [1.0828, 0.5110, 0.9031],
        [1.0828, 0.5110, 0.9031]])


## Bigram Class - Forward Function working

In [3]:
input = torch.rand((4, 8, 10))
B, T, C = input.shape
output = input.view(B*T, C)
print(output)
print(output[:, -1])

tensor([[0.9956, 0.3139, 0.0915, 0.4824, 0.8314, 0.6532, 0.4025, 0.1159, 0.8527,
         0.7673],
        [0.2866, 0.0593, 0.1632, 0.3909, 0.8187, 0.3670, 0.5797, 0.0939, 0.9752,
         0.6992],
        [0.5972, 0.7096, 0.2683, 0.0851, 0.5697, 0.2152, 0.0115, 0.9413, 0.2567,
         0.9497],
        [0.0529, 0.9492, 0.6111, 0.5841, 0.1667, 0.2814, 0.1860, 0.8599, 0.5533,
         0.9053],
        [0.9985, 0.3350, 0.8465, 0.0031, 0.3436, 0.5445, 0.0718, 0.6499, 0.8506,
         0.5818],
        [0.4896, 0.3608, 0.7811, 0.1758, 0.2314, 0.3331, 0.9527, 0.9024, 0.3727,
         0.8831],
        [0.3671, 0.2318, 0.2568, 0.0132, 0.3762, 0.9510, 0.2914, 0.0447, 0.5394,
         0.3180],
        [0.4184, 0.7837, 0.8294, 0.0502, 0.3810, 0.0268, 0.5376, 0.2696, 0.3349,
         0.6585],
        [0.9371, 0.3863, 0.0820, 0.1946, 0.2860, 0.2713, 0.8975, 0.5152, 0.0701,
         0.0543],
        [0.6349, 0.2216, 0.4960, 0.9618, 0.3308, 0.0261, 0.8145, 0.4827, 0.7035,
         0.5275],
        [0

In [4]:
output.shape

torch.Size([32, 10])

## Activation Functions

In [11]:
x = torch.tensor([-0.05])
y = F.relu(x)
y

tensor([0.])

In [14]:
x = torch.tensor([0.])
y = F.sigmoid(x)  # 1/(1 + exp(-x))
y

tensor([0.5000])