#### Understanding Neural Networks
##### Structure of a Neural Network:
- **Input Layer**: Takes input features (e.g., pixels in an image).
- **Hidden Layers**: Learn complex patterns from data.
- **Output Layer**: Provides the final prediction.

```
  Input Layer    Hidden Layer    Output Layer
      O               O               O
    /   \           /   \           /
  O       O -----> O     O -----> O  
    \   /           \   /          
      O               O  
```

#### Implementing a Simple Neural Network from Scratch

In [2]:
import numpy as np

#### Create the Dataset

In [3]:
# Features: one row per student -> [hours studied, hours slept]
X = np.array([
    [1, 2],
    [2, 3],
    [3, 5],
    [4, 2],
    [5, 6],
    [6, 7],
    [7, 8],
    [8, 7],
])
# Labels as a column vector aligned with the rows of X (1 = Pass, 0 = Fail)
y = np.array([0, 0, 1, 0, 1, 1, 1, 1]).reshape(-1, 1)


#### Define Activation Functions

In [4]:
# Sigmoid Activation Function
def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    The value is computed in a numerically stable way so that inputs with a
    very large magnitude saturate to 0 or 1 instead of overflowing inside
    ``np.exp``.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``: a NumPy scalar for scalar input, otherwise an
        array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0 the same value is rewritten as
    # e^x / (1 + e^x), which only needs the non-overflowing exponential.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where always returns an array; indexing with () unwraps a 0-d result
    # so scalar input still yields a scalar.
    return result[()]

# Sigmoid derivative expressed through the sigmoid's output (used in backpropagation)
def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    Note that ``x`` is expected to be a value that has already been passed
    through the sigmoid (an activation), not the raw pre-activation: for
    ``s = sigmoid(z)`` the derivative with respect to ``z`` is ``s * (1 - s)``.
    """
    activation = x
    return activation * (1 - activation)


####  Initialize Weights and Biases

In [5]:
# Seed NumPy's global RNG so the random starting parameters repeat on every run
np.random.seed(42)

n_features, n_outputs = 2, 1  # two input features feed a single output neuron

# Draw the weights first and the bias second: the order fixes which random
# numbers each of them receives.
weights = np.random.rand(n_features, n_outputs)
bias = np.random.rand(n_outputs)

learning_rate = 0.1  # step size used by the gradient updates


#### Train the Neural Network (Forward + Backpropagation)

In [6]:
epochs = 10000  # Total number of full-batch update steps

for epoch in range(epochs):
    # Forward pass: linear combination of the inputs squashed into (0, 1)
    Z = np.dot(X, weights) + bias
    A = sigmoid(Z)

    # Residual between the labels and the current predictions
    error = y - A

    # Backward pass: dZ points in the descent direction of the squared error
    # with respect to Z (up to a constant factor), which is why the
    # parameters are updated with += below.
    dZ = error * sigmoid_derivative(A)
    weights += learning_rate * np.dot(X.T, dZ)
    bias += learning_rate * np.sum(dZ)

    # Report the mean squared error on every 1000th epoch
    if not epoch % 1000:
        loss = np.mean(np.square(error))
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 0, Loss: 0.3564
Epoch 1000, Loss: 0.0105
Epoch 2000, Loss: 0.0057
Epoch 3000, Loss: 0.0039
Epoch 4000, Loss: 0.0029
Epoch 5000, Loss: 0.0023
Epoch 6000, Loss: 0.0019
Epoch 7000, Loss: 0.0017
Epoch 8000, Loss: 0.0015
Epoch 9000, Loss: 0.0013


#### Make Predictions

In [7]:
# Score an unseen student: 4 study hours, 5 sleep hours
new_student = np.array([[4, 5]])
# Same linear step as in training, followed by the sigmoid
logit = np.dot(new_student, weights) + bias
prediction = sigmoid(logit)
print("Prediction (Probability of Passing):", prediction)


Prediction (Probability of Passing): [[0.8900576]]


## Training a Deep Learning Model using PyTorch

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


#### Prepare the Dataset

In [5]:
# Each record is (hours studied, hours slept, passed), where passed is 1 = Pass, 0 = Fail
records = [
    (1, 2, 0), (2, 3, 0), (3, 5, 1), (4, 2, 0),
    (5, 6, 1), (6, 7, 1), (7, 8, 1), (8, 7, 1),
]

# Split the records into a float32 feature matrix and a float32 label column
X = np.array([[hours, sleep] for hours, sleep, _ in records], dtype=np.float32)
y = np.array([[passed] for _, _, passed in records], dtype=np.float32)

# Convert to PyTorch tensors (torch.tensor copies the NumPy data)
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y)


#### Define the Neural Network Model

In [7]:
class NeuralNetwork(nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU ->
    Linear(hidden_dim -> 1) -> Sigmoid, so the output is a value in (0, 1).

    Args:
        input_dim: Number of input features per sample (default 2).
        hidden_dim: Number of neurons in the hidden layer (default 4).
    """

    def __init__(self, input_dim=2, hidden_dim=4):
        super().__init__()
        self.hidden = nn.Linear(input_dim, hidden_dim)  # input features -> hidden neurons
        self.relu = nn.ReLU()  # Non-linearity between the two linear layers
        self.output = nn.Linear(hidden_dim, 1)  # hidden neurons -> 1 output neuron
        self.sigmoid = nn.Sigmoid()  # Maps the output to (0, 1) for binary classification

    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: Tensor whose last dimension has ``input_dim`` features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        hidden_activation = self.relu(self.hidden(x))
        return self.sigmoid(self.output(hidden_activation))


# Create model instance (default sizes: 2 inputs, 4 hidden neurons)
model = NeuralNetwork()


#### Define Loss Function and Optimizer

In [8]:
# Binary cross-entropy loss; it consumes probabilities, which the model's
# final sigmoid layer produces
criterion = nn.BCELoss()

# Adam optimizer over every parameter of the model
adam_lr = 0.01
optimizer = optim.Adam(model.parameters(), lr=adam_lr)


#### Train the Model

In [9]:
epochs = 1000  # Number of full-batch optimization steps

for epoch in range(epochs):
    # Clear the gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass: predictions and their loss against the labels
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)

    # Backward pass, then let the optimizer update the parameters
    loss.backward()
    optimizer.step()

    # Log the loss on every 100th epoch
    if not epoch % 100:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


Epoch 0, Loss: 0.7094
Epoch 100, Loss: 0.2709
Epoch 200, Loss: 0.0643
Epoch 300, Loss: 0.0275
Epoch 400, Loss: 0.0154
Epoch 500, Loss: 0.0098
Epoch 600, Loss: 0.0067
Epoch 700, Loss: 0.0049
Epoch 800, Loss: 0.0037
Epoch 900, Loss: 0.0029


####  Make Predictions

In [10]:
# Test data: will a student who studied 4 hours and slept 5 hours pass?
new_student = torch.tensor([[4.0, 5.0]], dtype=torch.float32)

# Inference only, so skip gradient tracking; the result can then be turned
# into a NumPy array directly
with torch.no_grad():
    prediction = model(new_student).numpy()

print("Prediction (Probability of Passing):", prediction)


Prediction (Probability of Passing): [[0.9874164]]


#### Implementing Scaled Dot-Product Attention in Python

####  Define Input Sentences
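Each word is represented as a vector (an embedding). In this demo the Query, Key, and Value matrices are simply filled with random numbers; in a trained model they are produced by learned linear projections of the word embeddings.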

In [11]:
import numpy as np

# Example sentence with 4 words, each represented by a 3-dimensional vector
np.random.seed(42)  # Fixed seed so the matrices repeat on every run
words = 4  # Number of words
embedding_dim = 3  # Word vector size

# Draw Q, K and V in exactly this order so each one receives the same slice
# of the seeded random stream on every run
Q, K, V = (np.random.rand(words, embedding_dim) for _ in range(3))

for title, matrix in (("Query (Q):\n", Q), ("Key (K):\n", K), ("Value (V):\n", V)):
    print(title, matrix)


Query (Q):
 [[0.37454012 0.95071431 0.73199394]
 [0.59865848 0.15601864 0.15599452]
 [0.05808361 0.86617615 0.60111501]
 [0.70807258 0.02058449 0.96990985]]
Key (K):
 [[0.83244264 0.21233911 0.18182497]
 [0.18340451 0.30424224 0.52475643]
 [0.43194502 0.29122914 0.61185289]
 [0.13949386 0.29214465 0.36636184]]
Value (V):
 [[0.45606998 0.78517596 0.19967378]
 [0.51423444 0.59241457 0.04645041]
 [0.60754485 0.17052412 0.06505159]
 [0.94888554 0.96563203 0.80839735]]


####  Compute Attention Scores

In [12]:
# Similarity of every query with every key: entry (i, j) is the dot product
# of query i and key j (QK^T)
attention_scores = Q @ K.T
print("\nRaw Attention Scores:\n", attention_scores)



Raw Attention Scores:
 [[0.64675177 0.74205833 0.88652906 0.59816679]
 [0.55984141 0.23912325 0.39947042 0.18623963]
 [0.34157207 0.58961914 0.64513862 0.48137664]
 [0.77015453 0.64509281 0.90528538 0.46012339]]


####  Apply Scaling

In [14]:
d_k = embedding_dim  # Dimension of the key vectors
# Divide the raw scores by sqrt(d_k)
scale = np.sqrt(d_k)
scaled_scores = attention_scores / scale
print("\nScaled Scores:\n", scaled_scores)


Scaled Scores:
 [[0.37340231 0.42842758 0.51183779 0.34535176]
 [0.32322459 0.13805788 0.23063436 0.1075255 ]
 [0.19720672 0.34041677 0.37247096 0.27792293]
 [0.44464892 0.37244451 0.52266676 0.26565236]]


#### Apply Softmax to Normalize Scores

In [15]:
def softmax(x):
    """Return the row-wise softmax of a 2-D array.

    Each row is shifted by its own maximum before exponentiating. The shift
    cancels out in the ratio, so the result is mathematically unchanged, but
    the largest exponent becomes 0 and ``np.exp`` can no longer overflow
    (which would otherwise produce ``inf / inf = nan``).

    Args:
        x: Array-like with at least two dimensions; normalization runs along
            axis 1.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 1 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for the normalizer
    return exps / np.sum(exps, axis=1, keepdims=True)

# Normalize the scaled scores with softmax (applied along axis 1, i.e. per row)
attention_weights = softmax(scaled_scores)
print("\nAttention Weights:\n", attention_weights)



Attention Weights:
 [[0.23938437 0.2529257  0.27492711 0.23276281]
 [0.28180838 0.23417311 0.25688721 0.2271313 ]
 [0.2257542  0.26051406 0.26899991 0.24473182]
 [0.25990286 0.24179821 0.28099188 0.21730706]]


#### Multiply by Value Matrix

In [16]:
# Each output row is a weighted average of the rows of V, using that word's
# attention weights
output = attention_weights @ V
print("\nFinal Output (Self-Attention result):\n", output)



Final Output (Self-Attention result):
 [[0.62713495 0.60944066 0.26559657]
 [0.62053633 0.62312745 0.26747044]
 [0.63257702 0.61378096 0.27251761]
 [0.61978955 0.60506881 0.25707683]]


### Implementing a Basic Transformer Block

In [17]:
import torch
import torch.nn as nn
import numpy as np

#### Define Positional Encoding
Since Transformers don’t have recurrence, they have no built-in notion of word order. To supply it, we add fixed sine and cosine functions of each word's position to its embedding.

In [18]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold sines and
    odd feature columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the embedding (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions to precompute (default 5000).
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # shape: (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns,
        # so drop the surplus angle column before filling the cosines.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # leading dimension of 1 broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``. Only ``max_len``
                positions are precomputed, so ``seq_len`` must not exceed it.

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1)]


#### Implement Multi-Head Attention
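Instead of computing attention once, Multi-Head Attention splits the model dimension into several smaller heads and computes attention in each head in parallel. This allows the model to focus on different relationships between words at the same time.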

In [19]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension is split evenly across ``num_heads`` heads of size
    ``d_k = d_model // num_heads``. Queries, keys and values are each passed
    through their own linear projection, attention is computed independently
    per head, and the concatenated heads go through a final output projection.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Raise explicitly instead of asserting: assert statements are
        # stripped when Python runs with -O.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        return projected.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V):
        """Attend from the queries to the keys/values.

        Args:
            Q: Query tensor of shape ``(batch, q_len, d_model)``.
            K: Key tensor of shape ``(batch, k_len, d_model)``.
            V: Value tensor of shape ``(batch, k_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = Q.shape[0]

        # Linear projections, then split the feature dimension into heads
        Q = self._split_heads(self.W_q(Q), batch_size)
        K = self._split_heads(self.W_k(K), batch_size)
        V = self._split_heads(self.W_v(V), batch_size)

        # Scaled dot-product attention; the scale is a plain Python float so
        # no NumPy scalar is mixed into the tensor expression
        scale = self.d_k ** 0.5
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        output = torch.matmul(attention_weights, V)

        # Concatenate heads: (batch, heads, q_len, d_k) -> (batch, q_len, d_model)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.W_o(output)


#### Build a Transformer Block
**Each Transformer Block consists of:**
1. Multi-Head Attention
2. Feed-Forward Neural Network
3. Layer Normalization
4. Residual Connections



In [20]:
class TransformerBlock(nn.Module):
    """One encoder-style Transformer block.

    Two sub-layers are applied in sequence: multi-head self-attention and a
    two-layer feed-forward network. Each sub-layer's output is added to its
    input (residual connection) and the sum is layer-normalized.

    Args:
        d_model: Size of the feature (last) dimension.
        num_heads: Number of attention heads.
        d_ff: Width of the hidden layer in the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        # Sub-layer 1: self-attention and the LayerNorm applied after it
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        # Sub-layer 2: expand to d_ff, apply ReLU, project back to d_model
        expand = nn.Linear(d_model, d_ff)
        contract = nn.Linear(d_ff, d_model)
        self.ff = nn.Sequential(expand, nn.ReLU(), contract)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Run ``x`` through both sub-layers and return the result."""
        # Self-attention: x serves as query, key and value
        attended = self.norm1(x + self.attention(x, x, x))
        # Feed-forward network with its own residual connection and normalization
        return self.norm2(attended + self.ff(attended))