In [1]:
import torch
# Show which PyTorch build this kernel is running.
print(f"{torch.__version__}")

2.5.1+cpu


In [3]:
import torch
import torch.nn as nn

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [None]:
# Binary cross-entropy criterion (presumably for the discriminator; it is not
# used yet in this cell).
bce_loss = nn.BCELoss()

# Let's define the batch size and the length of the generator's noise vector.
batch_size = 4
latent_dim = 10

# Shape of a single sample as (channels, height, width). Real and generated
# data must agree on it.
image_shape = (3, 64, 64)

# Real (all ones) and fake (all zeros) labels for the discriminator, allocated
# directly on the target device.
real_labels = torch.ones(batch_size, 1, device=device)
fake_labels = torch.zeros(batch_size, 1, device=device)

# Random stand-in for a batch of real inputs to the discriminator.
real_data = torch.randn(batch_size, *image_shape).to(device)

# Fake data generated by the generator.
# The linear layer must emit 3 * 64 * 64 values per sample so that its output
# can be shaped like `real_data`; the previous width of 3 * 3 * 64 = 576 could
# not be reshaped to (3, 64, 64). Unflatten restores the image layout.
generator = nn.Sequential(
    nn.Linear(latent_dim, 3 * 64 * 64),
    nn.Tanh(),
    nn.Unflatten(1, image_shape),
).to(device)

# Noise batch that is fed to the generator.
latent_vectors = torch.randn(batch_size, latent_dim).to(device)



In [9]:
import torch
import torch.nn as nn
import torch.optim as optim


# Teacher network for the distillation demo. It plays the role of the large
# model, although here it is a single linear layer of the same size as the
# student defined below.
class TeacherModel(nn.Module):
    """Linear layer (10 -> 5) followed by a softmax over dim 1."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=5)

    def forward(self, x):
        """Return the softmax-normalised output of the linear layer."""
        logits = self.fc(x)
        return torch.softmax(logits, dim=1)

# Student network for the distillation demo. It plays the role of the small
# model that learns to imitate the teacher's output distribution.
class StudentModel(nn.Module):
    """Linear layer (10 -> 5) followed by a softmax over dim 1."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 5)

    def forward(self, x):
        """Return the softmax-normalised output of the linear layer."""
        scores = self.fc(x)
        return scores.softmax(dim=1)

# Seed the global RNG so that the weight initialisation and the dummy batch
# below are the same on every run; without it the printed losses change each
# time the cell is executed.
torch.manual_seed(0)

# Instantiate teacher and student models
teacher = TeacherModel()
student = StudentModel()

# Move the models to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher.to(device)
student.to(device)

# The teacher only supplies targets and is never updated, so keep it in
# inference mode.
teacher.eval()

# Define loss function and optimizer.
# KLDivLoss expects log-probabilities as input and probabilities as target.
kl_loss = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.Adam(student.parameters(), lr=0.01)

# Create dummy inputs: 16 samples with 10 features each
input_data = torch.randn(16, 10).to(device)

# Teacher model generates predictions once; no gradients are needed for them
with torch.no_grad():
    teacher_output = teacher(input_data)

for epoch in range(10):
    # Forward pass of student model
    student_output = student(input_data)

    # Compute KL divergence loss. The student returns probabilities, so take
    # the log to obtain the log-probabilities the criterion expects.
    loss = kl_loss(torch.log(student_output), teacher_output)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss for monitoring
    print(f"Epoch {epoch+1}, Loss: {loss.item():0.4f}")

Epoch 1, Loss: 0.2170
Epoch 2, Loss: 0.1993
Epoch 3, Loss: 0.1827
Epoch 4, Loss: 0.1672
Epoch 5, Loss: 0.1527
Epoch 6, Loss: 0.1393
Epoch 7, Loss: 0.1269
Epoch 8, Loss: 0.1154
Epoch 9, Loss: 0.1049
Epoch 10, Loss: 0.0952


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Let's define the teacher model: one linear layer whose output is turned into
# probabilities.
class TeacherModel(nn.Module):
    """Linear layer (10 -> 5) followed by a softmax over dim 1."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=5)

    def forward(self, x):
        """Return the softmax-normalised output of the linear layer."""
        logits = self.fc(x)
        return logits.softmax(dim=1)
    
# Let's define the student model: same layout as the teacher in this cell, but
# it is the one being trained.
class StudentModel(nn.Module):
    """Linear layer (10 -> 5) followed by a softmax over dim 1."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 5)

    def forward(self, x):
        """Return the softmax-normalised output of the linear layer."""
        scores = self.fc(x)
        return torch.softmax(scores, dim=1)
    
# Seed the global RNG so that the weight initialisation and the dummy batch
# below are the same on every run; without it the printed losses change each
# time the cell is executed.
torch.manual_seed(0)

teacher_model = TeacherModel()
student_model = StudentModel()

# Pick the device at run time instead of hard-coding 'cpu', so the cell also
# runs on a GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
teacher_model.to(device)
student_model.to(device)

# The teacher only supplies targets and is never updated, so keep it in
# inference mode.
teacher_model.eval()

# KLDivLoss expects log-probabilities as input and probabilities as target.
kl_loss = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.Adam(student_model.parameters(), lr=0.01)

# Dummy batch: 16 samples with 10 features each.
input_data = torch.randn(16, 10).to(device)

# Teacher predictions are computed once; no gradients are needed for them.
with torch.no_grad():
    teacher_output = teacher_model(input_data)

for epoch in range(10):
    optimizer.zero_grad()
    student_output = student_model(input_data)
    # The student returns probabilities, so take the log to obtain the
    # log-probabilities the criterion expects.
    loss = kl_loss(torch.log(student_output), teacher_output)
    loss.backward()
    optimizer.step()

    # Print the loss for each epoch
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 0.3851
Epoch 2, Loss: 0.3541
Epoch 3, Loss: 0.3252
Epoch 4, Loss: 0.2981
Epoch 5, Loss: 0.2730
Epoch 6, Loss: 0.2497
Epoch 7, Loss: 0.2281
Epoch 8, Loss: 0.2083
Epoch 9, Loss: 0.1900
Epoch 10, Loss: 0.1733


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Define Teacher and Student Models
class TeacherModel(nn.Module):
    """Fully connected network 784 -> 512 -> 256 -> 10 that returns raw logits."""

    def __init__(self):
        super().__init__()
        # Layers are listed in forward order, so the sub-module indices
        # (fc.0, fc.2, fc.4 for the linear layers) stay as before.
        stack = [
            nn.Linear(784, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 10),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the unnormalised scores."""
        logits = self.fc(x)
        return logits

class StudentModel(nn.Module):
    """Fully connected network 784 -> 128 -> 10 that returns raw logits."""

    def __init__(self):
        super().__init__()
        hidden = nn.Linear(784, 128)
        head = nn.Linear(128, 10)
        # Same ordering as before: linear layers sit at indices 0 and 2.
        self.fc = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the unnormalised scores."""
        logits = self.fc(x)
        return logits

# Distillation loss function
def distillation_loss(student_logits, teacher_logits, targets, alpha=0.5, temperature=2.0):
    """Blend a temperature-softened KL term with ordinary cross-entropy.

    Args:
        student_logits: Raw student scores; softmax is taken over dim 1.
        teacher_logits: Raw teacher scores; softmax is taken over dim 1.
        targets: Class targets passed to ``F.cross_entropy``.
        alpha: Weight of the KL term; the cross-entropy term gets ``1 - alpha``.
        temperature: Divisor applied to both sets of logits before the softmax.

    Returns:
        ``alpha * KL * temperature**2 + (1 - alpha) * CE`` as a scalar tensor.
    """
    # Student side goes in as log-probabilities, teacher side as probabilities,
    # which is the argument convention of F.kl_div.
    student_log_probs = F.log_softmax(student_logits / temperature, dim=1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=1)

    # The KL term is rescaled by temperature squared.
    soft_term = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (temperature ** 2)

    # Hard-label term uses the unscaled student logits.
    hard_term = F.cross_entropy(student_logits, targets)

    return alpha * soft_term + (1 - alpha) * hard_term

# Training function
def train(student, teacher, dataloader, optimizer, alpha=0.5, temperature=2.0):
    """Run one distillation epoch over ``dataloader`` and return the mean batch loss.

    The student is put in training mode and updated with ``optimizer``; the
    teacher is put in eval mode and only queried under ``torch.no_grad()``.

    Args:
        student: Model being trained.
        teacher: Model providing the soft targets; its weights are not updated here.
        dataloader: Iterable yielding ``(data, targets)`` batches. It does not
            need to support ``len()``.
        optimizer: Optimizer that steps the student's parameters.
        alpha: Forwarded to ``distillation_loss``.
        temperature: Forwarded to ``distillation_loss``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``0.0`` when the loader yields no batches (previously this case
        raised ``ZeroDivisionError``).
    """
    student.train()
    teacher.eval()  # Teacher model is fixed

    total_loss = 0.0
    num_batches = 0
    for data, targets in dataloader:
        # Flatten everything after the batch dimension. reshape() behaves like
        # view() when possible but also accepts non-contiguous batches.
        data = data.reshape(data.size(0), -1)
        optimizer.zero_grad()

        student_logits = student(data)
        with torch.no_grad():
            teacher_logits = teacher(data)

        loss = distillation_loss(student_logits, teacher_logits, targets, alpha, temperature)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches while iterating instead of calling len(dataloader), so
    # length-less iterables work and an empty loader cannot divide by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Dummy dataset and models
from torch.utils.data import DataLoader, TensorDataset

# Seed the global RNG so that the synthetic data, the weight initialisation
# and the DataLoader shuffling order are the same on every run.
torch.manual_seed(0)

# Random data for demonstration
x = torch.randn(1000, 784)  # 1000 samples, 784 features (like MNIST flattened images)
y = torch.randint(0, 10, (1000,))  # Random labels for 10 classes

dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

teacher = TeacherModel()
student = StudentModel()

# Pre-train the teacher model (this would be a separate step in practice)
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
for epoch in range(5):  # Quick teacher training
    teacher.train()
    total_loss = 0.0
    for data, targets in dataloader:
        # Flatten everything after the batch dimension (a no-op for this
        # already-flat dummy data).
        data = data.view(data.size(0), -1)
        teacher_optimizer.zero_grad()
        output = teacher(data)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        teacher_optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss for this epoch.
    print(f"Teacher Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

# Train the student model with knowledge distillation
student_optimizer = optim.Adam(student.parameters(), lr=0.001)
for epoch in range(5):
    loss = train(student, teacher, dataloader, student_optimizer)
    print(f"Student Epoch {epoch+1}, Loss: {loss:.4f}")
