In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

class SmallCNN(nn.Module):
    def __init__(self):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

teacher_model = models.resnet18(pretrained=True)
teacher_model.fc = nn.Linear(512, 10)
teacher_model.eval()  

student_model = SmallCNN()

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(student_model.parameters(), lr=0.001)

def distillation_loss(student_logits, teacher_logits, T):
    teacher_probs = nn.functional.softmax(teacher_logits / T, dim=1)
    student_probs = nn.functional.log_softmax(student_logits / T, dim=1)
    return nn.functional.kl_div(student_probs, teacher_probs, reduction='batchmean') * (T * T)

num_epochs = 10
temperature = 5.0 
alpha = 0.5

for epoch in range(num_epochs):
    student_model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()
        
        # Forward pass through teacher and student models
        with torch.no_grad():
            teacher_logits = teacher_model(images)
        
        student_logits = student_model(images)
        
        # Calculate the distillation loss
        distill_loss = distillation_loss(student_logits, teacher_logits, temperature)
        
        # Calculate the classification loss (standard supervised learning loss)
        classification_loss = criterion(student_logits, labels)
        
        # Combined loss: a weighted sum of distillation loss and classification loss
        loss = alpha * distill_loss + (1 - alpha) * classification_loss
        
        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        # Get predictions
        teacher_preds = torch.argmax(teacher_logits, dim=1)
        student_preds = torch.argmax(student_logits, dim=1)

        # Print predictions for the current batch
        for i in range(len(images)):
            print(f"Image {i+1}:")
            print(f"  Teacher Prediction: {teacher_preds[i].item()}, Student Prediction: {student_preds[i].item()}")
    
        break
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [03:36<00:00, 785825.19it/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Epoch [1/10], Loss: 0.8739
Epoch [2/10], Loss: 0.9472
Epoch [3/10], Loss: 0.8050
Epoch [4/10], Loss: 0.6588
Epoch [5/10], Loss: 0.9096
Epoch [6/10], Loss: 0.7311
Epoch [7/10], Loss: 0.6789
Epoch [8/10], Loss: 0.5437
Epoch [9/10], Loss: 0.6746
Epoch [10/10], Loss: 0.6755


In [11]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt

# Assuming the student_model is already defined and trained
student_model.eval()  # Set the model to evaluation mode

# Define the same transformation used during training
transform = transforms.Compose([
    transforms.Resize((32, 32)),  # Resize to match input size of student model
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Function to preprocess and predict on a single image
def predict_image(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert('RGB')
    image = transform(image)  # Apply the transformations
    image = image.unsqueeze(0)  # Add a batch dimension

    with torch.no_grad():  # Disable gradient calculation
        output = student_model(image)  # Forward pass through the model

    # Get the predicted class
    predicted_class = torch.argmax(output, dim=1).item()  # Get the index of the max log-probability
    return predicted_class

# Example usage
image_path = '/home/ajeet/code/testing/data/person1.jpg'  # Replace with your image path
predicted_class = predict_image(image_path)

# Display the predicted class
print(f'Predicted Class: {predicted_class}')

CIFAR10_CLASSES = [
    "plane", "car", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
]

# Use the predicted_class index to get the class name
predicted_class_name = CIFAR10_CLASSES[predicted_class]
print(f'Predicted Class Name: {predicted_class_name}')



Predicted Class: 8
Predicted Class Name: ship


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

teacher_model = models.resnet18(pretrained=True)
teacher_model.fc = nn.Linear(512, 10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
teacher_model.to(device)

Files already downloaded and verified




cuda


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [32]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(teacher_model.parameters(), lr=0.0001)

num_epochs = 8
teacher_model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad() 
        outputs = teacher_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

torch.save(teacher_model.state_dict(), 'fine_tuned_teacher_model_2.pth')
print("Fine-tuned teacher model saved!")

Epoch [1/8], Loss: 0.0540
Epoch [2/8], Loss: 0.0458
Epoch [3/8], Loss: 0.0463
Epoch [4/8], Loss: 0.0419
Epoch [5/8], Loss: 0.0403
Epoch [6/8], Loss: 0.0341
Epoch [7/8], Loss: 0.0336
Epoch [8/8], Loss: 0.0364
Fine-tuned teacher model saved!


In [35]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
from PIL import Image

teacher_model = models.resnet18(pretrained=False)
teacher_model.fc = nn.Linear(512, 10)
teacher_model.load_state_dict(torch.load('fine_tuned_teacher_model_2.pth'))
teacher_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher_model.to(device)

def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    
    image = Image.open(image_path).convert('RGB')
    image = transform(image)
    image = image.unsqueeze(0)
    return image.to(device)

def infer(image_path):
    image = preprocess_image(image_path)
    with torch.no_grad():
        outputs = teacher_model(image)
        _, predicted = torch.max(outputs, 1)
    return predicted.item()

image_path = '/home/ajeet/Downloads/cat.jpeg'
predicted_class = infer(image_path)
print(f'Predicted class index: {predicted_class}')

cifar10_classes = [
    "plane", "car", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
]

predicted_class_name = cifar10_classes[predicted_class]
print(f'Predicted class: {predicted_class_name}')


Predicted class index: 3
Predicted class: cat


  teacher_model.load_state_dict(torch.load('fine_tuned_teacher_model_2.pth'))


In [36]:
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.transforms import transforms


transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])


test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

Files already downloaded and verified


In [37]:
# def test(model, test_loader, device):
#     correct = 0
#     total = 0

#     with torch.no_grad():
#         for inputs, labels in test_loader:
#             inputs, labels = inputs.to(device), labels.to(device)

#             outputs = model(inputs)
#             _, predicted = torch.max(outputs.data, 1)

#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     accuracy = 100 * correct / total
#     print(f"Test Accuracy: {accuracy:.2f}%")
#     return accuracy

correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = teacher_model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # break

accuracy = 100 * correct / total
print(f'Accuracy of the fine-tuned model on the test dataset: {accuracy:.2f}%')

Accuracy of the fine-tuned model on the test dataset: 83.18%


In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

class SmallCNN(nn.Module):
    def __init__(self):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# teacher_model = models.resnet18(pretrained=True)
# teacher_model.fc = nn.Linear(512, 10)
# teacher_model.eval()

student_model = SmallCNN()
student_model.to(device)

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.001)

def distillation_loss(student_logits, teacher_logits, T):
    teacher_probs = nn.functional.softmax(teacher_logits / T, dim=1)
    student_probs = nn.functional.log_softmax(student_logits / T, dim=1)
    return nn.functional.kl_div(student_probs, teacher_probs, reduction='batchmean') * (T * T)

num_epochs = 10
temperature = 5.0
alpha = 0.5

for epoch in range(num_epochs):
    student_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        with torch.no_grad():
            teacher_logits = teacher_model(images)
        
        student_logits = student_model(images)
        
        distill_loss = distillation_loss(student_logits, teacher_logits, temperature)
        classification_loss = criterion(student_logits, labels)
        
        loss = alpha * distill_loss + (1 - alpha) * classification_loss
        
        loss.backward()
        optimizer.step()

        teacher_preds = torch.argmax(teacher_logits, dim=1)
        student_preds = torch.argmax(student_logits, dim=1)

        # for i in range(len(images)):
        #     print(f"Image {i+1}:")
        #     print(f"  Teacher Prediction: {teacher_preds[i].item()}, Student Prediction: {student_preds[i].item()}")
    
        # break
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Files already downloaded and verified
Epoch [1/10], Loss: 6.2819
Epoch [2/10], Loss: 8.9293
Epoch [3/10], Loss: 5.3986
Epoch [4/10], Loss: 5.4600
Epoch [5/10], Loss: 4.0815
Epoch [6/10], Loss: 7.2221
Epoch [7/10], Loss: 6.2185
Epoch [8/10], Loss: 4.2522
Epoch [9/10], Loss: 4.8399
Epoch [10/10], Loss: 3.0830


In [39]:
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.transforms import transforms


# transform = transforms.Compose([
#     transforms.Resize((32, 32)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
# ])

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

Files already downloaded and verified


In [43]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt

student_model.eval() 

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])


def predict_image(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image)
    image = image.unsqueeze(0)

    image = image.to(device)

    with torch.no_grad():
        output = student_model(image)

    predicted_class = torch.argmax(output, dim=1).item()
    return predicted_class

image_path = '/home/ajeet/Downloads/car.jpeg'
predicted_class = predict_image(image_path)

print(f'Predicted Class: {predicted_class}')

CIFAR10_CLASSES = [
    "plane", "car", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
]

predicted_class_name = CIFAR10_CLASSES[predicted_class]
print(f'Predicted Class Name: {predicted_class_name}')

Predicted Class: 1
Predicted Class Name: car


In [41]:
student_model.to(device)
student_model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader: 
        images, labels = images.to(device), labels.to(device)
        
        outputs = student_model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the student model on the test dataset: {accuracy:.2f}%')

Accuracy of the student model on the test dataset: 71.96%


In [8]:
total_params = sum(p.numel() for p in student_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

2.6865
Model size in memory: 1.02 MB


In [9]:
total_params = sum(p.numel() for p in teacher_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

111.81642
Model size in memory: 42.65 MB


In [26]:
student_model.cpu()
student_model.eval() 

quantized_model = torch.quantization.quantize_dynamic(
    student_model,
    {nn.Linear},   
    dtype=torch.qint8
)

total_params = sum(p.numel() for p in quantized_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

0.05088
Model size in memory: 0.02 MB


In [24]:
quantized_model

SmallCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): DynamicQuantizedLinear(in_features=2048, out_features=128, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  (fc2): DynamicQuantizedLinear(in_features=128, out_features=10, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
)

In [27]:
dummy_input = torch.randn(1, 3, 32, 32)
quantized_output = quantized_model(dummy_input)
print(f"Output of quantized model: {quantized_output}")

Output of quantized model: tensor([[ -0.9451,  -1.4627,  -9.8652,  -4.8697, -18.1972, -22.9591,   8.9231,
          -5.9368,  -0.3267,  14.9413]], grad_fn=<WarnNotImplemented>)


In [29]:
# quantized_model.to(device)
# quantized_model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader: 
        images, labels = images, labels
        
        outputs = quantized_model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the student model on the test dataset: {accuracy:.2f}%')

Accuracy of the student model on the test dataset: 70.75%


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

class SmallCNN(nn.Module):
    def __init__(self):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# teacher_model = models.resnet18(pretrained=True)
# teacher_model.fc = nn.Linear(512, 10)
# teacher_model.eval()

student_model = SmallCNN()
student_model.to(device)

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.0001)

def distillation_loss(student_logits, teacher_logits, T):
    teacher_probs = nn.functional.softmax(teacher_logits / T, dim=1)
    student_probs = nn.functional.log_softmax(student_logits / T, dim=1)
    return nn.functional.kl_div(student_probs, teacher_probs, reduction='batchmean') * (T * T)

num_epochs = 10
temperature = 5.0
alpha = 0.5

for epoch in range(num_epochs):
    student_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        with torch.no_grad():
            teacher_logits = teacher_model(images)
        
        student_logits = student_model(images)
        
        # distill_loss = distillation_loss(student_logits, teacher_logits, temperature)
        # classification_loss = criterion(student_logits, labels)
        
        # loss = alpha * distill_loss + (1 - alpha) * classification_loss

        soft_targets = nn.functional.softmax(teacher_logits / 2, dim=-1)
        soft_prob = nn.functional.log_softmax(student_logits / 2, dim=-1)

        # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
        soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (2**2)

        # Calculate the true label loss
        label_loss = criterion(student_logits, labels)

        # Weighted sum of the two losses
        loss = 0.10 * soft_targets_loss + 0.90 * label_loss
        
        loss.backward()
        optimizer.step()

        teacher_preds = torch.argmax(teacher_logits, dim=1)
        student_preds = torch.argmax(student_logits, dim=1)

        # for i in range(len(images)):
        #     print(f"Image {i+1}:")
        #     print(f"  Teacher Prediction: {teacher_preds[i].item()}, Student Prediction: {student_preds[i].item()}")
    
        # break
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Files already downloaded and verified
Epoch [1/10], Loss: 2.1874
Epoch [2/10], Loss: 1.7820
Epoch [3/10], Loss: 1.7328
Epoch [4/10], Loss: 1.7014
Epoch [5/10], Loss: 1.5585
Epoch [6/10], Loss: 1.3253
Epoch [7/10], Loss: 1.4570
Epoch [8/10], Loss: 1.5063
Epoch [9/10], Loss: 1.6937
Epoch [10/10], Loss: 1.4680


In [18]:
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.transforms import transforms


# transform = transforms.Compose([
#     transforms.Resize((32, 32)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
# ])

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

student_model.to(device)
student_model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader: 
        images, labels = images.to(device), labels.to(device)
        
        outputs = student_model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the student model on the test dataset: {accuracy:.2f}%')

Files already downloaded and verified
Accuracy of the student model on the test dataset: 56.40%


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

class SmallCNN(nn.Module):
    def __init__(self):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# teacher_model = models.resnet18(pretrained=True)
# teacher_model.fc = nn.Linear(512, 10)  # Adjusting output for CIFAR-10
# teacher_model.eval()  # Set to evaluation mode

device = "cuda"
student_model_without_distillation = SmallCNN()
student_model_without_distillation.to(device)

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
r
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model_without_distillation.parameters(), lr=0.001)

# def distillation_loss(student_logits, teacher_logits, T):
#     teacher_probs = nn.functional.softmax(teacher_logits / T, dim=1)
#     student_probs = nn.functional.log_softmax(student_logits / T, dim=1)
#     return nn.functional.kl_div(student_probs, teacher_probs, reduction='batchmean') * (T * T)

num_epochs = 25
temperature = 5.0
alpha = 0.5

for epoch in range(num_epochs):
    student_model_without_distillation.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # with torch.no_grad():
        #     teacher_logits = teacher_model(images)
        
        student_logits = student_model_without_distillation(images)
        
        # distill_loss = distillation_loss(student_logits, teacher_logits, temperature)
        
        classification_loss = criterion(student_logits, labels)
        
        # loss = alpha * distill_loss + (1 - alpha) * classification_loss
        loss = classification_loss
        
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Files already downloaded and verified
Epoch [1/25], Loss: 1.4256
Epoch [2/25], Loss: 1.1326
Epoch [3/25], Loss: 0.6523
Epoch [4/25], Loss: 0.3514
Epoch [5/25], Loss: 0.4743
Epoch [6/25], Loss: 0.6629
Epoch [7/25], Loss: 0.2457
Epoch [8/25], Loss: 0.5723
Epoch [9/25], Loss: 0.3800
Epoch [10/25], Loss: 0.4427
Epoch [11/25], Loss: 0.1900
Epoch [12/25], Loss: 0.3048
Epoch [13/25], Loss: 0.7931
Epoch [14/25], Loss: 0.0563
Epoch [15/25], Loss: 0.1185
Epoch [16/25], Loss: 0.1702
Epoch [17/25], Loss: 0.1869
Epoch [18/25], Loss: 0.2829
Epoch [19/25], Loss: 0.2178
Epoch [20/25], Loss: 0.1647
Epoch [21/25], Loss: 0.0865
Epoch [22/25], Loss: 0.1052
Epoch [23/25], Loss: 0.0546
Epoch [24/25], Loss: 0.0073
Epoch [25/25], Loss: 0.3126


In [8]:
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.transforms import transforms


test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

Files already downloaded and verified


In [9]:
student_model_without_distillation.to(device)
student_model_without_distillation.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        
        outputs = student_model_without_distillation(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the student_model_without_distillation on the test dataset: {accuracy:.2f}%')

Accuracy of the student_model_without_distillation on the test dataset: 66.36%


In [6]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import json
import requests
import matplotlib.pyplot as plt
import cv2

without_finetuned_model = models.resnet18(pretrained=True)
without_finetuned_model.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    # transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_image(image_path):
    image = Image.open(image_path).convert('RGB') 
    image = transform(image)
    image = image.unsqueeze(0)
    return image

def classify_image(image_path):
    image_tensor = preprocess_image(image_path)
    with torch.no_grad():
        outputs = without_finetuned_model(image_tensor)
        _, predicted_idx = torch.max(outputs, 1)
    
    labels = requests.get('https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt').text.splitlines()
    predicted_class = labels[predicted_idx.item()]
    return predicted_class

image_path = '/home/ajeet/Downloads/truck.jpeg'
predicted_class = classify_image(image_path)
print(f"Predicted Class: {predicted_class}")




Predicted Class: trailer truck


In [9]:
without_finetuned_test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

without_finetuned_test_loader = torch.utils.data.DataLoader(without_finetuned_test_dataset, batch_size=128, shuffle=True)

Files already downloaded and verified


In [13]:
correct = 0
total = 0
with torch.no_grad():
    for images, labels in without_finetuned_test_loader:
        images, labels = images, labels
        outputs = without_finetuned_model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # break

accuracy = 100 * correct / total
print(f'Accuracy of the without_finetuned model on the test dataset: {accuracy:.2f}%')

torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Size([128, 3, 256, 256])
torch.Si

In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
import torchvision

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Teacher Model: Pre-trained ResNet for image classification
class TeacherModel(nn.Module):
    def __init__(self):
        super(TeacherModel, self).__init__()
        resnet = models.resnet18(pretrained=True)
        self.features = nn.Sequential(*list(resnet.children())[:-1])  # Remove the final layer

    def forward(self, x):
        with torch.no_grad():  # We don't update the teacher model
            x = self.features(x)
            x = x.view(x.size(0), -1)  # Flatten the output
        return x

# Student Model: LSTM network to generate text (captions) based on image features
class StudentModel(nn.Module):
    def __init__(self, feature_dim, hidden_dim, vocab_size):
        super(StudentModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(feature_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, features):
        # LSTM expects input of shape (batch_size, seq_length, input_size)
        # We will assume sequence length of 1 for simplicity here
        lstm_out, _ = self.lstm(features.unsqueeze(1))
        output = self.fc(lstm_out.squeeze(1))  # Output shape: (batch_size, vocab_size)
        return output

# Example vocabulary size (10 classes for simplicity)
vocab_size = 10
teacher_model = TeacherModel().to(device)
student_model = StudentModel(feature_dim=512, hidden_dim=256, vocab_size=vocab_size).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.001)

# Transform and Data Loader for CIFAR-10 dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Dummy function to convert labels to words (for demonstration purposes)
def label_to_word(label):
    word_list = ["cat", "dog", "car", "plane", "tree", "house", "person", "ship", "horse", "bird"]
    return word_list[label]

# Training loop for Cross-Modal Distillation
epochs = 5
for epoch in range(epochs):
    student_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Forward pass with teacher model to get image features
        image_features = teacher_model(images)  # Shape: (batch_size, 512)
        
        # Forward pass with student model
        student_outputs = student_model(image_features)  # Shape: (batch_size, vocab_size)
        
        # Convert labels to a compatible form (example: using class index as "words")
        labels = labels.to(device)
        
        # Compute loss and optimize
        loss = criterion(student_outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

# After training, you could input image features and have the student generate "captions" based on these features.




Files already downloaded and verified
Epoch [1/5], Loss: 0.4101
Epoch [2/5], Loss: 0.7097
Epoch [3/5], Loss: 0.7318
Epoch [4/5], Loss: 0.9238
Epoch [5/5], Loss: 0.8046


In [50]:
import torch

# Function to perform inference using the student model
def infer(image, teacher_model, student_model, device):
    # Set both models to evaluation mode
    teacher_model.eval()
    student_model.eval()
    
    with torch.no_grad():
        # Move image to device and pass through the teacher model to extract features
        image = image.to(device)
        image_features = teacher_model(image.unsqueeze(0))  # Add batch dimension
        
        # Pass the extracted features through the student model to generate output
        output = student_model(image_features)
        
        # Get the predicted class (index of the max value)
        predicted_class = torch.argmax(output, dim=1).item()
        
        # Map the predicted index to a word (label) using our dummy vocabulary
        word_list = [
            "plane", "car", "bird", "cat", "deer",
            "dog", "frog", "horse", "ship", "truck"
        ]
        predicted_word = word_list[predicted_class]
        
        return predicted_word

# Example usage:
# Get a sample image from the dataset
sample_image, _ = train_dataset[0]  # Get the image (ignoring the label for inference)
# sample_image = transform(sample_image)  # Apply the same transform used during training

# Perform inference
predicted_label = infer(sample_image, teacher_model, student_model, device)
print(f"Predicted label: {predicted_label}")


Predicted label: frog


In [51]:
import torch
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

# Function to calculate accuracy of the student model
def calculate_accuracy(student_model, teacher_model, test_loader, device):
    student_model.eval()
    teacher_model.eval()
    
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Extract features from the teacher model
            image_features = teacher_model(images)
            
            # Get outputs from the student model
            student_outputs = student_model(image_features)

            # Get the predicted class (index of the max value)
            _, predicted = torch.max(student_outputs, dim=1)

            # Update total and correct predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total * 100  # Calculate accuracy as a percentage
    return accuracy

# Transform for the test dataset (same as training)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load the CIFAR-10 test dataset
test_dataset = CIFAR10(root='./data', train=False, transform=transform, download=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Calculate accuracy of the student model
accuracy = calculate_accuracy(student_model, teacher_model, test_loader, device)
print(f'Student model accuracy: {accuracy:.2f}%')


Files already downloaded and verified
Student model accuracy: 79.73%


In [52]:
total_params = sum(p.numel() for p in student_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

7.9105
Model size in memory: 3.02 MB


In [53]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class StudentModel(nn.Module):
    def __init__(self, hidden_dim, vocab_size):
        super(StudentModel, self).__init__()
        self.cnn = models.resnet18(pretrained=False)
        self.cnn.fc = nn.Identity()
        
        self.lstm = nn.LSTM(512, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        features = self.cnn(x)
        features = features.unsqueeze(1)
        lstm_out, _ = self.lstm(features)
        output = self.fc(lstm_out.squeeze(1))
        return output

vocab_size = 10
hidden_dim = 256

student_model = StudentModel(hidden_dim=hidden_dim, vocab_size=vocab_size).to(device)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)

def infer(image, model, device):
    model.eval()
    with torch.no_grad():
        image = image.to(device)
        output = model(image.unsqueeze(0))
        predicted_class = torch.argmax(output, dim=1).item()
        CIFAR10_CLASSES = [
            "plane", "car", "bird", "cat", "deer",
            "dog", "frog", "horse", "ship", "truck"
        ]
        predicted_label = CIFAR10_CLASSES[predicted_class]
        
        return predicted_label

sample_image, _ = train_dataset[0] 
# sample_image = transform(sample_image)

predicted_label = infer(sample_image, student_model, device)
print(f"Predicted label: {predicted_label}")


Files already downloaded and verified
Predicted label: horse


In [54]:
total_params = sum(p.numel() for p in student_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

119.67562
Model size in memory: 45.65 MB


In [1]:
from transformers import CLIPModel, CLIPConfig, AutoProcessor
import torch

def reduce_clip_layers(teacher_model: CLIPModel, num_text_layers: int, num_vision_layers: int) -> CLIPModel:
    config = teacher_model.config.to_dict()
    config['text_config']['num_hidden_layers'] = num_text_layers
    config['vision_config']['num_hidden_layers'] = num_vision_layers
    
    student_config = CLIPConfig.from_dict(config)
    
    student_model = CLIPModel(student_config)
    
    teacher_text_layers = teacher_model.text_model.encoder.layers
    student_text_layers = student_model.text_model.encoder.layers
    
    step_text = len(teacher_text_layers) // len(student_text_layers)
    
    for i, student_layer in enumerate(student_text_layers):
        teacher_layer = teacher_text_layers[i * step_text]
        student_layer.load_state_dict(teacher_layer.state_dict())
    
    teacher_vision_layers = teacher_model.vision_model.encoder.layers
    student_vision_layers = student_model.vision_model.encoder.layers
    
    step_vision = len(teacher_vision_layers) // len(student_vision_layers)
    
    for i, student_layer in enumerate(student_vision_layers):
        teacher_layer = teacher_vision_layers[i * step_vision]
        student_layer.load_state_dict(teacher_layer.state_dict())
    
    return student_model

teacher_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
num_text_layers = 2
num_vision_layers = 2

student_model = reduce_clip_layers(teacher_model, num_text_layers, num_vision_layers)

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

text = ["a photo of a dog"]
image = torch.rand(1, 3, 224, 224) 

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = student_model(**inputs)

text_features = outputs.text_embeds
image_features = outputs.image_embeds

print("Text Features:", text_features)
print("Image Features:", image_features)


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
2024-10-20 23:39:10.865419: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-20 23:39:11.625113: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Text Features: tensor([[ 1.7387e-02,  5.6551e-02, -4.6250e-02, -2.1587e-03,  7.5924e-02,
         -7.3925e-02,  3.8801e-02,  9.4470e-02,  8.6250e-03, -2.7143e-02,
          4.8001e-02, -2.4516e-02, -3.5613e-02, -6.2863e-02,  1.6915e-02,
         -1.5899e-02, -2.9257e-02, -5.2187e-02,  6.5392e-02,  6.1481e-03,
          2.9515e-02,  5.1625e-02, -6.3014e-02, -2.7322e-02,  1.5342e-02,
          6.2930e-02,  8.1192e-02, -1.3159e-02,  6.3469e-02,  2.8556e-02,
         -5.6184e-02,  7.8403e-02,  4.9809e-03, -3.2727e-02, -1.3405e-03,
          9.7308e-02, -9.2236e-02, -1.9654e-02, -3.3392e-02, -1.1686e-02,
          9.2331e-03,  2.2359e-02, -5.1954e-02, -9.2235e-03, -3.7786e-02,
         -3.8870e-02, -8.2130e-02,  9.9372e-02,  3.7040e-02, -7.3556e-02,
          7.9372e-03,  7.0264e-04, -6.5739e-02,  5.4766e-02, -4.7167e-02,
          1.2790e-02, -2.3587e-02,  9.7568e-04,  1.2494e-02, -6.3058e-02,
         -6.7019e-02, -5.2892e-02, -3.8356e-02,  5.5975e-03,  6.7959e-03,
          9.9674e-02,  

In [20]:
# text=["a photo of a cat", "a photo of a dog"]
# # image = torch.rand(1, 3, 224, 224)
from PIL import Image

image = Image.open("/home/ajeet/codework/datasets/video_incidents_ajeet/f044cf1d-8a5b-4703-a05c-3b57c5c14989_merged/0_34.jpg")
text = ["a person", "a cell phone"]

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = student_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.7122, 0.2878]])


In [4]:
total_params = sum(p.numel() for p in student_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

488.74753
Model size in memory: 186.44 MB


In [21]:
def dynamic_quantize_clip(model: CLIPModel) -> CLIPModel:
    model.text_model = torch.quantization.quantize_dynamic(
        model.text_model,
        {torch.nn.Linear},
        dtype=torch.qint8
    )
    model.vision_model = torch.quantization.quantize_dynamic(
        model.vision_model,
        {torch.nn.Linear},
        dtype=torch.qint8
    )
    return model

quantized_model = dynamic_quantize_clip(student_model)

In [22]:
total_params = sum(p.numel() for p in quantized_model.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

284.04481
Model size in memory: 108.35 MB


In [23]:
from PIL import Image

image = Image.open("/home/ajeet/codework/datasets/video_incidents_ajeet/f044cf1d-8a5b-4703-a05c-3b57c5c14989_merged/0_34.jpg")
text = ["a person", "a cell phone"]

inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = quantized_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.7193, 0.2807]])
