In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-mlordp73
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-mlordp73
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidi

Image-Des with 100 images (10 images per class)

In [7]:
# Install necessary packages
!pip install torch torchvision

import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import clip
import pandas as pd

# Define the path to your dataset
data_dir = '/content/drive/MyDrive/classes'

# Define the transformation for the images
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the images to the size expected by the CLIP model
    transforms.ToTensor(),         # Convert the images to Tensors
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # Normalize the images
])

# Define text descriptions for each class
class_descriptions = {
    'AnnualCrop': "This is a photo of an annual crop",
    'Forest': "This is a photo of a forest",
    'HerbaceousVegetation': "This is a photo of herbaceous vegetation",
    'Highway': "This is a photo of a highway",
    'Industrial': "This is a photo of an industrial area",
    'Pasture': "This is a photo of a pasture",
    'PermanentCrop': "This is a photo of a permanent crop",
    'Residential': "This is a photo of a residential area",
    'River': "This is a photo of a river",
    'Sealake': "This is a photo of a sea or lake"
}

# Create a custom dataset that returns images and text descriptions
class CustomDataset(Dataset):
    def __init__(self, image_folder, transform, class_descriptions):
        self.dataset = datasets.ImageFolder(image_folder, transform=transform)
        self.class_descriptions = class_descriptions

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        class_name = self.dataset.classes[label]
        description = self.class_descriptions[class_name]
        return image, description, label

# Create the dataset using the custom dataset class
dataset = CustomDataset(data_dir, transform=preprocess, class_descriptions=class_descriptions)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess_clip = clip.load("RN50", device=device)

# Define a list to store performance metrics and combined features
performance_metrics = []
all_combined_features = []

# Define the criterion for loss calculation
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_model(model, train_loader, device, num_epochs=4):
    model.train()  # Use training mode
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, texts, labels in train_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Removed torch.no_grad() block to allow gradients to flow
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

            # Save the combined features
            combined_features = torch.cat((image_features, text_features), dim=1).detach().cpu().numpy() # Detach the tensor from the computation graph
            all_combined_features.extend(combined_features)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions

        performance_metrics.append({
            'epoch': epoch,
            'loss': epoch_loss,
            'accuracy': epoch_accuracy
        })

        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Train the model
train_model(model, train_loader, device, num_epochs=4)

# Save performance metrics to a CSV file
df_metrics = pd.DataFrame(performance_metrics)
output_metrics_path = '/content/drive/MyDrive/performance_metrics.csv'
df_metrics.to_csv(output_metrics_path, index=False)
print(f'Performance metrics saved to {output_metrics_path}')

# Save combined features (image and text) to a CSV file
df_combined_features = pd.DataFrame(all_combined_features)
output_combined_features_path = '/content/drive/MyDrive/combined_features1.csv'
df_combined_features.to_csv(output_combined_features_path, index=False)
print(f'Combined features saved to {output_combined_features_path}')




100%|███████████████████████████████████████| 244M/244M [00:04<00:00, 63.6MiB/s]
  self.pid = os.fork()


Epoch 0/3, Loss: 2.0752, Accuracy: 0.2500
Epoch 1/3, Loss: 2.0241, Accuracy: 0.7500
Epoch 2/3, Loss: 1.8253, Accuracy: 1.0000
Epoch 3/3, Loss: 1.6551, Accuracy: 1.0000
Performance metrics saved to /content/drive/MyDrive/performance_metrics.csv
Combined features saved to /content/drive/MyDrive/combined_features1.csv


Image-Des with 100 images (10 images per class) With training , validation and testing loss and accuracy

In [8]:
# Install necessary packages
!pip install torch torchvision

import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import clip
import pandas as pd

# Define the path to your dataset
data_dir = '/content/drive/MyDrive/classes'

# Define the transformation for the images
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the images to the size expected by the CLIP model
    transforms.ToTensor(),         # Convert the images to Tensors
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # Normalize the images
])

# Define text descriptions for each class
class_descriptions = {
    'AnnualCrop': "This is a photo of an annual crop",
    'Forest': "This is a photo of a forest",
    'HerbaceousVegetation': "This is a photo of herbaceous vegetation",
    'Highway': "This is a photo of a highway",
    'Industrial': "This is a photo of an industrial area",
    'Pasture': "This is a photo of a pasture",
    'PermanentCrop': "This is a photo of a permanent crop",
    'Residential': "This is a photo of a residential area",
    'River': "This is a photo of a river",
    'Sealake': "This is a photo of a sea or lake"
}

# Create a custom dataset that returns images and text descriptions
class CustomDataset(Dataset):
    def __init__(self, image_folder, transform, class_descriptions):
        self.dataset = datasets.ImageFolder(image_folder, transform=transform)
        self.class_descriptions = class_descriptions

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        class_name = self.dataset.classes[label]
        description = self.class_descriptions[class_name]
        return image, description, label

# Create the dataset using the custom dataset class
dataset = CustomDataset(data_dir, transform=preprocess, class_descriptions=class_descriptions)

# Split the dataset into training, validation, and test sets
#train_size = int(0.8 * len(dataset))
#val_size = int(0.1 * len(dataset))
#test_size = len(dataset) - train_size - val_size
#train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create the DataLoaders
train_loader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)
#val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
#test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess_clip = clip.load("RN50", device=device)

# Define a list to store performance metrics and combined features
performance_metrics = []
all_combined_features = []

# Define the criterion for loss calculation
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_model(model, train_loader, device, num_epochs=4):
    model.train()  # Use training mode
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, texts, labels in train_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)
            print(images.shape)
            print(texts.shape)
            print(labels.shape)
            optimizer.zero_grad()

            # Removed torch.no_grad() block to allow gradients to flow
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)
            print(image_features.shape)
            print(text_features.shape)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

            # Save the combined features
            combined_features = torch.cat((image_features, text_features), dim=1).detach().cpu().numpy() # Detach the tensor from the computation graph
            all_combined_features.extend(combined_features)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions

        performance_metrics.append({
            'epoch': epoch,
            'loss': epoch_loss,
            'accuracy': epoch_accuracy
        })

        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Validation function
#def validate_model(model, val_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in val_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Validation Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Testing function
#def test_model(model, test_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in test_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(test_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Test Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Train the model
train_model(model, train_loader, device, num_epochs=4)

# Validate the model
#validate_model(model, val_loader, device)

# Test the model
#test_model(model, test_loader, device)

# Save performance metrics to a CSV file
df_metrics = pd.DataFrame(performance_metrics)
output_metrics_path = '/content/drive/MyDrive/performance_metrics.csv'
df_metrics.to_csv(output_metrics_path, index=False)
print(f'Performance metrics saved to {output_metrics_path}')

# Save combined features (image and text) to a CSV file
df_combined_features = pd.DataFrame(all_combined_features)
output_combined_features_path = '/content/drive/MyDrive/combined_features2.csv'
df_combined_features.to_csv(output_combined_features_path, index=False)
print(f'Combined features saved to {output_combined_features_path}')






torch.Size([10, 3, 224, 224])
torch.Size([10, 77])
torch.Size([10])
torch.Size([10, 1024])
torch.Size([10, 1024])
Epoch 0/3, Loss: 2.3003, Accuracy: 0.2000
torch.Size([10, 3, 224, 224])
torch.Size([10, 77])
torch.Size([10])
torch.Size([10, 1024])
torch.Size([10, 1024])
Epoch 1/3, Loss: 2.2483, Accuracy: 0.7000
torch.Size([10, 3, 224, 224])
torch.Size([10, 77])
torch.Size([10])
torch.Size([10, 1024])
torch.Size([10, 1024])
Epoch 2/3, Loss: 2.0625, Accuracy: 1.0000
torch.Size([10, 3, 224, 224])
torch.Size([10, 77])
torch.Size([10])
torch.Size([10, 1024])
torch.Size([10, 1024])
Epoch 3/3, Loss: 1.8717, Accuracy: 1.0000
Validation Loss: 0.0000, Accuracy: 1.0000
Test Loss: 0.0000, Accuracy: 1.0000
Performance metrics saved to /content/drive/MyDrive/performance_metrics.csv
Combined features saved to /content/drive/MyDrive/combined_features2.csv


Image Description with 1000 images (100 per class) With training testing and validation loss and accuracy

In [24]:
# Install necessary packages
!pip install torch torchvision

import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import clip
import pandas as pd

# Define the path to your dataset
data_dir = '/content/drive/MyDrive/classes2'

# Define the transformation for the images
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the images to the size expected by the CLIP model
    transforms.ToTensor(),         # Convert the images to Tensors
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # Normalize the images
])

# Define text descriptions for each class
class_descriptions = {
    'AnnualCrop': "This is a photo of an annual crop",
    'Forest': "This is a photo of a forest",
    'HerbaceousVegetation': "This is a photo of herbaceous vegetation",
    'Highway': "This is a photo of a highway",
    'Industrial': "This is a photo of an industrial area",
    'Pasture': "This is a photo of a pasture",
    'PermanentCrop': "This is a photo of a permanent crop",
    'Residential': "This is a photo of a residential area",
    'River': "This is a photo of a river",
    'Sealake': "This is a photo of a sea or lake"
}

# Create a custom dataset that returns images and text descriptions
class CustomDataset(Dataset):
    def __init__(self, image_folder, transform, class_descriptions):
        self.dataset = datasets.ImageFolder(image_folder, transform=transform)
        self.class_descriptions = class_descriptions

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        class_name = self.dataset.classes[label]
        description = self.class_descriptions[class_name]
        return image, description, label

# Create the dataset using the custom dataset class
dataset = CustomDataset(data_dir, transform=preprocess, class_descriptions=class_descriptions)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess_clip = clip.load("RN50", device=device)

# Define a list to store performance metrics and combined features
performance_metrics = []
all_combined_features = []

# Define the criterion for loss calculation
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_model(model, train_loader, device, num_epochs=4):
    model.train()  # Use training mode
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, texts, labels in train_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Removed torch.no_grad() block to allow gradients to flow
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)
            print(image_features.shape)
            print(text_features.shape)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)
            print(similarity_matrix)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

            # Save the combined features
            combined_features = torch.cat((image_features, text_features), dim=1).detach().cpu().numpy() # Detach the tensor from the computation graph
            all_combined_features.extend(combined_features)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions

        performance_metrics.append({
            'epoch': epoch,
            'loss': epoch_loss,
            'accuracy': epoch_accuracy
        })

        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Validation function
def validate_model(model, val_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in val_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Validation Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Testing function
def test_model(model, test_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in test_loader:
            texts = clip.tokenize(texts).to(device)

            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(test_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Test Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Train the model
train_model(model, train_loader, device, num_epochs=4)

# Validate the model
validate_model(model, val_loader, device)

# Test the model
test_model(model, test_loader, device)

# Save performance metrics to a CSV file
df_metrics = pd.DataFrame(performance_metrics)
output_metrics_path = '/content/drive/MyDrive/performance_metrics.csv'
df_metrics.to_csv(output_metrics_path, index=False)
print(f'Performance metrics saved to {output_metrics_path}')

# Save combined features (image and text) to a CSV file
df_combined_features = pd.DataFrame(all_combined_features)
output_combined_features_path = '/content/drive/MyDrive/combined_features3.csv'
df_combined_features.to_csv(output_combined_features_path, index=False)
print(f'Combined features saved to {output_combined_features_path}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          0.0501,  0.2703],
        [ 0.1969, -0.4933, -0.5156,  0.1969,  0.3168, -0.5156, -0.5156, -0.4933,
          0.9076, -0.5530],
        [ 0.5273, -0.8811, -0.8527,  0.5273,  0.8748, -0.8527, -0.8527, -0.8811,
          0.1398,  0.1833]], grad_fn=<MmBackward0>)
torch.Size([10, 1024])
torch.Size([10, 1024])
tensor([[ 0.6314,  0.6314, -0.7791,  0.6314, -0.1192, -0.1617,  0.5782, -0.3676,
          0.6314,  0.3694],
        [ 0.7885,  0.7885, -0.5997,  0.7885, -0.4434, -0.4679,  0.5077, -0.1131,
          0.7885,  0.2641],
        [-0.2294, -0.2294,  0.5901, -0.2294, -0.0350, -0.0678, -0.1182,  0.4849,
         -0.2294,  0.0512],
        [ 0.2496,  0.2496,  0.8785,  0.2496, -0.6437, -0.6406, -0.0374,  0.8288,
          0.2496,  0.0641],
        [-0.6490, -0.6490, -0.4181, -0.6490,  0.9776,  0.9726, -0.0530, -0.8439,
         -0.6490,  0.0283],
        [-0.8926, -0.8926,  0.2161, -0.8926,  0.8003,  0.8371, -0.4121, -0

Image Description with 1000 images (100 per class) With training testing and validation loss and accuracy

Code 2

In [9]:
# Install necessary packages
!pip install torch torchvision

import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import clip
import pandas as pd

# Define the path to your dataset
data_dir = '/content/drive/MyDrive/classes2'

# Define the transformation for the images
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the images to the size expected by the CLIP model
    transforms.ToTensor(),         # Convert the images to Tensors
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # Normalize the images
])

# Define text descriptions for each class
class_descriptions = {
    'AnnualCrop': "This is a photo of an annual crop",
    'Forest': "This is a photo of a forest",
    'HerbaceousVegetation': "This is a photo of herbaceous vegetation",
    'Highway': "This is a photo of a highway",
    'Industrial': "This is a photo of an industrial area",
    'Pasture': "This is a photo of a pasture",
    'PermanentCrop': "This is a photo of a permanent crop",
    'Residential': "This is a photo of a residential area",
    'River': "This is a photo of a river",
    'Sealake': "This is a photo of a sea or lake"
}

# Create a custom dataset that returns images and text descriptions
class CustomDataset(Dataset):
    def __init__(self, image_folder, transform, class_descriptions):
        self.dataset = datasets.ImageFolder(image_folder, transform=transform)
        self.class_descriptions = class_descriptions

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        class_name = self.dataset.classes[label]
        description = self.class_descriptions[class_name]
        return image, description, label

# Create the dataset using the custom dataset class
dataset = CustomDataset(data_dir, transform=preprocess, class_descriptions=class_descriptions)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess_clip = clip.load("RN50", device=device)

# Define a list to store performance metrics and combined features
performance_metrics = []

# Define the criterion for loss calculation
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_model(model, train_loader, device, num_epochs=4):
    model.train()  # Use training mode
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    batch_index = 0  # Initialize batch index

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, texts, labels in train_loader:
            texts = clip.tokenize(texts).to(device)
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Removed torch.no_grad() block to allow gradients to flow
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

            # Save the combined features with batch index
            combined_features = torch.cat((image_features, text_features), dim=1).detach().cpu().numpy()
            df_combined_features = pd.DataFrame(combined_features)
            df_combined_features['batch_index'] = batch_index
            output_combined_features_path = f'/content/drive/MyDrive/combined_features_batch_{batch_index}.csv'
            df_combined_features.to_csv(output_combined_features_path, index=False)
            batch_index += 1

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions

        performance_metrics.append({
            'epoch': epoch,
            'loss': epoch_loss,
            'accuracy': epoch_accuracy
        })

        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Validation function
def validate_model(model, val_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in val_loader:
            texts = clip.tokenize(texts).to(device)
            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Validation Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Testing function
def test_model(model, test_loader, device):
    model.eval()  # Use evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for images, texts, labels in test_loader:
            texts = clip.tokenize(texts).to(device)
            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)

            # Normalize the features
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            text_features = text_features / text_features.norm(dim=1, keepdim=True)

            # Calculate the similarity matrix
            similarity_matrix = torch.matmul(image_features, text_features.T)

            # Create labels
            target_labels = torch.arange(len(images)).to(device)

            # Calculate the loss
            loss = (criterion(similarity_matrix, target_labels) + criterion(similarity_matrix.T, target_labels)) / 2

            running_loss += loss.item() * images.size(0)

            # Calculate accuracy
            _, predicted = torch.max(similarity_matrix, 1)
            correct_predictions += (predicted == target_labels).sum().item()
            total_predictions += target_labels.size(0)

    epoch_loss = running_loss / len(test_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    print(f'Test Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Train the model
train_model(model, train_loader, device, num_epochs=4)

# Validate the model
validate_model(model, val_loader, device)

# Test the model
test_model(model, test_loader, device)

# Save performance metrics to a CSV file
df_metrics = pd.DataFrame(performance_metrics)
output_metrics_path = '/content/drive/MyDrive/performance_metrics.csv'
df_metrics.to_csv(output_metrics_path, index=False)
print(f'Performance metrics saved to {output_metrics_path}')


Epoch 0/3, Loss: 1.9015, Accuracy: 0.3187


  self.pid = os.fork()


Epoch 1/3, Loss: 1.7972, Accuracy: 0.3450


  self.pid = os.fork()


Epoch 2/3, Loss: 1.7498, Accuracy: 0.3975


  self.pid = os.fork()
  self.pid = os.fork()


Epoch 3/3, Loss: 1.7292, Accuracy: 0.4313


  self.pid = os.fork()
  self.pid = os.fork()


Validation Loss: 2.8161, Accuracy: 0.2100
Test Loss: 2.7828, Accuracy: 0.1800
Performance metrics saved to /content/drive/MyDrive/performance_metrics.csv
