<a href="https://colab.research.google.com/github/bindhureddy51/Vehicle-counting/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile
import os

# Path to the zip file
zip_file_path = '/content/drive/MyDrive/aichallenger.zip'

# Specify the folder where you want to extract the files
output_folder = '/content/extracted_folder/'

# Make sure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Open and extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

print(f'Files extracted to {output_folder}')


Files extracted to /content/extracted_folder/


In [5]:
!pip install torch torchvision transformers tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # For progress bar

# Feature Tokenizer Module: Combines CNN and Transformer
class FeatureTokenizer(nn.Module):
    def _init(self, input_channels=3, patch_size=16, embed_dim=768):  # Corrected __init_
        super(FeatureTokenizer, self)._init_()
        self.patch_size = patch_size
        self.conv = nn.Conv2d(input_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, (224 // patch_size) ** 2, embed_dim))  # Positional embeddings
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8)

    def forward(self, x):
        x = self.conv(x)  # Convolution to extract patches
        x = x.flatten(2).transpose(1, 2)  # Flatten and reorder for Transformer
        x = x + self.pos_embed  # Add positional embeddings
        x = self.transformer(x)
        return x

# Token Encoder Module: Multi-head attention and MLP
class TokenEncoder(nn.Module):
    def _init(self, embed_dim=768):  # Corrected __init_
        super(TokenEncoder, self)._init_()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )

    def forward(self, x):
        attn_out, _ = self.mha(x, x, x)
        x = self.norm1(x + attn_out)
        mlp_out = self.mlp(x)
        x = self.norm2(x + mlp_out)
        return x

# Multi-Label Decoder Module with Residual Connections
class MultiLabelDecoder(nn.Module):
    def _init_(self, embed_dim=768, num_classes=61):  # Set for 61 classes
        super(MultiLabelDecoder, self)._init_()
        self.mha_self = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.mha_cross = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x, context):
        self_attn_out, _ = self.mha_self(x, x, x)
        x = self.norm1(x + self_attn_out)
        cross_attn_out, _ = self.mha_cross(x, context, context)
        x = self.norm2(x + cross_attn_out)
        x = self.mlp(x)
        x = self.fc(x.mean(dim=1))  # Global average pooling
        return x

# Main LDI-NET Architecture
class LDINet(nn.Module):
    def _init_(self, input_channels=3, embed_dim=768, patch_size=16, num_classes=61):  # Adjusted for 61 classes
        super(LDINet, self)._init_()
        self.feature_tokenizer = FeatureTokenizer(input_channels, patch_size, embed_dim)
        self.token_encoder = TokenEncoder(embed_dim)
        self.multi_label_decoder = MultiLabelDecoder(embed_dim, num_classes)

    def forward(self, x):
        tokens = self.feature_tokenizer(x)
        encoded_tokens = self.token_encoder(tokens)
        out = self.multi_label_decoder(encoded_tokens, tokens)
        return out

# Dataset Preparation
def prepare_dataloader(data_path, batch_size=32):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = datasets.ImageFolder(data_path, transform=transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

# Training Function with Progress Bar and Logging
import torch.nn.functional as F

def train_model(model, train_loader, val_loader, device, num_epochs=10):
    criterion = nn.BCEWithLogitsLoss()  # For multi-label classification
    optimizer = optim.AdamW(model.parameters(), lr=1e-4)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        loop = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}")

        for i, (inputs, labels) in loop:
            inputs, labels = inputs.to(device), labels.to(device)

            # Convert labels to one-hot encoding
            labels_one_hot = F.one_hot(labels, num_classes=61).float()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels_one_hot)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {running_loss / len(train_loader):.4f}")
        validate_model(model, val_loader, device)


# Validation Function
def validate_model(model, val_loader, device):
    model.eval()
    preds, labels_list = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)
            outputs = model(inputs)
            preds.append((outputs > 0.5).cpu())  # Threshold for multi-label
            labels_list.append(labels.cpu())
    preds = torch.cat(preds)
    labels_list = torch.cat(labels_list)
    accuracy = accuracy_score(labels_list.numpy(), preds.numpy())
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Main Script
if __name__ == "_main_":
    # Paths and Device Configuration
    train_data_path = "/content/extracted_folder/aichallenger/train"  # Update with the actual path
    val_data_path = "/content/extracted_folder/aichallenger/val"      # Update with the actual path
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare DataLoaders
    train_loader = prepare_dataloader(train_data_path, batch_size=32)
    val_loader = prepare_dataloader(val_data_path, batch_size=32)

    # Model Initialization
    model = LDINet(input_channels=3, embed_dim=768, patch_size=16, num_classes=61)  # 61 classes
    model.to(device)

    # Train and Validate
    train_model(model, train_loader, val_loader, device, num_epochs=10)




In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import os

# Feature Tokenizer Module: Combines CNN and Transformer
class FeatureTokenizer(nn.Module):
    def __init__(self, input_channels=3, patch_size=16, embed_dim=768):  # Corrected __init__
        super(FeatureTokenizer, self).__init__()
        self.patch_size = patch_size
        self.conv = nn.Conv2d(input_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, (224 // patch_size) ** 2, embed_dim))  # Positional embeddings
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8)

    def forward(self, x):
        x = self.conv(x)  # Convolution to extract patches
        x = x.flatten(2).transpose(1, 2)  # Flatten and reorder for Transformer
        x = x + self.pos_embed  # Add positional embeddings
        x = self.transformer(x)
        return x

# Token Encoder Module: Multi-head attention and MLP
class TokenEncoder(nn.Module):
    def __init__(self, embed_dim=768):  # Corrected __init__
        super(TokenEncoder, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )

    def forward(self, x):
        attn_out, _ = self.mha(x, x, x)
        x = self.norm1(x + attn_out)
        mlp_out = self.mlp(x)
        x = self.norm2(x + mlp_out)
        return x

# Multi-Label Decoder Module with Residual Connections
class MultiLabelDecoder(nn.Module):
    def __init__(self, embed_dim=768, num_classes=61):  # Set for 61 classes
        super(MultiLabelDecoder, self).__init__()
        self.mha_self = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.mha_cross = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x, context):
        self_attn_out, _ = self.mha_self(x, x, x)
        x = self.norm1(x + self_attn_out)
        cross_attn_out, _ = self.mha_cross(x, context, context)
        x = self.norm2(x + cross_attn_out)
        x = self.mlp(x)
        x = self.fc(x.mean(dim=1))  # Global average pooling
        return x

# Main LDI-NET Architecture
class LDINet(nn.Module):
    def __init__(self, num_classes=61):  # Only passing num_classes for simplicity
        super(LDINet, self).__init__()
        self.feature_tokenizer = FeatureTokenizer(input_channels=3, patch_size=16, embed_dim=768)
        self.token_encoder = TokenEncoder(embed_dim=768)
        self.multi_label_decoder = MultiLabelDecoder(embed_dim=768, num_classes=num_classes)

    def forward(self, x):
        tokens = self.feature_tokenizer(x)
        encoded_tokens = self.token_encoder(tokens)
        out = self.multi_label_decoder(encoded_tokens, tokens)
        return out

# Dataset Preparation for Unlabeled Test Data
class UnlabeledImageDataset(torch.utils.data.Dataset):
    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        self.image_paths = [os.path.join(image_folder, fname) for fname in os.listdir(image_folder)]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)
        return image, image_path

def prepare_unlabeled_test_dataloader(data_path, batch_size=32):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = UnlabeledImageDataset(data_path, transform=transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return dataloader, dataset

# Generate Predictions
def generate_predictions(model, test_loader, device):
    model.eval()
    predictions = []
    image_paths = []
    with torch.no_grad():
        for inputs, paths in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            predictions.append(preds)
            image_paths.extend(paths)
    predictions = np.vstack(predictions)
    return predictions, image_paths

# Paths and Device Configuration
test_data_path = "/content/extracted_folder/aichallenger/testA"  # Update with the actual path
model_checkpoint_path = "model_checkpoint.pth"  # Path to the trained model checkpoint
output_predictions_path = "test_predictions.npy"
output_image_paths_file = "test_image_paths.txt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare DataLoader
test_loader, dataset = prepare_unlabeled_test_dataloader(test_data_path, batch_size=32)
model_checkpoint_path = "/content/model_checkpoint.pth"
torch.save(model.state_dict(), 'model_checkpoint.pth')

import os

# Correct model checkpoint path
  # Ensure the correct path here

# Check if the checkpoint file exists
if not os.path.isfile(model_checkpoint_path):
    raise FileNotFoundError(f"Model checkpoint file not found at {model_checkpoint_path}")

# Load the Trained Model
model = LDINet(num_classes=61)  # Adjusted for the correct number of classes
model.load_state_dict(torch.load(model_checkpoint_path, weights_only=True))  # Use weights_only=True for safety
model.to(device)


# Generate Predictions
predictions, image_paths = generate_predictions(model, test_loader, device)

# Save Predictions
np.save(output_predictions_path, predictions)
with open(output_image_paths_file, "w") as f:
    f.writelines("\n".join(image_paths))

print(f"Predictions saved to {output_predictions_path}")
print(f"Image paths saved to {output_image_paths_file}")





Predictions saved to test_predictions.npy
Image paths saved to test_image_paths.txt


In [28]:
import torch
from PIL import Image
from torchvision import transforms
import numpy as np

# Define your model checkpoint path
model_checkpoint_path = "/content/model_checkpoint.pth"  # Adjust to your model's path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the transformation for a single image (same as for the batch data)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to match the model input
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the single test image
image_path = "/content/extracted_folder/aichallenger/testA/00dd183fdc49db6f61af5a9de0e3d8ab.jpg"  # Path to the single image you want to predict
image = Image.open(image_path)

# Apply the transformation to the image
image_tensor = transform(image).unsqueeze(0).to(device)  # Add batch dimension

# Load the pre-trained model
model = LDINet(num_classes=61)  # Adjust the number of classes if needed
model.load_state_dict(torch.load(model_checkpoint_path, weights_only=True))  # Load model weights safely
model.to(device)
model.eval()  # Set model to evaluation mode

# Generate Prediction for the Single Image
with torch.no_grad():  # No need to track gradients for inference
    outputs = model(image_tensor)  # Pass the image through the model
    predictions = torch.sigmoid(outputs).cpu().numpy()  # Apply sigmoid for multi-label classification
    predicted_class = np.argmax(predictions, axis=1)  # Get the class with the highest probability

# Print the prediction results
print(f"Predicted class index: {predicted_class[0]}")
print(f"Prediction probabilities: {predictions}")


Predicted class index: 59
Prediction probabilities: [[0.51108456 0.5134737  0.5041784  0.4948417  0.50098866 0.50641245
  0.49790868 0.48643762 0.48276508 0.4984841  0.48687145 0.5077355
  0.47892025 0.5126781  0.48684838 0.50813264 0.5232973  0.49734095
  0.50253594 0.4988868  0.5016016  0.49682796 0.4922758  0.4826482
  0.4691294  0.5176645  0.5154398  0.5244732  0.48125678 0.50134826
  0.50305647 0.50175256 0.48737222 0.51005673 0.478794   0.49576375
  0.4785461  0.50519496 0.47160023 0.49935567 0.49944606 0.50430024
  0.4800299  0.5046772  0.4882057  0.48407388 0.47247103 0.49540246
  0.49930015 0.5215208  0.4758738  0.49935925 0.501565   0.51034516
  0.52055764 0.50938773 0.5131076  0.4993666  0.49814805 0.53190243
  0.5114557 ]]


In [30]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer  # For multi-label encoding

# Adjusted Dataset class for validation with folder structure
class ValidatedImageDataset(Dataset):
    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        self.image_paths = []
        self.labels = []
        self.classes = os.listdir(image_folder)
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        for cls in self.classes:
            class_folder = os.path.join(image_folder, cls)
            for image_name in os.listdir(class_folder):
                image_path = os.path.join(class_folder, image_name)
                self.image_paths.append(image_path)
                # Assigning the label for each image (binary vector for multi-label classification)
                label = [0] * len(self.classes)
                label[self.class_to_idx[cls]] = 1
                self.labels.append(label)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, np.array(label), image_path  # Return image, label, and path

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Prepare validation dataset and dataloader
validation_data_path = '/content/extracted_folder/aichallenger/val'  # Update to the actual validation data path
validation_dataset = ValidatedImageDataset(image_folder=validation_data_path, transform=transform)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

# Model evaluation and accuracy calculation
def calculate_accuracy(predictions, ground_truth_labels):
    predicted_labels = (predictions > 0.5).astype(int)  # Threshold at 0.5 for multi-label classification
    correct_predictions = (predicted_labels == ground_truth_labels).astype(int)
    accuracy = correct_predictions.mean()  # Mean of correct predictions
    return accuracy

# Evaluate the model on the validation set
model.eval()  # Set the model to evaluation mode
predictions = []
ground_truth_labels = []
image_paths = []

with torch.no_grad():
    for inputs, labels, paths in validation_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)  # Ensure labels are on the same device
        outputs = model(inputs)
        preds = torch.sigmoid(outputs).cpu().numpy()

        predictions.append(preds)
        ground_truth_labels.append(labels.cpu().numpy())  # Store true labels
        image_paths.extend(paths)

# Convert predictions and labels to numpy arrays for accuracy calculation
predictions = np.vstack(predictions)
ground_truth_labels = np.vstack(ground_truth_labels)

# Calculate accuracy
accuracy = calculate_accuracy(predictions, ground_truth_labels)
print(f"Validation Multi-label Accuracy: {accuracy:.4f}")



Validation Multi-label Accuracy: 0.5346
