In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.vision_transformer import vit_b_16, ViT_B_16_Weights
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import cv2
import numpy as np
from PIL import Image
import os

In [5]:
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [6]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # ViT expects 224x224
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
class MultimodalCropDataset(Dataset):
    def __init__(self, df, transform=None, scaler=None):
        self.df = df.copy()
        self.transform = transform
        self.scaler = scaler if scaler else StandardScaler()
        
        # Standardize numerical features (excluding 'image_path' and 'Label')
        numerical_cols = self.df.columns[:-2]  # Assuming last two are 'image_path' and 'Label'
        if scaler is None:
            self.df[numerical_cols] = self.scaler.fit_transform(self.df[numerical_cols])
        else:
            self.df[numerical_cols] = self.scaler.transform(self.df[numerical_cols])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df.iloc[idx]["image_path"]
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        if self.transform:
            image = self.transform(image)

        numerical_features = pd.to_numeric(self.df.iloc[idx, :-2], errors='coerce').values.astype('float32')
        numerical_features = torch.tensor(numerical_features, dtype=torch.float32)
        label = torch.tensor(self.df.iloc[idx]["Label"], dtype=torch.long)
        return image, numerical_features, label

In [8]:
class MultimodalViT(nn.Module):
    def __init__(self, num_numerical_features, num_classes, dropout_rate=0.3):
        super(MultimodalViT, self).__init__()
        
        # Pretrained Vision Transformer
        self.vit = vit_b_16(weights=ViT_B_16_Weights.DEFAULT)
        self.vit.heads = nn.Identity()  # Remove default classification head
        
        # Numerical feature encoder
        self.num_encoder = nn.Sequential(
            nn.Linear(num_numerical_features, 128),
            nn.ReLU(),
            nn.LayerNorm(128),
            nn.Dropout(dropout_rate)
        )
        
        # Cross-attention layer
        self.cross_attention = nn.MultiheadAttention(embed_dim=768, num_heads=8)  # ViT output dim is 768
        self.cls_token = nn.Parameter(torch.randn(1, 1, 768))  # Learnable classification token
        
        # Classification head
        self.fc = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, image, numerical_features):
        # Extract image features from ViT
        img_features = self.vit(image)  # Shape: (batch_size, 768)
        img_features = img_features.unsqueeze(0)  # Shape: (1, batch_size, 768)
        
        # Process numerical features
        num_features = self.num_encoder(numerical_features)  # Shape: (batch_size, 128)
        num_features = num_features.unsqueeze(0)  # Shape: (1, batch_size, 128)
        
        # Pad numerical features to match ViT embedding size
        num_features = nn.functional.pad(num_features, (0, 768 - 128))  # Shape: (1, batch_size, 768)
        
        # Add CLS token for classification
        batch_size = image.size(0)
        cls_tokens = self.cls_token.expand(-1, batch_size, -1)  # Shape: (1, batch_size, 768)
        
        # Combine features for cross-attention
        combined_features = torch.cat((cls_tokens, img_features, num_features), dim=0)  # Shape: (3, batch_size, 768)
        
        # Apply cross-attention
        attn_output, _ = self.cross_attention(cls_tokens, combined_features, combined_features)
        
        # Final classification
        output = self.fc(attn_output.squeeze(0))  # Shape: (batch_size, num_classes)
        return output

In [9]:
csv_file = r"C:\Users\Samridhaa\OneDrive\Desktop\New_DL\mapped_data_with_images.csv"
df = pd.read_csv(csv_file)

# Encode labels
label_encoder = LabelEncoder()
df["Label"] = label_encoder.fit_transform(df["Label"])
num_classes = len(label_encoder.classes_)

In [10]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["Label"], random_state=42)

# Create datasets
train_dataset = MultimodalCropDataset(train_df, transform=transform)
test_dataset = MultimodalCropDataset(test_df, transform=transform, scaler=train_dataset.scaler)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True)

print(f"Train set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 1760
Test set size: 440


In [14]:
print(torch.cuda.is_available())  # Should print True if GPU is detected
print(torch.cuda.current_device())  # Should print 0 (or device index)
print(torch.cuda.get_device_name(0))  # Should print your GPU name

True
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalViT(num_numerical_features=7, num_classes=num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

In [10]:
import time 
start_time = time.time()
sample = train_dataset[0]  # Change this to a random index if needed
end_time = time.time()

print(f"Time taken for single sample: {end_time - start_time:.2f} seconds")


Time taken for single sample: 0.00 seconds


In [11]:
# Check sample batch
images, num_features, labels = next(iter(train_loader))
print(f"Image batch shape: {images.shape}")
print(f"Numerical features shape: {num_features.shape}")
print(f"Labels shape: {labels.shape}")

Image batch shape: torch.Size([16, 3, 224, 224])
Numerical features shape: torch.Size([16, 7])
Labels shape: torch.Size([16])


In [13]:
num_epochs = 15
best_acc = 0.0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for images, num_features, labels in train_loader:
        images, num_features, labels = images.to(device), num_features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images, num_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    train_loss = train_loss / len(train_dataset)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}")


Epoch [1/15], Loss: 0.4946
Epoch [2/15], Loss: 0.0532
Epoch [3/15], Loss: 0.0232
Epoch [4/15], Loss: 0.1023
Epoch [5/15], Loss: 0.0718
Epoch [6/15], Loss: 0.0408
Epoch [7/15], Loss: 0.0124
Epoch [8/15], Loss: 0.0305
Epoch [9/15], Loss: 0.0070
Epoch [10/15], Loss: 0.0050
Epoch [11/15], Loss: 0.0042
Epoch [12/15], Loss: 0.0039
Epoch [13/15], Loss: 0.0034
Epoch [14/15], Loss: 0.0030
Epoch [15/15], Loss: 0.0027


In [13]:

model.eval()
val_loss = 0.0
preds, targets = [], []
with torch.no_grad():
    for images, num_features, labels in test_loader:
        images, num_features, labels = images.to(device), num_features.to(device), labels.to(device)
        outputs = model(images, num_features)
        loss = criterion(outputs, labels)
        val_loss += loss.item() * images.size(0)
            
        _, predicted = torch.max(outputs, 1)
        preds.extend(predicted.cpu().numpy())
        targets.extend(labels.cpu().numpy())
    
val_loss = val_loss / len(test_dataset)
val_acc = accuracy_score(targets, preds)
    
print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")
    
    # Save best model
if val_acc > best_acc:
    best_acc = val_acc
    torch.save(model.state_dict(), "vit_multimodal_best.pth")
    print(f"Saved best model with accuracy: {best_acc:.4f}")

print("Training completed!")

NameError: name 'model' is not defined

In [16]:
def predict_vit(image_path, numerical_features=None, model_path="vit_multimodal_best.pth"):
    # Load model
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    
    # Process image
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Image not found: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(image)
    image = transform(image).unsqueeze(0).to(device)
    
    # Process numerical features
    if numerical_features is None:
        numerical_features = np.zeros(7)  # Default if not provided
    numerical_features = train_dataset.scaler.transform(numerical_features.reshape(1, -1))[0]
    numerical_features = torch.tensor(numerical_features, dtype=torch.float32).unsqueeze(0).to(device)
    
    # Predict
    with torch.no_grad():
        output = model(image, numerical_features)
        predicted_class = torch.argmax(output, dim=1).item()
    
    # Map back to class name
    index_to_class = {v: k for k, v in dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))).items()}
    return index_to_class[predicted_class]

In [18]:
image_path = r"C:\Users\Samridhaa\OneDrive\Desktop\New_DL\Test\0a5e9323-dbad-432d-ac58-d291718345d9___FREC_Scab 3417.JPG"
try:
    # Example with dummy numerical features
    result = predict_vit(image_path)
    print(f"Prediction (without numerical features): {result}")
    
    # Example with sample numerical features
    sample_num_features = np.array([0.5, 1.2, 0.8, 2.1, 0.9, 1.5, 0.3])  # Replace with real data if available
    result = predict_vit(image_path, sample_num_features)
    print(f"Prediction (with numerical features): {result}")
except Exception as e:
    print(f"Error during inference: {e}")

  model.load_state_dict(torch.load(model_path, map_location=device))
  model.load_state_dict(torch.load(model_path, map_location=device))


Prediction (without numerical features): Apple___Cedar_apple_rust
Prediction (with numerical features): Apple___Cedar_apple_rust




In [19]:
class_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
print(class_mapping)

{0: 'Apple___Apple_scab', 1: 'Apple___Black_rot', 2: 'Apple___Cedar_apple_rust', 3: 'Apple___healthy', 4: 'Blueberry___healthy', 5: 'Cherry_(including_sour)___Powdery_mildew', 6: 'Cherry_(including_sour)___healthy', 7: 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot', 8: 'Corn_(maize)___Common_rust_', 9: 'Corn_(maize)___Northern_Leaf_Blight', 10: 'Corn_(maize)___healthy', 11: 'Grape___Black_rot', 12: 'Grape___Esca_(Black_Measles)', 13: 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)', 14: 'Grape___healthy', 15: 'Orange___Haunglongbing_(Citrus_greening)', 16: 'Peach___Bacterial_spot', 17: 'Peach___healthy', 18: 'Pepper,_bell___Bacterial_spot', 19: 'Pepper,_bell___healthy', 20: 'Potato___Early_blight', 21: 'Potato___healthy'}


In [None]:
torch.save(model.state_dict(), "vit_multimodal_final.pth")
print("Final model saved as 'vit_multimodal_final.pth'")