## Model architecture definition

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch

class EffNetLSTM(nn.Module):
    """EfficientNet-B0 feature extractor followed by a bidirectional LSTM classifier.

    The CNN feature map is reduced from 1280 to 512 channels, flattened
    (row by row) into a sequence of spatial positions and passed through a
    2-layer bidirectional LSTM. The final hidden states of the last LSTM layer
    (forward and backward direction) are concatenated and classified.

    Args:
        num_classes: Number of output logits.
        pretrained: When True (default, the original behaviour) the backbone
            is initialised with the IMAGENET1K_V1 weights. Pass False to skip
            that initialisation, e.g. when a full checkpoint is restored with
            ``load_state_dict`` right after construction.
    """

    def __init__(self, num_classes, pretrained=True):
        super().__init__()

        # EfficientNet-B0 backbone (outputs 1280 channels)
        weights = models.EfficientNet_B0_Weights.IMAGENET1K_V1 if pretrained else None
        effnet = models.efficientnet_b0(weights=weights)
        self.cnn = effnet.features

        # 1x1 convolution shrinks the channel dimension to the LSTM input size.
        self.channel_reducer = nn.Sequential(
            nn.Conv2d(1280, 512, kernel_size=1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        # Input width 512 = 2 directions x 256 hidden units.
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of images.

        Args:
            x: Image batch accepted by the CNN backbone.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.
        """
        features = self.cnn(x)

        x = self.channel_reducer(features)
        bs, c, h, w = x.size()
        # (bs, c, h, w) -> (bs, h*w, c): every spatial position becomes one time step.
        x = x.permute(0, 2, 3, 1).reshape(bs, h * w, c)

        lstm_out, (h_n, c_n) = self.lstm(x)
        # h_n[-2] / h_n[-1] are the last layer's forward / backward final states.
        last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.classifier(last_hidden)

## Loading data

In [2]:
import cv2
import pandas as pd
import torch
import numpy as np

# Number of target classes; sizes the model's output layer and the one-hot labels.
num_classes = 10

In [3]:
# Validation split: the CSV is read without a header row and its two columns are named here.
df_val = pd.read_csv('wikiart_csv/genre_val.csv',header=None, names=["image_path", "genre_id"])

In [4]:
# Pull the two columns out as arrays; they are handed to the dataset wrapper below.
val_images = df_val['image_path'].values
val_labels = df_val['genre_id'].values

## Creating dataset class and dataloader

In [5]:
# Image preprocessing helper and dataset class for the validation dataloader

# Per-channel mean/std (RGB order) used for normalisation; these are the
# standard ImageNet statistics. Built once instead of on every call.
_IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
_IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)


def get_image(image_path, image_size=224):
    """Load an image from ``./wikiart/`` and return a normalised CHW tensor.

    Steps: read with OpenCV, convert BGR -> RGB, resize so the shorter side is
    256 px (or ``image_size`` when that is larger), centre-crop to
    ``image_size`` x ``image_size``, scale to [0, 1] and normalise per channel.

    Args:
        image_path: Path relative to the ``./wikiart/`` directory.
        image_size: Side length of the square crop.

    Returns:
        float32 tensor of shape (3, image_size, image_size). On any failure
        the error is printed and an all-zero tensor of that shape is returned,
        so one unreadable file does not abort the whole batch.
    """
    try:
        img = cv2.imread('./wikiart/' + image_path)
        if img is None:
            raise ValueError(f"Image not loaded: ./wikiart/{image_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w, _ = img.shape
        # The resized image must never be smaller than the crop; with a fixed
        # 256 px target, image_size >= 256 produced a wrong/undersized crop.
        target_side = max(256, image_size)
        scale = target_side / min(h, w)
        # int() truncation can land one pixel short of the target; clamp so
        # the crop window always fits.
        new_w = max(image_size, int(w * scale))
        new_h = max(image_size, int(h * scale))
        img_resized = cv2.resize(img, (new_w, new_h))
        start_x = (new_w - image_size) // 2
        start_y = (new_h - image_size) // 2
        img_cropped = img_resized[start_y:start_y+image_size, start_x:start_x+image_size]
        img_cropped = img_cropped.astype(np.float32) / 255.0
        # HWC -> CHW
        img_tensor = torch.from_numpy(img_cropped).permute(2, 0, 1)
        img_tensor = (img_tensor - _IMAGENET_MEAN) / _IMAGENET_STD
        return img_tensor
    except Exception as e:
        # Deliberate best-effort fallback: report and substitute a blank image.
        print(f"Error processing {image_path}: {e}")
        return torch.zeros(3, image_size, image_size)

class WikiArtDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(image entry, one-hot label)`` pairs.

    ``__getitem__`` returns the stored entry of ``images`` unchanged (no image
    decoding happens here) together with a float one-hot label vector.

    Args:
        images: Indexable collection of image entries.
        labels: Indexable collection of integer class ids aligned with ``images``.
        num_classes: Length of the one-hot vector. When None (default) the
            module-level ``num_classes`` is looked up at access time, which is
            the original behaviour.
    """

    def __init__(self, images, labels, num_classes=None):
        self.images = images
        self.labels = labels
        self.num_classes = num_classes

    def __len__(self):
        """Number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(images[idx], one_hot(labels[idx]))``."""
        image = self.images[idx]
        # Fall back to the notebook-level global only when no explicit size was given.
        n_classes = self.num_classes if self.num_classes is not None else num_classes
        # label should be a one-hot encoded vector
        label = torch.zeros(n_classes)
        label[self.labels[idx]] = 1

        return image, label

# train_dataset = WikiArtDataset(train_images, train_labels)
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
# Wrap the validation arrays and iterate them in fixed order, 128 samples per batch.
val_dataset = WikiArtDataset(val_images, val_labels)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=128, shuffle=False)

## Model declaration and loading the stored weights

In [6]:
# Load the model
model = EffNetLSTM(num_classes=num_classes)
# map_location='cpu' makes deserialisation independent of the device the
# checkpoint was saved from; the model is moved to the GPU afterwards.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
state_dict = torch.load('effnet_rcnn_epoch_11_genre.pth', map_location='cpu')
model.load_state_dict(state_dict)
model = model.cuda()

In [7]:
# Metrics to evaluate model performance

def top_1_accuracy(outputs, labels):
    """Fraction of rows whose highest-scoring class equals the labelled class.

    Args:
        outputs: (batch, classes) score tensor.
        labels: (batch, classes) one-hot label tensor.

    Returns:
        Python float in [0, 1].
    """
    predicted_idx = outputs.max(dim=1).indices
    target_idx = labels.max(dim=1).indices
    hits = predicted_idx.eq(target_idx).sum().item()
    batch_size = labels.shape[0]
    return hits / batch_size

def top_5_accuracy(outputs, labels, k=5):
    """Fraction of rows whose labelled class is among the ``k`` highest scores.

    Args:
        outputs: (batch, classes) score tensor.
        labels: (batch, classes) one-hot label tensor.
        k: Number of top predictions considered (default 5). Clamped to the
            number of classes so the call does not fail for small label sets.

    Returns:
        Python float in [0, 1].
    """
    k = min(k, outputs.shape[1])
    _, predicted = torch.topk(outputs, k, dim=1)
    actual = labels.max(dim=1).indices
    # Compare every top-k column against the true class in one vectorised step
    # instead of a per-row Python membership test.
    correct = (predicted == actual.unsqueeze(1)).any(dim=1).sum().item()
    return correct / labels.shape[0]

# Evaluate the model on the validation set

model.eval()
top1_acc = 0
top5_acc = 0
num_batches = 0
num_samples = 0
for image_paths, labels in val_loader:
    with torch.no_grad():
        images = torch.stack([get_image(image_path) for image_path in image_paths])
        images = images.cuda()
        outputs = model(images)
        labels = labels.cuda()
        # Weight each batch by its size: a plain mean of per-batch accuracies
        # over-weights a smaller final batch.
        batch_size = labels.shape[0]
        top1_acc += top_1_accuracy(outputs, labels) * batch_size
        top5_acc += top_5_accuracy(outputs, labels) * batch_size
        num_batches += 1
        num_samples += batch_size

top1_acc /= num_samples
top5_acc /= num_samples
print(f"Top-1 accuracy: {top1_acc:.2f}")
print(f"Top-5 accuracy: {top5_acc:.2f}")


Top-1 accuracy: 0.75
Top-5 accuracy: 0.98


## Outlier detection based on embeddings and confidence of prediction

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import cv2

# Module form of softmax over dimension 1.
# NOTE(review): the visible cells call torch.nn.functional.softmax directly, so
# this name looks unused — confirm before removing.
softmax = nn.Softmax(dim=1)

def detect_outliers(outputs, labels, image_paths, threshold=0.5):
    """Collect samples whose top softmax probability is below ``threshold``.

    Args:
        outputs: (batch, classes) logit tensor.
        labels: (batch, classes) one-hot label tensor.
        image_paths: Sequence of identifiers, one per row of ``outputs``.
        threshold: Confidence cut-off; rows strictly below it are reported.

    Returns:
        List of ``(image_path, predicted_class, actual_class, confidence)``
        tuples in input order.
    """
    top = outputs.softmax(dim=1).max(dim=1)
    confidences = top.values.tolist()
    predicted = top.indices.tolist()
    actual = labels.max(dim=1).indices.tolist()

    # Keep only the low-confidence rows, preserving batch order.
    return [
        (image_paths[row], predicted[row], actual[row], confidences[row])
        for row in range(len(image_paths))
        if confidences[row] < threshold
    ]


def get_embeddings(model, images):
    """Return one embedding per image: the time-averaged LSTM output.

    Runs ``model.cnn`` -> ``model.channel_reducer`` -> ``model.lstm`` without
    gradient tracking and averages the LSTM outputs over the sequence
    (spatial position) dimension.

    Args:
        model: Object exposing ``cnn``, ``channel_reducer`` and ``lstm``.
        images: Image batch accepted by ``model.cnn``.

    Returns:
        NumPy array of shape (batch, lstm_output_features).
    """
    with torch.no_grad():
        features = model.cnn(images)

        x = model.channel_reducer(features)
        bs, c, h, w = x.size()
        # (bs, c, h, w) -> (bs, h*w, c): one time step per spatial position.
        x = x.permute(0, 2, 3, 1).reshape(bs, h * w, c)

        # Single LSTM pass; the original ran it twice and discarded the first result.
        lstm_out, _ = model.lstm(x)
        return lstm_out.mean(dim=1).cpu().numpy()

# Load model
num_classes = 10
model = EffNetLSTM(num_classes=num_classes)
# map_location='cpu' makes deserialisation independent of the device the
# checkpoint was saved from; the model is moved to the GPU afterwards.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
state_dict = torch.load('effnet_rcnn_epoch_11_genre.pth', map_location='cpu')
model.load_state_dict(state_dict)
model = model.cuda()
model.eval()

# Accumulators filled batch by batch over the validation loader.
outlier_samples = []   # tuples returned by detect_outliers
all_embeddings = []    # per-batch arrays from get_embeddings, stacked after the loop
all_labels = []        # one label row per sample
all_image_paths = []   # image paths in iteration order
predictions = []       # predicted class index per sample

for image_paths, labels in val_loader:
    with torch.no_grad():
        # Decode and preprocess the batch's images from their paths.
        images = torch.stack([get_image(image_path) for image_path in image_paths])
        images = images.cuda()
        outputs = model(images)
        labels = labels.cuda()
        
        # Detect low-confidence outliers (default threshold of detect_outliers)
        outliers = detect_outliers(outputs, labels, image_paths)
        outlier_samples.extend(outliers)
        
        # Predicted class = index of the largest softmax probability.
        t = torch.nn.functional.softmax(outputs, dim=1)
        max_probs, predicted_classes = torch.max(t, dim=1)
        predictions.extend(predicted_classes.cpu().numpy())
        # Collect embeddings for the similarity analysis below
        # NOTE(review): get_embeddings runs model.cnn on the same images again.
        embeddings = get_embeddings(model, images)
        all_embeddings.append(embeddings)
        all_labels.extend(labels.cpu().numpy())
        all_image_paths.extend(image_paths)

# One row per sample.
all_embeddings = np.vstack(all_embeddings)

# Pairwise cosine similarity between all embeddings; the 10 samples with the
# lowest mean similarity to the rest are treated as embedding-based outliers.
similarity_matrix = cosine_similarity(all_embeddings)
outlier_indices = np.argsort(np.mean(similarity_matrix, axis=1))[:10]

# Report both outlier sets. "Accuracy" here is the share of flagged samples
# that the model actually misclassified.
print("Low-confidence outliers:")
outlier_accuracy = 0
for path, pred, actual, conf in outlier_samples:
    outlier_accuracy += (pred != actual)
    print(f"Image: {path}, Predicted: {pred}, Actual: {actual}, Confidence: {conf:.2f}")

# Guard against an empty list: dividing by zero would abort the cell.
if outlier_samples:
    outlier_accuracy /= len(outlier_samples)
    print(f"Outlier detection accuracy by confidence: {outlier_accuracy:.2f}")
else:
    print("No low-confidence outliers found.")

print("Embedding-based outliers:")
outlier_accuracy = 0
for idx in outlier_indices:
    outlier_accuracy += (predictions[idx] != all_labels[idx].argmax())
    print(f"Image: {all_image_paths[idx]}, Predicted: {predictions[idx]}, Actual: {all_labels[idx].argmax()}")

if len(outlier_indices) > 0:
    outlier_accuracy /= len(outlier_indices)
    print(f"Outlier detection accuracy by embeddings: {outlier_accuracy:.2f}")
else:
    print("No embedding-based outliers found.")


Low-confidence outliers:
Image: Realism/camille-corot_the-monk-1874.jpg, Predicted: 2, Actual: 2, Confidence: 0.44
Image: Cubism/arthur-segal_die-melkerin.jpg, Predicted: 2, Actual: 2, Confidence: 0.33
Image: Naive_Art_Primitivism/marc-chagall_dance-1962.jpg, Predicted: 2, Actual: 2, Confidence: 0.48
Image: Naive_Art_Primitivism/edith-vonnegut_falling.jpg, Predicted: 6, Actual: 2, Confidence: 0.45
Image: Realism/nikolay-bogdanov-belsky_to-work.jpg, Predicted: 2, Actual: 2, Confidence: 0.47
Image: Naive_Art_Primitivism/niko-pirosmani_the-temple-festival-in-bolnisi.jpg, Predicted: 2, Actual: 2, Confidence: 0.35
Image: Symbolism/nicholas-roerich_fires-of-victory-sentinel-lights-on-the-towers-in-gobi-1940.jpg, Predicted: 7, Actual: 2, Confidence: 0.44
Image: Impressionism/henri-martin_woman-by-the-artist.jpg, Predicted: 2, Actual: 2, Confidence: 0.40
Image: Naive_Art_Primitivism/marc-chagall_the-avenue-of-opera-1969.jpg, Predicted: 7, Actual: 2, Confidence: 0.40
Image: Realism/vasily-polen

## Statistical method for outlier detection based on Mahalanobis distance

In [None]:
import numpy as np

# Compute mean and covariance matrix for embeddings
mean_vec = np.mean(all_embeddings, axis=0)
cov_matrix = np.cov(all_embeddings, rowvar=False)
# Use the pseudo-inverse: np.linalg.inv raises on a singular covariance matrix
# and is unstable on an ill-conditioned one; for a well-conditioned matrix the
# pseudo-inverse agrees with the ordinary inverse.
inv_cov_matrix = np.linalg.pinv(cov_matrix)

def mahalanobis_distance(x, mean, inv_cov):
    """Mahalanobis distance of ``x`` from ``mean`` given an inverse covariance.

    Computes ``sqrt((x - mean) @ inv_cov @ (x - mean).T)``.

    Args:
        x: Sample vector.
        mean: Mean vector with the same shape as ``x``.
        inv_cov: Inverse (or pseudo-inverse) of the covariance matrix.

    Returns:
        The distance as a NumPy scalar (for 1-D ``x``).
    """
    delta = x - mean
    squared = np.dot(np.dot(delta, inv_cov), delta.T)
    # Round-off with an ill-conditioned inv_cov can make the quadratic form
    # slightly negative, which would turn the square root into NaN.
    return np.sqrt(np.maximum(squared, 0.0))

# Calculate distances
distances = np.array([mahalanobis_distance(emb, mean_vec, inv_cov_matrix) for emb in all_embeddings])
# Set a threshold: samples beyond the 95th percentile of distances are outliers
threshold_distance = np.percentile(distances, 95)
# Keep row indices rather than looking paths up again with list.index(): that
# lookup is quadratic overall and returns the wrong row when a path repeats.
mahalanobis_outlier_indices = [i for i, d in enumerate(distances) if d > threshold_distance]
mahalanobis_outliers = [all_image_paths[i] for i in mahalanobis_outlier_indices]

# Share of flagged samples that the model misclassified.
outlier_accuracy = 0
for i in mahalanobis_outlier_indices:
    predicted = predictions[i]
    actual = all_labels[i].argmax()
    outlier_accuracy += 1 if predicted != actual else 0
    print(f"Outlier: {all_image_paths[i]} predicted as {predicted}, actual {actual}")

# Nothing exceeds the threshold when all distances are equal; avoid dividing by zero.
if mahalanobis_outliers:
    print(f"Outlier accuracy: {outlier_accuracy / len(mahalanobis_outliers):.2f}")
else:
    print("No Mahalanobis outliers above the threshold.")


Outlier: Contemporary_Realism/eric-fischl_best-western-study.jpg predicted as 2, actual 2
Outlier: Naive_Art_Primitivism/niko-pirosmani_the-temple-festival-in-bolnisi.jpg predicted as 2, actual 2
Outlier: Impressionism/eugene-boudin_beach-at-trouville-1864.jpg predicted as 2, actual 2
Outlier: Impressionism/nikolay-bogdanov-belsky_an-afternoon-fishing.jpg predicted as 2, actual 2
Outlier: Impressionism/maurice-prendergast_italian-flower-market-1898.jpg predicted as 2, actual 2
Outlier: Impressionism/william-merritt-chase_woman-on-a-dock.jpg predicted as 1, actual 2
Outlier: Art_Nouveau_Modern/carl-larsson_the-day-before-christmas-1892(1).jpg predicted as 7, actual 2
Outlier: Impressionism/john-singer-sargent_the-brook-1907.jpg predicted as 2, actual 2
Outlier: Realism/vladimir-makovsky_before-explaining-the-date-1900.jpg predicted as 4, actual 2
Outlier: Impressionism/james-mcneill-whistler_variations-in-violet-and-grey-market-place.jpg predicted as 1, actual 2
Outlier: Impressionism/j