In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the
# evaluation directory used by the rest of the notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Directory holding every evaluation file read by this notebook:
# eval_XXXXX_gt.png, eval_XXXXX_gen_YY.png and eval_XXXXX_prompt.txt.
# Paths are always built from it with os.path.join.
all_dir = '/content/drive/My Drive/523_pipeline/concatenation/eval_images_concate_01/'

In [3]:
# Echo the configured directory as a quick visual check.
all_dir

'/content/drive/My Drive/523_pipeline/concatenation/eval_images_concate_01/'

In [4]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os

# Pre-trained ResNet-50, switched to evaluation mode right away
# (nn.Module.eval() returns the module itself, so the call can be chained).
model = models.resnet50(pretrained=True).eval()

# Preprocessing applied to every image before it is fed to the model:
# resize to 256, centre-crop to 224, convert to a tensor, then normalise
# each channel with the mean / std values listed below.
resnet_preprocessing = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(resnet_preprocessing)

# Function to load and transform an image
def load_image(image_path):
    """Load an image file and turn it into a single-item model batch.

    The file is opened inside a ``with`` block so its handle is released
    deterministically once the pixel data has been converted to RGB. The RGB
    copy is then passed through the notebook-level ``transform`` pipeline and
    given a leading batch dimension.

    Note: ``transform`` is looked up each time the function is called, so the
    pipeline applied is whichever one is bound to that name at call time; a
    later cell in this notebook rebinds ``transform``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The transformed image with an extra dimension inserted at position 0.
    """
    # convert() returns an independent, fully loaded image, so it remains
    # usable after the source file has been closed.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')
    return transform(rgb_image).unsqueeze(0)

# Function to extract features using the PyTorch model
def extract_features(input_image, model):
    """Run ``model`` on ``input_image`` with gradient tracking disabled.

    Args:
        input_image: Input passed unchanged to ``model``.
        model: Callable invoked as ``model(input_image)``.

    Returns:
        Whatever ``model`` returns for the given input.

    NOTE(review): the function returns the model's final output as-is. With an
    unmodified classification network that is presumably the classifier
    output rather than a pooled embedding -- confirm this is what the
    similarity metric is meant to compare.
    """
    with torch.no_grad():
        return model(input_image)


# Best image-to-image similarity per evaluation case: every ground-truth image
# is compared with its three generated candidates and only the highest cosine
# similarity between the two model outputs is kept.
scores = []

# range(1, 303) covers eval_00001 ... eval_00302, i.e. 302 cases.
for i in range(1, 303):
    # Model output for the ground-truth image of case i.
    gt_features = extract_features(
        load_image(os.path.join(all_dir, f'eval_{i:05d}_gt.png')),
        model,
    )

    # One similarity per candidate: eval_XXXXX_gen_00.png ... eval_XXXXX_gen_02.png.
    gen_scores = []
    for j in range(3):
        candidate_path = os.path.join(all_dir, f'eval_{i:05d}_gen_{j:02d}.png')
        gen_features = extract_features(load_image(candidate_path), model)
        gen_scores.append(
            torch.nn.functional.cosine_similarity(gen_features, gt_features, dim=1).item()
        )

    # Keep only the best-matching candidate for this case.
    scores.append(max(gen_scores))

# 'scores' now holds one best-candidate similarity per ground-truth image.
print(scores)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 158MB/s]


[0.858167827129364, 0.8149492740631104, 0.7390893697738647, 0.7182627320289612, 0.8252477049827576, 0.7890536785125732, 0.7752482295036316, 0.7729378938674927, 0.6568295955657959, 0.7767173051834106, 0.8481190800666809, 0.806626558303833, 0.7483872771263123, 0.8048943281173706, 0.7204264402389526, 0.761933445930481, 0.7164853811264038, 0.8319079875946045, 0.8054932355880737, 0.8361576795578003, 0.591419517993927, 0.8224729299545288, 0.733851969242096, 0.6860159635543823, 0.7006815075874329, 0.7431530952453613, 0.6750985980033875, 0.6927540898323059, 0.7615638971328735, 0.7871006727218628, 0.6825798749923706, 0.7370364665985107, 0.7379375696182251, 0.8496289253234863, 0.7683517336845398, 0.7780988216400146, 0.7521151900291443, 0.7975324392318726, 0.7770662903785706, 0.8263983726501465, 0.6864908933639526, 0.7137171030044556, 0.8159371018409729, 0.814200758934021, 0.8479858636856079, 0.8721381425857544, 0.7139254212379456, 0.8120042085647583, 0.7548492550849915, 0.7713470458984375, 0.818

In [5]:
# Number of evaluation cases scored (one entry per ground-truth image).
len(scores)

302

In [6]:
# Mean of the per-case best image-to-image similarities.
average_score = sum(scores) / len(scores)
average_score

0.7689180816246184

In [7]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os

# Load the CLIP model and its matching processor from the same checkpoint.
# NOTE: this rebinds the notebook-level name `model`, which an earlier cell
# bound to the ResNet-50 network.
clip_checkpoint = 'openai/clip-vit-base-patch32'
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

def calculate_text_image_correlation(image_path, text):
    """Return the cosine similarity between an image file and a text.

    The image is opened (and closed again via a ``with`` block), converted to
    RGB and resized to a fixed 256x256 before being handed, together with the
    text, to the notebook-level ``processor``. The notebook-level ``model`` is
    then run without gradient tracking, and the cosine similarity between its
    ``text_embeds`` and ``image_embeds`` outputs is returned.

    Args:
        image_path: Path of the image file to score.
        text: Text to compare the image against; it is passed as a
            single-element list with padding and truncation enabled.

    Returns:
        The similarity as a Python float.
    """
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB').resize((256, 256))
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping, as the other scoring cells
    # in this notebook already do for their forward passes.
    with torch.no_grad():
        outputs = model(**inputs)
    similarity = torch.nn.functional.cosine_similarity(outputs.text_embeds, outputs.image_embeds, dim=1)
    return similarity.item()

# Best text-to-image similarity per evaluation case: each prompt is compared
# with its three generated candidates and only the highest score is kept.
text_scores = []

# range(1, 303) covers eval_00001 ... eval_00302, i.e. 302 cases.
for i in range(1, 303):
    # Prompt text for case i, with surrounding whitespace removed.
    with open(os.path.join(all_dir, f'eval_{i:05d}_prompt.txt'), 'r') as prompt_file:
        prompt_text = prompt_file.read().strip()

    # One score per candidate: eval_XXXXX_gen_00.png ... eval_XXXXX_gen_02.png.
    gen_scores = [
        calculate_text_image_correlation(
            os.path.join(all_dir, f'eval_{i:05d}_gen_{j:02d}.png'),
            prompt_text,
        )
        for j in range(3)
    ]

    # Keep only the best-matching candidate for this prompt.
    text_scores.append(max(gen_scores))

# 'text_scores' now holds one best-candidate similarity per prompt.
print(text_scores)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

[0.27111977338790894, 0.3212692141532898, 0.3091084957122803, 0.31091952323913574, 0.34275540709495544, 0.30040448904037476, 0.32107013463974, 0.2982428967952728, 0.331563264131546, 0.3347680866718292, 0.32577750086784363, 0.3105977475643158, 0.29205185174942017, 0.3574772775173187, 0.3186721205711365, 0.30358681082725525, 0.3402097821235657, 0.3149645924568176, 0.33580246567726135, 0.2964271008968353, 0.30139151215553284, 0.31598877906799316, 0.2669457793235779, 0.3045817017555237, 0.29790613055229187, 0.26532334089279175, 0.3822104036808014, 0.30291610956192017, 0.3114980161190033, 0.2934824526309967, 0.34544044733047485, 0.34330323338508606, 0.3361353576183319, 0.29105374217033386, 0.27871063351631165, 0.3601123094558716, 0.3144994080066681, 0.2825436592102051, 0.2898554801940918, 0.34206101298332214, 0.3628416359424591, 0.330513596534729, 0.36557042598724365, 0.32719460129737854, 0.3330228328704834, 0.34396257996559143, 0.3148803114891052, 0.3335307836532593, 0.3281295895576477, 0.

In [8]:
# Number of prompts scored (one entry per evaluation case).
len(text_scores)

302

In [9]:
# Mean of the per-case best text-to-image similarities.
average_text_score = sum(text_scores) / len(text_scores)
average_text_score

0.32603830276735574

In [10]:
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import torch
import os

# Pre-trained InceptionV3, switched to evaluation mode right away
# (nn.Module.eval() returns the module itself, so the call can be chained).
# NOTE: this rebinds the notebook-level names `model` and `transform` that
# earlier cells also assign.
model = models.inception_v3(pretrained=True).eval()

# Preprocessing for this model: resize to 299, centre-crop to 299, convert to
# a tensor, then normalise each channel with the mean / std values below.
inception_preprocessing = [
    transforms.Resize(299),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(inception_preprocessing)


# Collect the generated images: every directory entry whose name ends in
# '.png' and contains the substring 'gen', as full paths in sorted order.
generated_names = [
    name
    for name in os.listdir(all_dir)
    if name.endswith('.png') and 'gen' in name
]
image_files = sorted(os.path.join(all_dir, name) for name in generated_names)

# Softmax probability of the top-scoring class for each generated image.
image_quality_score = []

for image_path in image_files:
    # Load as RGB, preprocess, and add a leading batch dimension.
    batch = transform(Image.open(image_path).convert('RGB')).unsqueeze(0)

    # Forward pass without gradient tracking; pick the highest-scoring class
    # and read its probability after softmax over the class dimension.
    with torch.no_grad():
        raw_scores = model(batch)
        top_class = raw_scores.max(1).indices
        probabilities = torch.nn.functional.softmax(raw_scores, dim=1)
        confidence = probabilities[0, top_class].item()

    image_quality_score.append(confidence)

# One confidence value per file in image_files, in the same order.
print(image_quality_score)


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 177MB/s]


[0.10088681429624557, 0.30061742663383484, 0.1858360320329666, 0.09284551441669464, 0.44480568170547485, 0.1269252598285675, 0.2278282344341278, 0.21958640217781067, 0.29700714349746704, 0.6919198036193848, 0.08306062966585159, 0.27213913202285767, 0.3653300106525421, 0.31521928310394287, 0.3526995778083801, 0.33296066522598267, 0.8522560596466064, 0.8571513891220093, 0.9312523007392883, 0.539907693862915, 0.22535763680934906, 0.12189839780330658, 0.3355740010738373, 0.11331640928983688, 0.04197594150900841, 0.06226985156536102, 0.08707211166620255, 0.08901629596948624, 0.4672607481479645, 0.5523920655250549, 0.03541269525885582, 0.1979035884141922, 0.05350425839424133, 0.6283909678459167, 0.7889115214347839, 0.1362236738204956, 0.3092929720878601, 0.5129505395889282, 0.2761895954608917, 0.44408512115478516, 0.1980748027563095, 0.48560354113578796, 0.20719976723194122, 0.13475124537944794, 0.08993755280971527, 0.15146978199481964, 0.5136756300926208, 0.22925803065299988, 0.354845285415

In [11]:
# Mean top-class confidence over all generated images.
average_quality_score = sum(image_quality_score) / len(image_quality_score)
average_quality_score

0.3000596580229223