In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data directory used by later cells lives under this path
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Directory read by every scoring cell below. Files are addressed as
# eval_NNNNN_gt.png, eval_NNNNN_gen_00..02.png and eval_NNNNN_prompt.txt.
# NOTE(review): this cell carries execution count 8 while the ResNet scoring cell carries 3,
# so the ResNet scores may have been produced with a different value of all_dir —
# confirm with Restart Kernel -> Run All before comparing metrics.
all_dir = '/content/drive/My Drive/523_pipeline/attention_02'

In [3]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os

# Pre-trained ResNet-50, put into evaluation mode immediately (eval() hands back the module itself)
model = models.resnet50(pretrained=True).eval()

# Preprocessing pipeline applied to every image before it reaches the network:
# resize -> 224 centre crop -> tensor conversion -> per-channel normalisation
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Function to load and transform an image
def load_image(image_path):
    """Load an image file and return it as a preprocessed single-item batch.

    The file is opened with PIL, converted to RGB, passed through the
    module-level ``transform`` (looked up at call time) and given a leading
    batch dimension with ``unsqueeze(0)``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The transformed image with an extra leading dimension of size 1.
    """
    # Image.open is lazy and keeps the file open; the context manager releases the
    # handle deterministically once the RGB copy has been materialised.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')
    return transform(rgb_image).unsqueeze(0)

# Function to extract features using the PyTorch model
@torch.no_grad()
def extract_features(input_image, model):
    """Run ``model`` on ``input_image`` with gradient tracking disabled.

    Args:
        input_image: Input passed straight to ``model``.
        model: Callable (e.g. a ``torch.nn.Module``) applied to the input.

    Returns:
        Whatever ``model(input_image)`` returns.

    NOTE(review): when ``model`` is an unmodified classification network, the
    returned "features" are presumably its final-layer outputs rather than an
    intermediate embedding — confirm this is the intended representation.
    """
    return model(input_image)


# Best-of-three image similarity, one entry per ground truth image
scores = []

# Ground truth files are assumed to run from eval_00001_gt.png to eval_00589_gt.png
for sample_idx in range(1, 590):
    # Model output for the reference image of this sample
    reference_path = os.path.join(all_dir, f'eval_{sample_idx:05d}_gt.png')
    reference_features = extract_features(load_image(reference_path), model)

    # Compare the reference against each of the three generated candidates
    # (eval_NNNNN_gen_00.png .. eval_NNNNN_gen_02.png)
    candidate_similarities = []
    for candidate_idx in range(3):
        candidate_path = os.path.join(all_dir, f'eval_{sample_idx:05d}_gen_{candidate_idx:02d}.png')
        candidate_features = extract_features(load_image(candidate_path), model)

        # Cosine similarity between the two model outputs, as a plain Python float
        candidate_similarities.append(
            torch.nn.functional.cosine_similarity(candidate_features, reference_features, dim=1).item()
        )

    # Only the best candidate counts for this sample
    scores.append(max(candidate_similarities))

# 'scores' now holds the highest similarity for each ground truth image
print(scores)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 183MB/s]


[0.5293762683868408, 0.5822304487228394, 0.6288471817970276, 0.6071566343307495, 0.6335282325744629, 0.5663012862205505, 0.42576465010643005, 0.5698140859603882, 0.7385660409927368, 0.6454997062683105, 0.6529979109764099, 0.5842203497886658, 0.6571513414382935, 0.661871075630188, 0.6529723405838013, 0.47178417444229126, 0.5344505310058594, 0.5730710029602051, 0.6645503640174866, 0.29878896474838257, 0.44873517751693726, 0.5425625443458557, 0.6023232340812683, 0.6199033260345459, 0.6810133457183838, 0.6009024381637573, 0.6483300924301147, 0.5888952612876892, 0.5887430906295776, 0.6082768440246582, 0.6508104801177979, 0.5972039699554443, 0.6812009811401367, 0.489989697933197, 0.635675847530365, 0.3899262249469757, 0.23977850377559662, 0.43604108691215515, 0.4633449614048004, 0.37520748376846313, 0.38277021050453186, 0.5268216729164124, 0.48921987414360046, 0.6031603813171387, 0.6674284934997559, 0.5990372896194458, 0.513085663318634, 0.5688136219978333, 0.6404626369476318, 0.531575739383

In [4]:
# Sanity check: the scoring loop appends one entry per iteration of range(1, 590)
len(scores)

589

In [5]:
# Arithmetic mean of the per-sample best image-to-image similarities
average_score = sum(scores) / len(scores)
average_score  # bare expression so the notebook displays the value

0.5841565166800861

In [9]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os

# CLIP model and its matching processor are loaded from the same pretrained checkpoint.
# NOTE(review): this rebinds `model`, which previously referred to the ResNet network.
clip_checkpoint = 'openai/clip-vit-base-patch32'
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

def calculate_text_image_correlation(image_path, text):
    """Return the cosine similarity between an image file and a text prompt.

    The image is opened, converted to RGB and resized to 256x256, then handed
    together with ``text`` to the module-level ``processor``. The module-level
    ``model`` produces ``text_embeds`` and ``image_embeds``, which are compared
    with cosine similarity.

    Args:
        image_path: Path of the image file to score.
        text: Prompt the image is compared against.

    Returns:
        float: Cosine similarity of the text and image embeddings.
    """
    # Image.open is lazy and keeps the file open; close it as soon as the
    # converted/resized copy exists.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB').resize((256, 256))

    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping, consistent with extract_features
    with torch.no_grad():
        outputs = model(**inputs)

    similarity = torch.nn.functional.cosine_similarity(outputs.text_embeds, outputs.image_embeds, dim=1)
    return similarity.item()

# Best-of-three text-to-image similarity, one entry per prompt
text_scores = []

# Samples are assumed to run from eval_00001 to eval_00589
for sample_idx in range(1, 590):
    # Prompt text for this sample, with surrounding whitespace removed
    with open(os.path.join(all_dir, f'eval_{sample_idx:05d}_prompt.txt'), 'r') as prompt_file:
        prompt_text = prompt_file.read().strip()

    # Score the prompt against each of the three generated candidates
    # (eval_NNNNN_gen_00.png .. eval_NNNNN_gen_02.png) and keep the best one
    best_similarity = max(
        calculate_text_image_correlation(
            os.path.join(all_dir, f'eval_{sample_idx:05d}_gen_{candidate_idx:02d}.png'),
            prompt_text,
        )
        for candidate_idx in range(3)
    )
    text_scores.append(best_similarity)

# 'text_scores' now holds the highest text-image similarity for each sample
print(text_scores)


[0.318999320268631, 0.3163068890571594, 0.27418142557144165, 0.2944302260875702, 0.29639750719070435, 0.284509539604187, 0.3113288879394531, 0.30708742141723633, 0.2631371021270752, 0.2939034700393677, 0.31968575716018677, 0.30263304710388184, 0.3071320056915283, 0.3258014917373657, 0.27875539660453796, 0.3156145215034485, 0.3094511330127716, 0.2953280210494995, 0.30497246980667114, 0.321668803691864, 0.2864883542060852, 0.32109975814819336, 0.28114479780197144, 0.33511078357696533, 0.3221741020679474, 0.29198887944221497, 0.26606908440589905, 0.33354419469833374, 0.34627676010131836, 0.3070055842399597, 0.34943532943725586, 0.30832210183143616, 0.27249836921691895, 0.3234480321407318, 0.28913354873657227, 0.31871506571769714, 0.3163670599460602, 0.2890699803829193, 0.3175298869609833, 0.30296003818511963, 0.2873104214668274, 0.29275619983673096, 0.27159929275512695, 0.2648197412490845, 0.35987767577171326, 0.3159048855304718, 0.2667316198348999, 0.32149630784988403, 0.2715362906455993

In [10]:
# Sanity check: the scoring loop appends one entry per iteration of range(1, 590)
len(text_scores)

589

In [11]:
# Arithmetic mean of the per-sample best text-to-image similarities
average_text_score = sum(text_scores) / len(text_scores)
average_text_score  # bare expression so the notebook displays the value

0.30531334279955746

In [12]:
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import torch
import os

# Pre-trained InceptionV3, put into evaluation mode immediately (eval() hands back the module itself).
# NOTE(review): this rebinds the notebook-wide names `model` and `transform`.
model = models.inception_v3(pretrained=True).eval()

# Preprocessing pipeline for this model
transform = transforms.Compose([
    # Resize the image to fit the input size of the model
    transforms.Resize(299),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


# Paths of the generated images: .png files in all_dir whose name contains 'gen', in sorted order
image_files = sorted(
    os.path.join(all_dir, file_name)
    for file_name in os.listdir(all_dir)
    if file_name.endswith('.png') and 'gen' in file_name
)

# Confidence of the top predicted class, one entry per generated image
image_quality_score = []

# Images are processed one at a time
for generated_path in image_files:
    # Preprocess and add a batch dimension
    batch = transform(Image.open(generated_path).convert('RGB')).unsqueeze(0)

    with torch.no_grad():
        model_output = model(batch)
        # Index of the highest-scoring class, then its softmax probability
        top_class = model_output.max(1).indices
        class_probabilities = torch.nn.functional.softmax(model_output, dim=1)
        top_confidence = class_probabilities[0, top_class].item()

    image_quality_score.append(top_confidence)

# Show the collected confidence scores
print(image_quality_score)


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 184MB/s] 


[0.34037819504737854, 0.37067732214927673, 0.13756884634494781, 0.10853976756334305, 0.11451957374811172, 0.25916722416877747, 0.20445920526981354, 0.07496032863855362, 0.6566592454910278, 0.1324600875377655, 0.13546015322208405, 0.06649991869926453, 0.06154759228229523, 0.05911124870181084, 0.19850434362888336, 0.26296907663345337, 0.03827687352895737, 0.1010875478386879, 0.19278207421302795, 0.19278207421302795, 0.12973079085350037, 0.05499930679798126, 0.07148384302854538, 0.09084269404411316, 0.11178021878004074, 0.06405821442604065, 0.11072289198637009, 0.3879401683807373, 0.28289711475372314, 0.0962325930595398, 0.17816610634326935, 0.08025095611810684, 0.05131152272224426, 0.07001133263111115, 0.21906520426273346, 0.2721713185310364, 0.08220625668764114, 0.17689794301986694, 0.10701119899749756, 0.0435250885784626, 0.3987380564212799, 0.17930184304714203, 0.25766244530677795, 0.4274435043334961, 0.1614677906036377, 0.23291151225566864, 0.5805713534355164, 0.2876453399658203, 0.4

In [13]:
# Arithmetic mean of the top-class confidences over all generated images
average_quality_score = sum(image_quality_score) / len(image_quality_score)
average_quality_score  # bare expression so the notebook displays the value

0.22728709081391327