In [18]:
import clip
import librosa
import torch
import wav2clip
from PIL import Image
from torchvision import transforms

# Pick the compute device once for the whole notebook: the GPU when CUDA is
# usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load image

In [2]:
# Forward slashes resolve on every OS. The previous "images\p..." literal
# relied on "\p", which is not a valid escape sequence (Python warns about it)
# and only worked as a path separator on Windows.
image_path = "images/pexels-souvenirpixels-414612.jpg"
# Open inside a context manager so the underlying file handle is released as
# soon as convert() has produced an independent RGB copy.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
# Force a fixed 512x512 size (aspect ratio is not preserved).
image = image.resize((512, 512))
# Preprocess: convert to tensor normalized in [-1, 1]
image_tensor = (
    transforms.ToTensor()(image).unsqueeze(0).to(device)
)  # shape [1,3,H,W]
image_tensor = 2.0 * image_tensor - 1.0  # scale from [0,1] to [-1,1]

  image_path = "images\pexels-souvenirpixels-414612.jpg"


# Get CLIP embeddings

In [7]:
def get_preprocessing_for_clip(target_size: int = 224):
    """Build the torchvision transform that prepares an image tensor for CLIP.

    The pipeline resizes the shorter edge to ``target_size`` with bilinear
    interpolation (aspect ratio preserved), center-crops to a
    ``target_size`` x ``target_size`` square and finally normalizes each
    channel with the mean/std constants defined below. There is no
    ``ToTensor`` step, so callers pass an image tensor rather than a PIL image.

    Args:
        target_size: Side length, in pixels, of the square output. Defaults to
            224, the value that was previously hard-coded.

    Returns:
        A ``transforms.Compose`` applying resize, center-crop and normalize,
        in that order.
    """
    # Per-channel statistics used for the final normalization step.
    channel_mean = [0.48145466, 0.4578275, 0.40821073]
    channel_std = [0.26862954, 0.26130258, 0.27577711]

    return transforms.Compose([
        # 1) Resize shorter edge to target_size, keep aspect ratio.
        # NOTE(review): the official CLIP preprocess resizes with bicubic
        # interpolation; bilinear is kept here to preserve existing behavior.
        transforms.Resize(
            target_size, interpolation=transforms.InterpolationMode.BILINEAR
        ),

        # 2) Center-crop to exactly (target_size, target_size).
        transforms.CenterCrop(target_size),

        # 3) Normalize per channel.
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ])

In [4]:
# Load CLIP model
# Loads the "ViT-B/32" checkpoint onto the shared `device`. clip.load returns
# a pair; only the first item (the model) is kept and the second is discarded
# by binding it to `_`.
clip_model, _ = clip.load("ViT-B/32", device=device)

In [9]:
clip_preprocessing = get_preprocessing_for_clip()
# image_tensor was rescaled to [-1, 1] when it was loaded, but the mean/std
# used by the CLIP Normalize step are statistics of [0, 1] pixel values.
# Map the tensor back to [0, 1] first so the normalization is applied on the
# scale it was designed for.
preprocessed_image_tensor = clip_preprocessing((image_tensor + 1.0) / 2.0)
preprocessed_image_tensor.shape

torch.Size([1, 3, 224, 224])

In [11]:
# Run the preprocessed batch through CLIP's image encoder; the recorded
# output below shows one 512-dimensional embedding per image.
# NOTE(review): no torch.no_grad() is used in this cell - confirm whether
# gradients through the encoder are actually needed here.
image_embedding = clip_model.encode_image(preprocessed_image_tensor)
image_embedding.shape

torch.Size([1, 512])

In [None]:


# from losses.audio_image_loss import AudioImageLoss

# loss = AudioImageLoss(audio_path=r"audio\1-100032-A-0.wav", device="cuda")
# print(loss.audio_embedding.shape)

# Reload CLIP with jit=False and inspect the patch-embedding convolution of the
# visual tower. Use the shared `device` instead of a hard-coded 'cuda' so the
# cell also runs on machines without a GPU and stays consistent with the other
# cells of this notebook.
clip_model, _ = clip.load("ViT-B/32", device=device, jit=False)
print(clip_model.visual.conv1.weight.shape)

torch.Size([768, 3, 32, 32])


# Wav2CLIP

In [19]:
# librosa is imported in the notebook's import cell together with the other
# dependencies, so this cell only contains the audio-embedding steps.
audio_path = "audio/1-100032-A-0.wav"

# Wav2CLIP audio encoder created with the library's default arguments.
wav2clip_model = wav2clip.get_model()

# Decode the clip as a mono waveform resampled to 48 kHz.
# NOTE(review): confirm that 48 kHz is the sample rate Wav2CLIP expects.
wav, sr = librosa.load(audio_path, sr=48000, mono=True)

# The value passed to embed_audio is the waveform returned by librosa, not a
# torch tensor; the `audio_tensor` name is kept in case later cells rely on it.
audio_tensor = wav
audio_emb_batch = wav2clip.embed_audio(audio_tensor, wav2clip_model)
audio_emb_batch.shape

(1, 512)

In [20]:
# Display the audio embedding computed in the previous cell (the recorded
# output below is the full array of float values).
audio_emb_batch

array([[ 1.4408289 , -0.01741131, -0.58991355, -1.289473  , -1.8538855 ,
         0.34507215,  0.42664137,  0.67956156,  0.29198477, -0.14426492,
        -0.744748  ,  0.9637542 , -0.01516724,  0.38428319, -0.2123092 ,
        -0.24426425,  0.45778924, -0.06724355,  0.15096365,  0.6511883 ,
         0.3772672 , -0.22962151,  0.86018777, -0.54331505, -0.49831495,
         0.8214668 , -0.17716826,  0.00570333,  0.5888524 ,  0.143208  ,
         0.8572799 ,  1.3991243 , -0.6894167 , -0.6604547 , -0.11505379,
         0.5740982 ,  0.4883307 ,  0.11116552,  0.7536177 , -0.70415246,
         0.592709  ,  0.00321844,  1.0652252 ,  0.27322462, -0.0205162 ,
         1.0768218 , -0.9000274 ,  0.3167838 , -0.23760112,  0.05755676,
         0.10318513, -0.2090534 , -0.29189757, -0.30470818, -0.24034634,
         1.4006778 ,  0.2446542 ,  0.00371204, -0.04598327, -0.4348867 ,
        -0.1575933 , -0.93822503,  0.1671465 ,  0.82479477,  0.22670904,
        -0.5186713 ,  0.00606272, -0.33216614, -0.5