In [None]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.manifold import TSNE
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import CLIPModel, CLIPTokenizer, CLIPProcessor
from sentence_transformers import SentenceTransformer

# Base path that is prepended to every model identifier loaded in this notebook.
# `$DSDIR` is taken from the environment; os.path.expandvars leaves the reference
# unchanged when the variable is not set, in which case the resulting path still
# contains the literal text "$DSDIR".
path_huggingface = os.path.expandvars('$DSDIR/HuggingFace_Models/') 


In [None]:
def load_clip_model():
    """Load the pre-trained CLIP model together with its tokenizer and processor.

    All three objects are read from the same checkpoint location,
    ``path_huggingface + 'openai/clip-vit-large-patch14'``.

    Returns:
        tuple: ``(model, tokenizer, processor)`` where ``model`` is a ``CLIPModel``
        moved to the GPU when CUDA is available (left on the CPU otherwise),
        ``tokenizer`` is a ``CLIPTokenizer`` and ``processor`` a ``CLIPProcessor``.
    """
    # Single definition of the checkpoint location so the three loaders below
    # cannot drift apart.
    checkpoint = path_huggingface + 'openai/clip-vit-large-patch14'

    # Load the pre-trained CLIP model
    model = CLIPModel.from_pretrained(checkpoint)
    # Only move to the GPU when one exists instead of crashing on CPU-only machines.
    if torch.cuda.is_available():
        model = model.cuda()

    # Load the corresponding tokenizer and processor
    tokenizer = CLIPTokenizer.from_pretrained(checkpoint)
    processor = CLIPProcessor.from_pretrained(checkpoint)

    return model, tokenizer, processor

def load_bert_model():
    """Instantiate the sentence-embedding model used alongside CLIP.

    Returns:
        SentenceTransformer: model built from the
        ``sentence-transformers/paraphrase-distilroberta-base-v1`` checkpoint
        located under ``path_huggingface``.
    """
    checkpoint = path_huggingface + 'sentence-transformers/paraphrase-distilroberta-base-v1'
    return SentenceTransformer(checkpoint)

def compute_text_embeddings(premises, batch_size=512):
    """Embed sentences with both the sentence-transformer and the CLIP text encoder.

    Both models are loaded inside this function on every call.

    Args:
        premises: list of strings to embed. It is sliced into batches that are
            handed unchanged to both encoders.
        batch_size: number of sentences encoded per forward pass (default 512).

    Returns:
        tuple: ``(bert_text_embeds_prompts, clip_text_embeds_prompts)``. Each is a
        CPU tensor with one L2-normalised row per premise, in input order, and
        neither is attached to an autograd graph.
    """
    num_premises = len(premises)

    # The processor returned by the loader is not needed for text-only input.
    clip_model, tokenizer, _ = load_clip_model()
    bert_model = load_bert_model()
    # Send the tokens to wherever the CLIP weights live rather than assuming CUDA.
    device = next(clip_model.parameters()).device

    # Split the premises into batches once; both encoders iterate over the same list.
    premises_batches = [premises[i:i+batch_size] for i in range(0, num_premises, batch_size)]

    # Sentence-transformer embeddings, one chunk per batch
    bert_chunks = []
    for premises_batch in tqdm(premises_batches):
        with torch.no_grad():
            batch_embeds = bert_model.encode(premises_batch)
        batch_embeds = torch.from_numpy(batch_embeds)
        bert_chunks.append(F.normalize(batch_embeds, dim=1))
    bert_text_embeds_prompts = torch.cat(bert_chunks, dim=0)

    # CLIP text embeddings, one chunk per batch. Collecting chunks (instead of
    # writing into a pre-allocated buffer) avoids hard-coding the embedding width.
    clip_chunks = []
    for premises_batch in tqdm(premises_batches):
        tok = tokenizer(premises_batch, return_tensors="pt", padding=True, truncation=True)
        tok = {key: value.to(device) for key, value in tok.items()}
        # The projection and the normalisation stay inside no_grad as well:
        # otherwise the projection weights attach an autograd graph to the result,
        # which wastes memory and prevents NumPy / scikit-learn from consuming
        # the returned tensor without an explicit detach().
        with torch.no_grad():
            text_outputs = clip_model.text_model(**tok)
            # Second element of the text-model output, projected into the joint space.
            text_embeds = clip_model.text_projection(text_outputs[1])
            text_embeds = F.normalize(text_embeds, dim=1)
        clip_chunks.append(text_embeds.cpu())
    clip_text_embeds_prompts = torch.cat(clip_chunks, dim=0)

    return bert_text_embeds_prompts, clip_text_embeds_prompts

# Data

In [None]:
# Load the sentence pairs and their similarity scores. `header=0` discards the
# first row of the file in favour of the explicit column names given here.
df = pd.read_csv(
    'similar_from_MS-COCO_top0_do_steerFalse_steer.csv',
    header=0,
    names=['sample_1', 'sample_2', 'CLIP_similarity', 'BERT_similarity', 'diff'],
)
for col in ['sample_1', 'sample_2']:
    # regex=False: both patterns are literal text. Where pandas defaults to
    # regex=True, ' .' would match a space followed by ANY character and
    # corrupt the sentences; pinning the flag gives the same result on every version.
    df[col] = (
        df[col]
        .str.replace('  ', ' ', regex=False)
        .str.replace(' .', '.', regex=False)
        .str.strip()
    )

# Drop pairs whose two sentences are identical after the clean-up above.
df = df[df['sample_1'] != df['sample_2']]
df

In [None]:
# One row of three histograms: the CLIP similarity, the BERT similarity and the
# 'diff' column. Each entry is (column, bar colour, x-axis label, panel title).
hist_specs = [
    ('CLIP_similarity', 'blue', 'CLIP Similarity', 'Histogram of CLIP Similarity'),
    ('BERT_similarity', 'blue', 'BERT Similarity', 'Histogram of BERT Similarity'),
    ('diff', 'green', 'Difference', 'Histogram of Difference'),
]

plt.figure(figsize=(10, 5))

for hist_panel, (hist_column, hist_color, hist_xlabel, hist_title) in enumerate(hist_specs, start=1):
    plt.subplot(1, 3, hist_panel)
    plt.hist(df[hist_column], bins=20, color=hist_color)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Frequency')
    plt.title(hist_title)

plt.tight_layout()
plt.show()


In [None]:
# Rows with a CLIP similarity above 0.9 whose 'diff' value exceeds 0.2 in magnitude.
mask_bad = df['CLIP_similarity'].gt(0.9) & df['diff'].abs().gt(0.2)
df.loc[mask_bad]

In [None]:
# Rows with a CLIP similarity above 0.9 whose 'diff' value stays below 0.05 in magnitude.
mask_good = df['CLIP_similarity'].gt(0.9) & df['diff'].abs().lt(0.05)
df.loc[mask_good]

In [None]:
# Negative class (label 0): every sentence of the pairs selected by `mask_bad`
# (text not correctly embedded by CLIP). All `sample_1` entries come first,
# followed by all `sample_2` entries.
bad_pairs = df.loc[mask_bad, ['sample_1', 'sample_2']]
text_bad = bad_pairs['sample_1'].to_list() + bad_pairs['sample_2'].to_list()
y_bad = np.zeros(len(text_bad))

# Positive class (label 1): same construction for the pairs selected by `mask_good`.
good_pairs = df.loc[mask_good, ['sample_1', 'sample_2']]
text_good = good_pairs['sample_1'].to_list() + good_pairs['sample_2'].to_list()
y_good = np.ones(len(text_good))

# Hold out a fixed fraction of the sentences; the split is seeded for repeatability.
test_set_fraction = 0.1
all_texts = text_bad + text_good
all_labels = np.concatenate((y_bad, y_good))
text_train, text_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=test_set_fraction,
    random_state=123,
)

# Embed both splits with the two text encoders.
X_train_bert, X_train_clip = compute_text_embeddings(text_train)
X_test_bert, X_test_clip = compute_text_embeddings(text_test)

In [None]:
# Perform t-SNE dimensionality reduction on the CLIP embeddings of the test set.
# The tensor is detached and converted to NumPy explicitly (as the other cells
# that consume `X_test_clip` do): scikit-learn converts its input through NumPy,
# which raises for a tensor that is still attached to an autograd graph.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_test_clip.detach().cpu().numpy())

# Plot the t-SNE visualization, coloured by label
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_test)
plt.colorbar()
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.title('t-SNE Visualization of Features')
plt.show()


In [None]:
# 2-D t-SNE projection of the sentence-transformer embeddings of the test set.
tsne = TSNE(random_state=42, n_components=2)
X_tsne = tsne.fit_transform(X_test_bert)

# Scatter the two t-SNE coordinates, coloured by the test labels.
tsne_dim1, tsne_dim2 = X_tsne[:, 0], X_tsne[:, 1]
plt.scatter(tsne_dim1, tsne_dim2, c=y_test)
plt.colorbar()
plt.title('t-SNE Visualization of Features')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()


# Predictor

In [None]:
# Linear regression of the 0/1 label on the CLIP embedding.
# scikit-learn converts its input through NumPy, which raises for a tensor still
# attached to an autograd graph, so the embeddings are detached and converted
# explicitly (the same conversion the later cells apply to `X_test_clip`).
model = LinearRegression()
model.fit(X_train_clip.detach().cpu().numpy(), y_train)

In [None]:
class BinaryClassifier(nn.Module):
    """Two-layer perceptron producing one sigmoid output per input row.

    Args:
        input_dim: size of the input feature vector (default 768).
        hidden_dim: width of the hidden layer (default 64).
    """

    def __init__(self, input_dim=768, hidden_dim=64):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map inputs of shape ``(..., input_dim)`` to values in [0, 1] of shape ``(..., 1)``."""
        hidden = torch.relu(self.layer1(x))
        return self.sigmoid(self.layer2(hidden))
# Seed torch so that weight initialisation and batch shuffling are repeatable.
torch.manual_seed(123)

# The network is bound to its own name: rebinding `model` here would overwrite the
# scikit-learn regressor that the evaluation cell calls `.predict` on.
classifier = BinaryClassifier()

train_data = TensorDataset(torch.FloatTensor(X_train_clip.detach().numpy()), torch.LongTensor(y_train))
test_data = TensorDataset(torch.FloatTensor(X_test_clip.detach().numpy()), torch.LongTensor(y_test))
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

criterion = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

for epoch in range(100):  # Number of epochs
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Reset gradients
        # Squeeze only the output axis: a bare squeeze() would also remove the batch
        # axis when the last batch holds a single sample, and BCELoss then rejects
        # the mismatch between the scalar output and the length-1 target.
        outputs = classifier(X_batch).squeeze(1)  # Forward pass
        loss = criterion(outputs, y_batch.float())  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

    if (epoch+1) % 10 == 0:  # Print the last batch loss every 10 epochs
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')



In [None]:
# Score `model` on the held-out CLIP embeddings and report the mean squared error.
X_eval = X_test_clip.detach().numpy()
y_pred = model.predict(X_eval)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)

# Predicted value (x-axis) against the true label (y-axis).
plt.figure()
plt.scatter(y_pred, y_test, alpha=0.5)
plt.xlabel('pred')
plt.ylabel('true')

In [None]:
# Display the raw predictions computed in the previous cell.
y_pred