# In [None]:  (Jupyter cell prompt left over from the notebook export)
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the tokenizer and the TensorFlow model from the same pretrained
# checkpoint so that token ids and model vocabulary match.
PRETRAINED_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_NAME)
model = TFRobertaModel.from_pretrained(PRETRAINED_NAME)

# Example DataFrame: one record per sentence pair, given as
# (hyp text, tgt text, integer label).
_example_rows = [
    ("Sentence 1 part A", "Sentence 1 part B", 0),
    ("Sentence 2 part A", "Sentence 2 part B", 1),
    ("Sentence 3 part A", "Sentence 3 part B", 0),
]
df = pd.DataFrame(_example_rows, columns=['hyp', 'tgt', 'labels'])

# Tokenize the data
def tokenize_function(examples):
    """Encode one hypothesis/target pair with the module-level tokenizer.

    ``examples`` is indexed with the keys ``'hyp'`` and ``'tgt'``; the two
    texts are passed to the tokenizer as a sentence pair. The call requests
    truncation and padding to a fixed ``max_length`` of 256.

    Returns whatever the tokenizer call returns.
    """
    first_text = examples['hyp']
    second_text = examples['tgt']
    return tokenizer(
        first_text,
        text_pair=second_text,
        truncation=True,
        padding='max_length',
        max_length=256,
    )

# axis=1 makes pandas call tokenize_function once per DataFrame row, so each
# sentence pair is encoded separately.
# NOTE(review): the result is presumably a Series holding one tokenizer
# output per row (the embedding step calls .apply on it) — confirm.
tokenized_inputs = df.apply(tokenize_function, axis=1)

# Extract embeddings
def extract_embeddings(inputs):
    """Run one encoded example through the model and return a NumPy array.

    ``inputs`` is indexed with ``'input_ids'`` and ``'attention_mask'``.
    Each is wrapped in a list before conversion to a tensor, which adds a
    leading axis of size one, so the model sees a single-example batch.

    The returned array is position 0 along the second axis of
    ``last_hidden_state`` -- the first token of the sequence (RoBERTa's
    ``<s>``, used like BERT's [CLS]).
    """
    batch_ids = tf.constant([inputs['input_ids']])
    batch_mask = tf.constant([inputs['attention_mask']])
    hidden_states = model(batch_ids, attention_mask=batch_mask).last_hidden_state
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.numpy()

# Embed every encoded row, then stack the per-row arrays vertically into a
# single 2-D array (one row per example).
embeddings = np.vstack(
    [extract_embeddings(encoded) for encoded in tokenized_inputs]
)

# Dimensionality reduction with t-SNE
# scikit-learn requires perplexity < n_samples. The default of 30 makes
# fit_transform raise ValueError on small inputs such as the 3-row example
# DataFrame, so cap the perplexity by the number of embedded rows (and keep
# it at least 1 so the value stays positive).
n_samples = len(embeddings)
perplexity = float(min(30, max(n_samples - 1, 1)))
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Visualization: scatter the 2-D points, red for label 0 and blue for any
# other label.
colors = ['blue' if label != 0 else 'red' for label in df['labels']]
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors)
ax.set_title('2D visualization of RoBERTa embeddings')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
plt.show()
