In [1]:
import pandas as pd
import numpy as np
from utils import SiameseNetwork, train, set_seed, train_test_splitting, generate_embedding_pairs
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide seed; also passed to generate_embedding_pairs in a later cell.
seed = 496
# set_seed is a project helper from utils; presumably it seeds the RNGs used downstream — confirm in utils.
set_seed(seed)
# Load the article table (path is relative to the notebook's working directory).
data = pd.read_parquet("../Datasets/embeddings.parquet")

In [3]:
# Show the column names of the loaded table.
data.columns

Index(['Author', 'Title', 'Section', 'Date', 'Cleaned_Content', 'Text_Length',
       'Embedding'],
      dtype='object')

In [4]:
# Filtering thresholds, named so they can be tuned in one place.
MIN_ARTICLES_PER_AUTHOR = 5   # inclusive lower bound on articles per author
MIN_TEXT_LENGTH = 4000        # exclusive lower bound on 'Text_Length'

# Keep only authors who have at least MIN_ARTICLES_PER_AUTHOR articles in the full table.
author_counts = data['Author'].value_counts()
authors_with_enough_articles = author_counts[author_counts >= MIN_ARTICLES_PER_AUTHOR].index
data_author = data[data['Author'].isin(authors_with_enough_articles)]

# Then drop the short articles.
# NOTE(review): the per-author count above is taken before this length filter, so an
# author can end up with fewer than MIN_ARTICLES_PER_AUTHOR rows in data_length —
# confirm this is intended.
data_length = data_author[data_author['Text_Length'] > MIN_TEXT_LENGTH]

In [5]:
# Split the filtered articles into train and test frames; the split logic lives in
# utils.train_test_splitting and is not visible in this notebook.
train_df, test_df = train_test_splitting(data_length)
print(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")

Train set size: 1268, Test set size: 223


In [6]:
# Build the training pairs from the training split only. Later cells treat each item as
# a 3-element sample (emb1, emb2, label) with label 0 or 1; the pairing strategy itself
# lives in utils.generate_embedding_pairs.
pairings = generate_embedding_pairs(train_df, seed)

In [7]:
# Tally how many pairs carry label 0 and how many carry label 1
# (the label is the third element of each pair).
count_0 = sum(1 for pair in pairings if pair[2] == 0)
count_1 = sum(1 for pair in pairings if pair[2] == 1)

print(count_0)
print(count_1)

12562
12562


In [8]:
class AuthorshipVerificationDataset(Dataset):
    """Map-style dataset over pre-built ``(emb1, emb2, label)`` samples.

    The two embeddings are handed back exactly as stored; only the label is
    converted, to a float tensor.
    """

    def __init__(self, samples):
        # Indexable, sized collection of 3-item samples.
        self.samples = samples

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(emb1, emb2, label_tensor)`` for the sample at ``idx``."""
        first, second, target = self.samples[idx]
        label_tensor = torch.tensor(target, dtype=torch.float)
        return first, second, label_tensor

# Wrap the training pairs in a Dataset and serve them in shuffled mini-batches of 32.
dataset = AuthorshipVerificationDataset(pairings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [9]:
# Re-seed immediately before building the model, reusing the notebook-wide `seed`
# instead of repeating the literal, so changing the seed in one place affects the
# whole run consistently.
set_seed(seed)
# embedding_dim is the input vector size and output_dim the size of the learned
# representation; both are passed straight to the project-defined SiameseNetwork.
model = SiameseNetwork(embedding_dim=1536, output_dim=128)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# BCELoss expects probabilities in [0, 1]; presumably SiameseNetwork's forward pass
# ends in a sigmoid — confirm in utils.
criterion = nn.BCELoss()

train(model, dataloader, epochs=10, optimizer=optimizer, criterion=criterion)

Epoch 1, Loss: 0.570449318253357
Epoch 2, Loss: 0.549397030935336
Epoch 3, Loss: 0.5431215385823456
Epoch 4, Loss: 0.5457746860862688
Epoch 5, Loss: 0.5430910356399667
Epoch 6, Loss: 0.5586931243652605
Epoch 7, Loss: 0.5424103862001696
Epoch 8, Loss: 0.5428302722956995
Epoch 9, Loss: 0.561691116020273
Epoch 10, Loss: 0.5499743254281789


In [10]:
from torch.utils.data import TensorDataset, DataLoader

# One float32 tensor per test article, stacked along a new leading dimension.
embeddings_tensor = torch.stack(
    [torch.tensor(emb, dtype=torch.float32) for emb in test_df['Embedding']]
)

# Batch the test embeddings in their stored order (no shuffling), so row i of any
# per-batch output still corresponds to row i of test_df.
embeddings_dataset = TensorDataset(embeddings_tensor)
embeddings_loader = DataLoader(embeddings_dataset, batch_size=32)

def extract_embeddings(model, loader):
    """Run every batch of ``loader`` through ``model.forward_once`` and collect the outputs.

    Args:
        model: Object exposing ``eval()`` and ``forward_once(tensor)``. It is
            switched to eval mode and left in that mode.
        loader: Iterable of batches; the first element of each batch is the
            input tensor.

    Returns:
        np.ndarray: The outputs of all batches concatenated along the first
        axis, in loader order. An empty loader yields an empty array.
    """
    model.eval()

    # Inputs must live on the same device as the model's weights. A model with
    # no parameters gives no device to target, so batches are then used as-is.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else None

    batch_outputs = []
    with torch.no_grad():
        for batch in loader:
            emb = batch[0]
            if device is not None:
                emb = emb.to(device)
            batch_outputs.append(model.forward_once(emb).cpu().numpy())

    if not batch_outputs:
        # Same result the row-by-row accumulation produced for an empty loader.
        return np.array([])
    return np.concatenate(batch_outputs, axis=0)

# Project the held-out test embeddings through the trained network's forward_once.
extracted_embeddings = extract_embeddings(model, embeddings_loader)

In [11]:
# Sanity check: one row per test article, one column per model output dimension.
print(extracted_embeddings.shape)

(223, 128)


In [12]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Configure a 2-component t-SNE; the projection itself is computed in the following cell.
# Note: random_state here is 42, not the notebook-wide seed (496).
tsne = TSNE(n_components=2, random_state=42)


In [None]:
# Fit t-SNE on the extracted test embeddings and keep the resulting 2-D coordinates.
embeddings_2d = tsne.fit_transform(extracted_embeddings)