In [1]:
import pandas as pd
import numpy as np
from utils import create_pairings, SiameseNetwork, train_siamese, set_seed, train_test_splitting, generate_embedding_pairs
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fix the seed first; it is reused later when the embedding pairs are generated.
seed = 496
set_seed(seed)  # project helper from utils; presumably seeds the RNG libraries in use — TODO confirm
# Load the article table; the path is relative to the notebook's working directory.
data = pd.read_parquet("../Datasets/embeddings.parquet")

In [3]:
# List the column names of the loaded frame.
data.columns

Index(['Author', 'Title', 'Section', 'Date', 'Cleaned_Content', 'Text_Length',
       'Embedding'],
      dtype='object')

In [4]:
# Keep only authors with at least five articles, counted over the full dataset.
author_counts = data['Author'].value_counts()
has_enough_articles = author_counts.ge(5)
authors_with_enough_articles = author_counts.loc[has_enough_articles].index
data_author = data.loc[data['Author'].isin(authors_with_enough_articles)]

# Then keep only the articles whose Text_Length exceeds 4000.
# The per-author counts above were taken before this length filter is applied.
is_long_enough = data_author['Text_Length'].gt(4000)
data_length = data_author.loc[is_long_enough]

In [5]:
# Split the filtered frame into train and test parts using the project helper from utils.
# NOTE(review): no seed is passed here; presumably the split relies on the global seed set earlier — confirm.
train_df, test_df = train_test_splitting(data_length)
print(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")

Train set size: 1268, Test set size: 223


In [14]:
# Build labelled embedding pairs from the training split only, passing the notebook seed to the helper.
pairings = generate_embedding_pairs(train_df, seed)

In [15]:
# Tally how many pairings carry each label; the label sits at index 2 of every pairing.
count_0, count_1 = 0, 0
for pair in pairings:
    label = pair[2]
    if label == 0:
        count_0 += 1
    elif label == 1:
        count_1 += 1

In [16]:
# Show the two label counts, one per line (label 0 first, then label 1).
print(count_0, count_1, sep="\n")

12562
12562


In [9]:
# Peek at one pairing: (first embedding, second embedding, label).
pairings[0]

(tensor([ 0.0481,  0.0749,  0.0515,  ..., -0.0137, -0.0066,  0.0017]),
 tensor([-0.0110,  0.0466,  0.0847,  ..., -0.0053, -0.0050,  0.0007]),
 0)

In [10]:
# --- Define Dataset, Model, etc. ---
# The pairings are wrapped in a PyTorch Dataset (defined below in this cell) so a DataLoader can batch them.

# 1536 is passed as the embedding size; change it if the stored embeddings have a different length.
model = SiameseNetwork(1536)  # Change embedding_size if needed
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PairingsDataset(Dataset):
    """Expose a sequence of (embedding, embedding, label) triples as a torch Dataset."""

    def __init__(self, pairings):
        # The sequence is stored by reference; no copy is made.
        self.pairings = pairings

    def __len__(self):
        """Return the number of stored pairings."""
        return len(self.pairings)

    def __getitem__(self, index):
        """Return both embeddings unchanged, plus the label converted to a float tensor."""
        first, second, target = self.pairings[index]
        label_tensor = torch.tensor(target).to(torch.float32)
        return first, second, label_tensor

# --- Create Dataloader ---
dataset = PairingsDataset(pairings)

# Re-seed NumPy's global RNG explicitly so the training run starts from a known state.
# (This must be its own statement: `worker_init_fn=np.random.seed(seed)` would call the
# function immediately and hand its return value, None, to the DataLoader.)
np.random.seed(seed)

# A dedicated, seeded generator makes the shuffle order repeatable regardless of how many
# random numbers earlier cells have already drawn from torch's global RNG.
shuffle_generator = torch.Generator()
shuffle_generator.manual_seed(seed)


def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    `worker_init_fn` has to be a callable taking the worker id; the DataLoader only
    invokes it when `num_workers > 0`, so it is inert with the default of 0 used below.
    """
    import random

    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=shuffle_generator,
)

# Move the model's parameters to the selected device before the optimizer is built.
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# --- Train the Model ---
train_siamese(model, dataloader, num_epochs=1, optimizer=optimizer, criterion=criterion, device=device)

In [None]:
import pandas as pd
from sklearn.manifold import TSNE

def evaluate_embeddings(model, df, output_file="embedding_visualization.png", random_state=None):
    """
    Project the model's embeddings to 2-D with t-SNE and save a scatter plot coloured by author.

    Args:
        model (nn.Module): Trained network, passed straight to generate_embedding.
        df (pd.DataFrame): Frame with an 'Embedding' column (fed to the model) and an
            'Author' column (used only to colour the points).
        output_file (str): Path the figure is written to.
        random_state (int | None): Forwarded to TSNE. Pass an int for a repeatable
            layout; None keeps scikit-learn's default behaviour.

    Returns:
        None.

    Raises:
        ValueError: If df has no rows.
    """
    # Imported here because no cell of the notebook imports matplotlib at the top level.
    import matplotlib.pyplot as plt

    if len(df) == 0:
        raise ValueError("evaluate_embeddings needs at least one row to visualize")

    embeddings = [generate_embedding(model, emb) for emb in df['Embedding']]

    # Stack into one (n_rows, n_features) matrix; vstack copes with per-row outputs
    # shaped either (d,) or (1, d) — which one the model produces is not visible here.
    embedding_matrix = np.vstack(embeddings)

    # Author names are strings, which scatter() cannot interpret as colours,
    # so map every distinct author to an integer code first.
    author_codes, _ = pd.factorize(df['Author'])

    # Reduce dimensionality for visualization
    embeddings_2d = TSNE(n_components=2, random_state=random_state).fit_transform(embedding_matrix)

    # Create a scatter plot on its own figure so repeated calls do not draw over each other.
    fig, ax = plt.subplots()
    ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=author_codes)
    ax.set_title("Visualization of Embeddings")
    fig.savefig(output_file)

In [None]:
# Project and plot the model's embeddings for every row of the length-filtered frame.
# NOTE(review): evaluate_embeddings calls generate_embedding, which is only defined in a later cell,
# so that cell has to be executed before this one.
# NOTE(review): data_length is the frame train_df was split from; test_df may be the intended input — confirm.
evaluate_embeddings(model, data_length)

In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import torch # Make sure you have PyTorch installed 

def generate_embedding(model, emb):
    """
    Run one pre-computed embedding vector through the model's embedding branch.

    Args:
        model (nn.Module): Trained Siamese network exposing an `embedding_branch` callable.
        emb (array-like): Input vector (e.g. a value from the 'Embedding' column);
            it is converted to a float32 tensor before the forward pass.

    Returns:
        numpy.ndarray: Output of `model.embedding_branch`, moved to the CPU.
    """
    model.eval()  # note: the model is left in eval mode when this function returns

    # Build the input on the same device as the model's weights; a CPU tensor fed to a
    # model that was moved to the GPU would fail with a device mismatch.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        inputs = torch.tensor(emb, dtype=torch.float32, device=device)
        embedding = model.embedding_branch(inputs)

    return embedding.detach().cpu().numpy()


In [None]:
# Scratch call that runs the model's embedding branch directly.
# NOTE(review): input_data is not defined in any cell visible here — confirm where it comes from,
# otherwise this cell raises NameError on a fresh kernel.
embedding = model.embedding_branch(input_data) 