In [6]:
## IMPORTS AND SETUP
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from supabase import create_client, Client

In [7]:
## FETCH PREDEFINED EMBEDDINGS AND LABELS

import numpy as np
from supabase import create_client, Client

# Supabase setup
# Credentials are read from the environment (SUPABASE_URL / SUPABASE_KEY) so they
# can be rotated without editing the notebook. The literals below are only a
# fallback that keeps existing runs working.
# TODO(security): rotate this key and delete the hardcoded fallback - a secret
# saved inside a notebook should be treated as leaked.
import os

url = os.environ.get("SUPABASE_URL", "https://hyxoojvfuuvjcukjohyi.supabase.co")
key = os.environ.get(
    "SUPABASE_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Imh5eG9vanZmdXV2amN1a2pvaHlpIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjgzMTU4ODMsImV4cCI6MjA0Mzg5MTg4M30.eBQ3JLM9ddCmPeVq_cMIE4qmm9hqr_HaSwR88wDK8w0",
)
supabase: Client = create_client(url, key)

def fetch_predefined_embeddings_and_labels():
    """
    Fetch the pre-defined genre embeddings and genre_id labels from the genre_assignments table.

    A row is kept only when its ``genre_embedding`` is a non-empty list of
    ints/floats whose length equals that of the first valid embedding seen.
    Every other row is reported and skipped, so the kept embeddings always
    form a rectangular matrix that NumPy can convert.

    :return: Tuple ``(embeddings, labels)``. ``embeddings`` is a float NumPy
        array with one row per kept embedding (an empty 1-D array when no row
        is usable); ``labels`` is the list of the matching ``genre_id`` values.
    """
    response = supabase.table("genre_assignments").select("genre_embedding, genre_id").execute()

    embeddings = []
    labels = []
    expected_dim = None  # Length of the first valid embedding; later rows must match it

    # Treat a missing/None payload as "no rows" instead of failing on iteration
    for row in (response.data or []):
        embedding = row['genre_embedding']
        is_numeric_list = (
            isinstance(embedding, list)
            and len(embedding) > 0
            and all(isinstance(x, (int, float)) for x in embedding)
        )
        if is_numeric_list and expected_dim is None:
            expected_dim = len(embedding)

        # A length mismatch would make np.array(..., dtype=float) raise below,
        # so such rows are skipped like any other invalid embedding.
        if is_numeric_list and len(embedding) == expected_dim:
            embeddings.append(embedding)
            labels.append(row['genre_id'])  # Genre ID is the label
        else:
            print(f"Skipping invalid embedding for genre_id {row['genre_id']}")

    return np.array(embeddings, dtype=float), labels

# Fetch pre-defined embeddings and labels
predefined_embeddings, predefined_labels = fetch_predefined_embeddings_and_labels()

In [None]:
## DEFINE AND TRAIN CLASSIFIER [SIMPLE]

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define a simple feedforward classifier model
class EmbeddingClassifier(nn.Module):
    """Small feedforward network: Linear -> ReLU -> Linear, producing one score per class."""

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Hidden projection from the embedding width to 128 units
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        # Output projection: one raw score (logit) per class (genre)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x`` (no softmax is applied)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Training function
def train_classifier(embeddings, labels, input_size, num_classes, epochs=10, batch_size=32, learning_rate=0.001):
    """
    Train the classifier on the embeddings.

    :param embeddings: Pre-generated embeddings, one row per sample.
    :param labels: Corresponding genre labels for each embedding. They are used
        directly as CrossEntropyLoss class-index targets, so each one must be
        an integer in ``[0, num_classes)``.
    :param input_size: Size of the embedding vector (e.g., 768).
    :param num_classes: Number of genres (e.g., 5 genres).
    :param epochs: Number of training epochs.
    :param batch_size: Size of the training batch.
    :param learning_rate: Learning rate for the optimizer.
    :return: Trained classifier model.
    :raises ValueError: If no samples are supplied, or if a label lies outside
        ``[0, num_classes)``.
    """
    # Convert embeddings and labels to PyTorch tensors
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
    labels_tensor = torch.tensor(labels, dtype=torch.long)  # Long is needed for classification targets

    # Fail early with a clear message: an empty dataset would otherwise only
    # surface later (e.g. as a division by zero when averaging the epoch loss).
    if embeddings_tensor.numel() == 0 or labels_tensor.numel() == 0:
        raise ValueError("Cannot train classifier: no embeddings/labels were provided.")

    loss_fn = nn.CrossEntropyLoss()  # Cross-entropy loss for classification

    # Class-index targets must address one of the `num_classes` output logits.
    # The loss's own ignore_index value stays allowed because it is skipped.
    out_of_range = ((labels_tensor < 0) | (labels_tensor >= num_classes)) & (
        labels_tensor != loss_fn.ignore_index
    )
    if out_of_range.any():
        bad_labels = sorted(set(labels_tensor[out_of_range].tolist()))
        raise ValueError(
            f"Labels {bad_labels} are outside the valid range [0, {num_classes - 1}] "
            f"for num_classes={num_classes}; remap the genre ids to zero-based "
            f"class indices or increase num_classes."
        )

    # Create a DataLoader for batching
    dataset = TensorDataset(embeddings_tensor, labels_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize the classifier model
    model = EmbeddingClassifier(input_size=input_size, num_classes=num_classes)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_embeddings, batch_labels in dataloader:
            # Forward pass
            outputs = model(batch_embeddings)
            loss = loss_fn(outputs, batch_labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Report the mean batch loss for this epoch
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(dataloader)}")

    return model

# Pull the labelled genre embeddings from the genre_assignments table
predefined_embeddings, predefined_labels = fetch_predefined_embeddings_and_labels()

# Model dimensions: the embedding width is read from the data, the class count is fixed
input_size = predefined_embeddings.shape[1]  # Second axis = length of one embedding vector
num_classes = 5  # Assuming 5 genres

# Fit the classifier on the pre-defined data (all other hyperparameters keep their defaults)
model = train_classifier(
    predefined_embeddings,
    predefined_labels,
    input_size,
    num_classes,
)

In [31]:
## LOAD FINE-TUNED BERT MODEL
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Location of the saved fine-tuned checkpoint (relative to the notebook's working directory)
model_path = "./fine_tuned_bert"  # Path to fine-tuned BERT model in the current search_embeddings folder

# Load the fine-tuned tokenizer and sequence-classification model from that path.
# NOTE(review): this rebinds the notebook-level name `model`, which the classifier
# training cell also assigns - whichever of the two cells ran last determines what
# `model` refers to in later cells.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [32]:
## FETCH FULL TEXT DATA FROM URLS TABLE

def fetch_full_texts_from_supabase():
    """
    Fetch the rows of urls_table that have no genre assigned yet.

    Only rows whose genre_id is NULL are requested, and only their 'id' and
    'full_text' columns are selected.

    :return: List of row dictionaries containing 'id' and 'full_text', or an
        empty list when no row matched or the request raised an exception
        (the reason is printed in both cases).
    """
    try:
        query = supabase.table('urls_table').select('id, full_text')
        result = query.is_('genre_id', None).execute()  # IS NULL filter on genre_id

        rows = result.data
        if not rows:
            print("No rows fetched. Check if genre_id has empty values.")
            return []

        return rows

    except Exception as e:
        # Best effort: report the failure and hand back an empty result
        print(f"Failed to fetch full texts: {e}")
        return []
# Build the inputs used by the prediction and update cells. Both lists come from
# the same fetched rows, so texts[i] always belongs to remaining_ids[i]. Rows
# without any full_text are left out because there is nothing to classify.
unlabelled_rows = [row for row in fetch_full_texts_from_supabase() if row.get('full_text')]
texts = [row['full_text'] for row in unlabelled_rows]
remaining_ids = [row['id'] for row in unlabelled_rows]

# Debugging: Print the number of texts fetched
print(f"Number of texts fetched: {len(texts)}")
print(f"First text: {texts[0] if len(texts) > 0 else 'No texts available'}")


Number of texts fetched: 996
First text: In previous eras, simulated reality had cracks: fissures in the film, graininess, discoloration. These elements kept us from falling straight into the image. Today, not only has the simulation arrived at a higher resolution of life, it feels more real. Baudrillard’s paranoia that reality had already disintegrated in the 1980s unleashed an ensuing theoretical abyss with no triumphant successor. Yet, in retrospect, it seems he didn’t quite grasp the pace at which life, the image, and perception would merge, such that individuals must navigate a new tension between these opposing forces.

We are currently wrestling with the surfacing of an automated surveillance culture, marked by identity fragmentation and privacy erosion due to online platforms and new technologies like omnipresent sensors, biometrics and ambient intelligence, a phenomenon this essay terms the ambient self. Simultaneously, there exists a nostalgia for a seemingly simpler past. Ho

In [35]:
## PREDICT GENRES WITH FINE-TUNED BERT

def predict_genres_with_bert(model, texts, tokenizer, confidence_threshold=0.6):
    """
    Classify each text with the fine-tuned BERT model, one text at a time.

    :param model: Model to run; it is called with the tokenizer output and its
        ``.logits`` are turned into probabilities with a softmax.
    :param texts: The original text content to classify.
    :param tokenizer: The tokenizer for BERT.
    :param confidence_threshold: Minimum top probability needed to accept a prediction.
    :return: List with one genre per text - the index of the most probable
        class, or 5 when its probability is below ``confidence_threshold``.
    """
    model.eval()  # Set model to evaluation mode

    def classify(text):
        # Encode a single text as PyTorch tensors, truncating over-long input
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Gradients are not needed for prediction
        with torch.no_grad():
            probs = torch.softmax(model(**encoded).logits, dim=-1)

        top_prob, top_class = torch.max(probs, dim=1)
        # Genre 5 is the fallback for low-confidence predictions
        return top_class.item() if top_prob >= confidence_threshold else 5

    return [classify(text) for text in texts]

# Run the fine-tuned BERT classifier over every fetched text (default confidence threshold)
predicted_genres = predict_genres_with_bert(model=model, texts=texts, tokenizer=tokenizer)

In [25]:
## PREDICT GENRES WITH CONFIDENCE THRESHOLD DEBUGGING

def predict_genres_with_bert(model, texts, tokenizer, confidence_threshold=0.6):
    """
    Predict a genre for every text and print the confidence of each prediction.

    :param model: Model to run; its ``.logits`` output is converted to probabilities.
    :param texts: Texts to classify, processed one at a time.
    :param tokenizer: Tokenizer used to encode each text for the model.
    :param confidence_threshold: Minimum top probability needed to accept a prediction.
    :return: List with one genre per text - the most probable class index, or
        5 when its probability is below ``confidence_threshold``.
    """
    model.eval()  # Set model to evaluation mode

    results = []

    for document in texts:
        # Encode the text for the model
        encoded = tokenizer(document, return_tensors="pt", truncation=True, padding=True)

        # Forward pass without gradient tracking
        with torch.no_grad():
            probs = torch.softmax(model(**encoded).logits, dim=-1)

        # Highest class probability and the class it belongs to
        confidence, best_class = probs.max(dim=1)

        # Debug output: how sure the model is about this text
        print(f"Confidence score for text: {confidence.item()}")

        accepted = bool(confidence >= confidence_threshold)
        # Genre 5 is the fallback for low-confidence predictions
        results.append(best_class.item() if accepted else 5)

    return results

In [34]:
## PREDICT GENRE FUNCTION v3

def predict_genres_with_bert(model, texts, tokenizer, confidence_threshold=0.6, fallback_genre=5):
    """
    Predict genres for the given texts using the fine-tuned BERT model.

    The returned list always holds exactly one entry per input text, in input
    order, so it can safely be zipped with a parallel list of row ids. A text
    whose tokenization or inference fails is reported and assigned
    ``fallback_genre`` rather than being dropped - dropping it would shift
    every later prediction onto the wrong id.

    :param model: Model to run; its ``.logits`` output is converted to probabilities.
    :param texts: Texts to classify, processed one at a time.
    :param tokenizer: Tokenizer used to encode each text for the model.
    :param confidence_threshold: Minimum top probability needed to accept a prediction.
    :param fallback_genre: Genre assigned when confidence is too low or the
        text could not be processed (defaults to 5, the previous hard-coded value).
    :return: List of genres with the same length and order as ``texts``.
    """
    model.eval()  # Set model to evaluation mode

    genre_assignments = []

    for text in texts:
        # str() keeps the log line from crashing on None / non-string entries
        preview = str(text)[:50]
        print(f"Processing text: {preview}...")  # Print the first 50 characters of each text

        # Tokenize the input text
        try:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            print(f"Tokenized input: {inputs}")
        except Exception as e:
            print(f"Tokenization failed for text: {preview}... with error: {e}")
            genre_assignments.append(fallback_genre)  # Keep the output aligned with `texts`
            continue

        # Run the model on the tokenized input
        try:
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=-1)
                print(f"Model output probabilities: {probabilities}")
        except Exception as e:
            print(f"Model inference failed for text: {preview}... with error: {e}")
            genre_assignments.append(fallback_genre)  # Keep the output aligned with `texts`
            continue

        # Assign genres based on the probabilities
        max_prob, predicted_genre = torch.max(probabilities, dim=1)

        # Print confidence score (max_prob) for debugging
        print(f"Confidence score for text: {max_prob.item()}")

        if max_prob >= confidence_threshold:
            genre_assignments.append(predicted_genre.item())
        else:
            genre_assignments.append(fallback_genre)  # Fallback for low confidence

    return genre_assignments

In [15]:
## UPDATE GENRE IDS IN URLS TABLE

def update_genre_ids_in_urls_table(essay_ids, predicted_genres):
    """
    Update the genre_id in the urls_table based on the predicted genres.

    Each id is updated in its own request. A failing request is reported and
    the remaining ids are still processed.

    :param essay_ids: List of essay IDs from the urls_table (remaining_ids).
    :param predicted_genres: List of predicted genres corresponding to each essay ID.
    :raises ValueError: If the two lists differ in length; pairing them anyway
        would silently write genres to the wrong rows.
    """
    essay_ids = list(essay_ids)
    predicted_genres = list(predicted_genres)
    if len(essay_ids) != len(predicted_genres):
        raise ValueError(
            f"essay_ids ({len(essay_ids)}) and predicted_genres ({len(predicted_genres)}) "
            f"must have the same length"
        )

    for essay_id, genre_id in zip(essay_ids, predicted_genres):
        # Update the genre_id for the matching essay ID.
        # The result object is not assumed to carry an HTTP `status_code`;
        # a failed request is expected to surface as an exception from execute().
        try:
            response = supabase.table("urls_table").update({"genre_id": genre_id}).eq("id", essay_id).execute()
        except Exception as e:
            print(f"Failed to update id {essay_id}: {e}")
            continue

        # An empty payload means no updated row was returned
        if getattr(response, "data", None):
            print(f"Successfully updated id {essay_id} with genre {genre_id}")
        else:
            print(f"Failed to update id {essay_id}: no updated row was returned "
                  f"(id not found or update not permitted)")

In [None]:
def update_genre_ids_in_urls_table(essay_ids, predicted_genres):
    """
    Update the genre_id in the urls_table based on the predicted genres.

    Ids are cast to ``int`` and genres to ``float`` before each update, and
    both the attempt and the raw response are printed for inspection.

    :param essay_ids: List of essay IDs from the urls_table (remaining_ids).
    :param predicted_genres: List of predicted genres corresponding to each essay ID.
    :raises ValueError: If the two lists differ in length; pairing them anyway
        would silently write genres to the wrong rows.
    """
    essay_ids = list(essay_ids)
    predicted_genres = list(predicted_genres)
    if len(essay_ids) != len(predicted_genres):
        raise ValueError(
            f"essay_ids ({len(essay_ids)}) and predicted_genres ({len(predicted_genres)}) "
            f"must have the same length"
        )

    for raw_id, raw_genre in zip(essay_ids, predicted_genres):
        # Ensure id is an integer and genre_id is numeric (float)
        essay_id = int(raw_id)  # Cast id to int (int4 in Supabase)
        genre_id = float(raw_genre)  # Cast genre_id to float (numeric in Supabase)

        # Debugging: Print what is about to be updated
        print(f"Attempting to update id {essay_id} with genre {genre_id}")

        # Update the genre_id for the matching essay ID
        response = supabase.table("urls_table").update({"genre_id": genre_id}).eq("id", essay_id).execute()

        # Print the response to inspect it
        print(f"Response for id {essay_id}: {response}")

# Write each predicted genre back to urls_table; the two lists are paired by position
update_genre_ids_in_urls_table(essay_ids=remaining_ids, predicted_genres=predicted_genres)