In [None]:
# Import required packages
import pandas as pd
import os
import torch
import math
from transformers import AutoTokenizer, AutoModel

In [2]:
# Initialize a list to store rows for the DataFrame.
# NOTE(review): the per-file loop later in this notebook rebinds `data` to a
# fresh list for every CSV it reads, so this initial list is replaced before
# any rows are appended to it.
data = []

In [None]:
# Load the LegalBERT tokenizer and model.
# Both are looked up by the same checkpoint name so the tokenizer's vocabulary
# matches the model; `tokenizer` and `model` are module-level names that
# generate_embeddings() reads.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [4]:
# Generate embeddings
def generate_embeddings(text):
    """Embed one piece of text with the module-level LegalBERT model.

    Args:
        text: The text to embed. Missing values (``None`` or a float NaN) are
            accepted and yield a zero tensor. Any other non-string value is
            converted with ``str()`` before tokenization.

    Returns:
        The mean of the model's last hidden state taken over dimension 1
        (the token dimension), or ``torch.zeros(1, model.config.hidden_size)``
        when ``text`` is missing.
    """
    # Missing cells must not reach the tokenizer: neither None nor NaN has
    # `.replace`, so short-circuit with a zero tensor of the hidden size.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        print("Encountered NaN, returning empty tensor.")
        return torch.zeros(1, model.config.hidden_size)

    # Cells that were parsed as numbers (or other non-string objects) would
    # fail on `.replace`; embed their textual form instead.
    if not isinstance(text, str):
        text = str(text)

    # Non-breaking spaces are normalized to plain spaces before tokenization.
    text = text.replace('\xa0', ' ')

    # Input longer than 512 tokens is truncated rather than rejected.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool the last hidden state across all tokens. (Using only the
    # [CLS] position would be an alternative; this function does not do that.)
    embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings

In [5]:
# Path to the folder containing the input CSV files (one per contract)
folder_path = 'contract_topics'

# Folder that receives the per-contract embedding CSVs;
# create it if it doesn't exist (exist_ok avoids an error on re-runs)
output_folder = 'topic_embeddings'
os.makedirs(output_folder, exist_ok=True)

In [6]:
# a = os.path.join(folder_path, 'ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.PDF.csv')

In [7]:
# df3 = pd.read_csv(a)
# s = df3['Topic_text'][0]
# s = s.replace('\xa0', ' ')
# s

In [None]:
# Embed every topic of every contract CSV found in `folder_path` and write one
# output CSV per contract into `output_folder`.
# Sorting makes the processing order (and the printed log) deterministic.
for filename in sorted(os.listdir(folder_path)):
    # Create the full path to the CSV file
    file_path = os.path.join(folder_path, filename)

    # os.listdir also returns sub-directories (e.g. `.ipynb_checkpoints`),
    # which pd.read_csv cannot open; skip anything that is not a regular file.
    if not os.path.isfile(file_path):
        continue

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    data = []

    # A CSV without rows has no contract name to read and nothing to embed.
    if df.empty:
        print(f"Skipping {filename}, no rows found.")
        continue

    # The contract name is taken from the first row (positional access, so it
    # does not depend on the index labels).
    contract_name = df['Contract_name'].iloc[0]

    # Check if the output CSV already exists; if so, skip this contract so
    # re-running the cell only processes contracts that are still missing.
    output_file = os.path.join(output_folder, f'{contract_name}.csv')
    if os.path.exists(output_file):
        print(f"Skipping {contract_name}, CSV already exists.")
        continue

    # One output row per topic: name, heading, text and its embedding.
    for name, heading, topic_text in zip(df['Contract_name'], df['Topic_heading'], df['Topic_text']):
        embedding = generate_embeddings(topic_text)
        data.append({
            'Contract_Name': name,
            'Chunk_title': heading,
            'Chunk_text': topic_text,
            'Chunk_embeddings': embedding
        })

    df2 = pd.DataFrame(data)
    # Save the DataFrame to a CSV file with the contract name.
    # NOTE(review): 'Chunk_embeddings' holds torch tensors, which end up in the
    # CSV as their string form; confirm downstream readers expect that before
    # switching to e.g. a list of floats.
    df2.to_csv(output_file, index=False)
    print(f"Saved topics for {contract_name} to {output_file}")
        