In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from dotenv import load_dotenv
import polars as pl
from tqdm import tqdm
import tiktoken
import nltk
import spacy
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Tunables for the batched entity-extraction step further down
batch_size = 64
num_workers = 6  # Adjust based on your system

# Load environment variables via python-dotenv
load_dotenv()

# spaCy pipeline; `nlp` is what refine_entities uses to lemmatize entity text
nlp = spacy.load("en_core_web_lg")

In [None]:
# Initialize the NER pipeline
model_name = "xlm-roberta-large-finetuned-conll03-english"  # Your chosen model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
    # The chunks fed to this pipeline are split at up to 4000 tokens (see
    # text_splitter), which is longer than the model's input window. Without
    # `stride` the token-classification pipeline truncates each input to the
    # tokenizer's model_max_length, so entities past the first window are
    # silently lost. With `stride` the whole text is processed in overlapping
    # windows (requires a fast tokenizer and an aggregation strategy other
    # than "none", both of which hold here).
    # NOTE(review): assumes tokenizer.model_max_length is the model's real
    # window (expected 512 for this checkpoint) — confirm.
    stride=128,
    device=0  # Set to -1 if using CPU
)

In [None]:
def tiktoken_len(text):
    """Return how many tokens `text` encodes to.

    The encoding is the one tiktoken selects for the "gpt-4o" model name.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-4o")  # Or your model name
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def merge_split_sentences(chunks, max_words=4000):
    """Re-pack the sentences of consecutive chunks into texts of at most ``max_words`` words.

    Every chunk's ``'chunk_text'`` is split with ``nltk.sent_tokenize``. The
    sentences are appended (space-separated) to a running text for as long as
    ``nltk.word_tokenize`` of the combined text yields at most ``max_words``
    tokens. When a sentence does not fit, the running text is emitted and a
    new running text starts with that sentence.

    Args:
        chunks: Iterable of dicts, each with a ``'chunk_text'`` string. All
            other keys are carried over unchanged.
        max_words: Upper bound on the ``nltk.word_tokenize`` length of each
            emitted text. Defaults to 4000, the previously hard-coded limit.

    Returns:
        A list of dicts. Each one is a copy of the input chunk that supplied
        the FIRST sentence of the emitted text, with ``'chunk_text'`` replaced
        by the emitted text. Chunks that contain no sentences are skipped.

    Notes:
        * The running text is carried across chunk boundaries, so one emitted
          text may contain sentences from several input chunks.
          NOTE(review): this also merges across different source rows if
          ``chunks`` mixes them — confirm that is intended.
        * Keys other than ``'chunk_text'`` (for example a token count) are
          copied as-is and are not recomputed for the merged text.
        * The combined text is re-tokenized for every sentence; this is kept
          so the word count is exactly what ``nltk.word_tokenize`` reports
          for the combined text.
    """
    merged_chunks = []
    current_text = ""
    # Chunk that contributed the first sentence of `current_text`; its
    # metadata is attached when the text is emitted, so a text is never
    # labelled with a later chunk it does not come from.
    current_source = None

    for chunk in tqdm(chunks, desc="Merging Sentences"):
        # An empty or whitespace-only chunk yields no sentences and is skipped
        # (previously this raised IndexError on sentences[0]).
        for sentence in nltk.sent_tokenize(chunk['chunk_text']):
            if not current_text:
                current_text = sentence
                current_source = chunk
                continue

            candidate = current_text + " " + sentence
            if len(nltk.word_tokenize(candidate)) <= max_words:
                current_text = candidate
            else:
                merged_chunks.append({**current_source, 'chunk_text': current_text})
                current_text = sentence
                current_source = chunk

    # Flush whatever is left after the last chunk.
    if current_text:
        merged_chunks.append({**current_source, 'chunk_text': current_text})
    return merged_chunks

def extract_entities_transformer(texts, ner_pipeline):
    """Call `ner_pipeline` on `texts` and return its result unchanged."""
    predictions = ner_pipeline(texts)
    return predictions

def refine_entities(entities):
    """Filter entity pairs and lemmatize the text of the ones that are kept.

    Args:
        entities: One list per text, each holding ``(word, label)`` pairs.

    Returns:
        A list parallel to ``entities``. Inside each inner list, pairs whose
        label is ``'O'`` or whose word has more than five whitespace-separated
        parts are dropped; every kept word is replaced by the space-joined
        lemmas produced by the module-level ``nlp`` for that word.
    """
    refined_entities = []
    for entity_list in entities:
        kept = []
        for word, label in entity_list:
            # Skip non-entities and overly long spans before paying for nlp().
            if label == 'O' or len(word.split()) > 5:
                continue
            lemmas = [token.lemma_ for token in nlp(word)]
            kept.append((" ".join(lemmas), label))
        refined_entities.append(kept)
    return refined_entities

def process_batch(batch_texts):
    """Extract and refine the entities for one batch of texts.

    Runs the module-level ``ner_pipeline`` over ``batch_texts`` (through
    ``extract_entities_transformer``), reduces every predicted entity to a
    ``(word, entity_group)`` pair and hands the pairs to ``refine_entities``.

    Returns:
        The result of ``refine_entities``. If anything raises along the way,
        the error is printed and one empty list per input text is returned
        instead.
    """
    try:
        predictions = extract_entities_transformer(batch_texts, ner_pipeline)
        word_label_pairs = []
        for text_entities in predictions:
            pairs = []
            for entity in text_entities:
                pairs.append((entity['word'], entity['entity_group']))
            word_label_pairs.append(pairs)
        return refine_entities(word_label_pairs)
    except Exception as e:
        print(f"An error occurred: {e}")
        return [[] for _ in batch_texts]

In [None]:
# Read the cleaned podcasts table from parquet
podcasts_clean = pl.read_parquet(
    "/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_clean.parquet"
)

# Restrict the frame to the columns carried through the rest of the notebook
podcasts_clean = podcasts_clean.select([
    'post_url', 'post_title', 'series_number', 'blog_date',
    'blog_title', 'file_name', 'cleaned_text', 'tokens',
])

In [None]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,  # Adjust based on your language model's context window
    chunk_overlap=1000,
    length_function=tiktoken_len,  # chunk_size / chunk_overlap are measured with tiktoken_len
    separators=[". ", "!", "?", "\n\n", "\n", " ", ""]
)

# Create chunks for each transcript while preserving DataFrame structure using list comprehension.
# Each dict holds the row's columns plus 'chunk_id', 'chunk_text' and a 'tokens'
# value recomputed for the chunk (overriding the row-level 'tokens').
all_chunks = [
    {**row, 'chunk_id': i, 'chunk_text': chunk.page_content,  'tokens': tiktoken_len(chunk.page_content)}
    for row in podcasts_clean.iter_rows(named=True)
    for i, chunk in enumerate(text_splitter.create_documents([row['cleaned_text']]))
]

# Merge split sentences. merge_split_sentences rewrites 'chunk_text' but copies
# every other key as-is, so 'tokens' would still describe the pre-merge text;
# recompute it for the merged text so the stored count matches what is stored.
all_chunks = [
    {**merged, 'tokens': tiktoken_len(merged['chunk_text'])}
    for merged in merge_split_sentences(all_chunks)
]

In [None]:
# Build the chunk-level DataFrame from the chunk dicts; the full transcript
# ('cleaned_text') and the per-row 'chunk_id' are dropped
chunks_df = (
    pl.DataFrame(all_chunks)
    .drop(['cleaned_text', 'chunk_id'])
)

# Display the result
chunks_df

In [None]:
# Pull the chunk texts out of the DataFrame as a plain Python list
texts = chunks_df['chunk_text'].to_list()

# Consecutive slices of at most `batch_size` texts
batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

# Accumulates one entity list per text, in the same order as `texts`
entities = []

# Fan the batches out over a thread pool; executor.map yields results in
# submission order, so `entities` stays aligned with `texts`.
# NOTE(review): every worker thread calls the same module-level `ner_pipeline`
# through process_batch — confirm the pipeline is safe to share across threads.
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    for refined_entities in tqdm(
        executor.map(process_batch, batches),
        total=len(batches),
        desc="Extracting Entities",
    ):
        entities += refined_entities

# Attach the extracted entities as a new 'entities' column
chunks_df = chunks_df.with_columns(pl.Series("entities", entities))

# Display the DataFrame with the new column
chunks_df

In [None]:
# Persist the chunk-level DataFrame (with its 'entities' column) as a parquet file
chunks_df.write_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/chunks_df.parquet")