In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from dotenv import load_dotenv
import polars as pl
from tqdm import tqdm
import tiktoken
import nltk
import spacy
import os
from concurrent.futures import ThreadPoolExecutor

In [3]:
load_dotenv()
OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")

# Load spaCy for lemmatization
nlp = spacy.load("en_core_web_lg")
#nltk.download('punkt_tab')
# Define batch size and number of workers
batch_size = 64
num_workers = 6  # Adjust based on your system

In [4]:
# Initialize the NER pipeline
model_name = "xlm-roberta-large-finetuned-conll03-english"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner", 
    model=model, 
    tokenizer=tokenizer, 
    aggregation_strategy="max", 
    device=0  # Set to -1 if using CPU
)

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def tiktoken_len(text):
    encoding = tiktoken.encoding_for_model("gpt-4o")  # Or your model name
    return len(encoding.encode(text))

def merge_split_sentences(chunks):
    merged_chunks = []
    current_chunk = ""
    for chunk in tqdm(chunks, desc="Merging Sentences"):
        sentences = nltk.sent_tokenize(chunk['chunk_text'])  # Use 'chunk_text' instead of 'text'
        if not current_chunk: 
            current_chunk = sentences[0] 
            sentences = sentences[1:]
        for sentence in sentences:
            if len(nltk.word_tokenize(current_chunk + " " + sentence)) <= 4000:  # Example word limit
                current_chunk += " " + sentence
            else:
                merged_chunks.append({**chunk, 'chunk_text': current_chunk})  # Keep other chunk data
                current_chunk = sentence
    if current_chunk: 
        merged_chunks.append({**chunk, 'chunk_text': current_chunk})
    return merged_chunks

def extract_entities_transformer(texts, ner_pipeline):
    return ner_pipeline(texts)

def refine_entities(entities):
    refined_entities = [
        [
            (" ".join([token.lemma_ for token in nlp(word)]), label)
            for word, label in entity_list
            if label != 'O' and len(word.split()) <= 5
        ]
        for entity_list in entities
    ]
    return refined_entities

# Function to process a single batch
def process_batch(batch_texts):
    try:
        batch_entities = extract_entities_transformer(batch_texts, ner_pipeline)
        refined = [
            [(entity['word'], entity['entity_group']) for entity in text_entities]
            for text_entities in batch_entities
        ]
        return refine_entities(refined)
    except Exception as e:
        print(f"An error occurred: {e}")
        return [[] for _ in batch_texts]

In [5]:
podcasts_clean = pl.read_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_clean.parquet")

qa = pl.read_csv("/Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired-qa-evaluation.csv")
podcasts_clean = podcasts_clean[['post_url', 'post_title', 'series_number', 'blog_date',
                                 'blog_title', 'file_name', 'cleaned_text','tokens']]

# Paso 2: Filtrar las filas que tienen QA
podcasts_clean = podcasts_clean.join(
    qa.select('file_name').unique(),
    on='file_name',
    how='semi'
)
podcasts_clean

post_url,post_title,series_number,blog_date,blog_title,file_name,cleaned_text,tokens
str,str,str,date,str,str,str,i64
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""costco""","""I don't think I have ever been…",37417
"""https://www.acquired.fm/episod…","""Nvidia Part III: The Dawn of t…","""Season 13, Episode 3""",2023-09-05,"""The Complete History & Strateg…","""nvidia_part_iii_the_dawn_of_th…","""Do you like my Bucks T-shirt? …",35198
"""https://www.acquired.fm/episod…","""Visa""","""Season 13, Episode 4""",2023-11-26,"""The Complete History & Strateg…","""visa""","""It's funny. When we picked thi…",45242
"""https://www.acquired.fm/episod…","""Renaissance Technologies""","""Season 14, Episode 3""",2024-03-17,"""The Complete History & Strateg…","""renaissance_technologies""","""I always used to misspell Rena…",39313
"""https://www.acquired.fm/episod…","""Porsche (with Doug DeMuro)""","""Season 12, Episode 6""",2023-06-26,"""The Complete History & Strateg…","""porsche_with_doug_demuro""","""It's definitely por-shuh. Por-…",43266
…,…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""airbnb""","""Welcome to season 7, episode 8…",31354
"""https://www.acquired.fm/episod…","""SpaceX""","""Season 6, Episode 7""",2020-05-26,"""Related Episodes""","""spacex""","""Welcome to Season Six, Episode…",29261
"""https://www.acquired.fm/episod…","""Disney, Plus""","""Season 5, Episode 7""",2019-11-25,"""Related Episodes""","""disney_plus""","""Disney makes it approachable, …",25734
"""https://www.acquired.fm/episod…","""WhatsApp""","""Season 6, Episode 1""",2020-01-28,"""Related Episodes""","""whatsapp""","""I do have to say, that based o…",21434


In [6]:
# # Import data
chunk_df = pl.read_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/chunks_df.parquet")


# Paso 2: Filtrar las filas que tienen QA
chunks_clean = chunk_df.join(
    qa.select('file_name').unique(),
    on='file_name',
    how='semi'
)
chunks_clean

# Define the updated titles as a dictionary
updated_titles = {
    "Porsche (with Doug DeMuro)": "Porsche",
    "The Electronic Arts IPO (with Trip Hawkins)": "EA IPO",
    "Nvidia Part I: The GPU Company (1993-2006)": "Nvidia Part I",
    "Nvidia Part II: The Machine Learning Company (2006-2022)": "Nvidia Part II",
    "Nvidia Part III: The Dawn of the AI Era (2022-2023)": "Nvidia Part III",
    "Arena Show Part II: Brooks Running (with CEO Jim Weber)": "Arena Show Part II",
    "Ethereum (with Packy McCormick)": "Ethereum",
    "FTX (with Sam Bankman-Fried & Mario Gabriele)": "FTX",
    "Amazon Web Services": "AWS",
    "Renaissance Technologies": "Renaissance Tech",
    "Berkshire Hathaway Part I": "B. Hathaway Part I"
}

# Apply the replacement using the `replace` method
chunks_clean = chunks_clean.with_columns(
    pl.col("post_title").replace(updated_titles).alias("post_title")
)



In [7]:
# Assuming 'processed_df' is your Polars DataFrame
# Count the number of chunks per podcast title
chunk_counts = chunks_clean['post_title'].value_counts().sort('post_title')
chunk_counts.columns = ['post_title', 'chunk_count']
# Sort the chunk counts from lower to higher
chunk_counts_sorted = chunk_counts.sort('chunk_count', descending=False)

chunk_counts_sorted

post_title,chunk_count
str,u32
"""Arena Show Part II""",4
"""FTX""",6
"""Nvidia Part I""",7
"""WhatsApp""",7
"""Disney, Plus""",8
…,…
"""Nintendo's Origins""",13
"""Porsche""",13
"""Enron""",14
"""Visa""",14


In [8]:
import polars as pl
import plotly.express as px

mean_chunk_size = chunk_counts_sorted['chunk_count'].mean()

# Create a bar chart using Plotly Express with sorted chunk counts
fig = px.bar(chunk_counts_sorted, 
             x='post_title', 
             y='chunk_count', 
             labels={'post_title': 'Podcast', 'chunk_count': 'Número de Chunks'},
             title='Número de Chunks por Podcast'
            )

# Add a horizontal line for the mean chunk size
fig.add_hline(y=mean_chunk_size, line_dash="dash", line_color="red",
              annotation_text=f"Tamaño medio: {mean_chunk_size:.2f}", 
              annotation_position="top left")

# Adjust the layout for better readability
fig.update_layout(
    autosize=False,
    width=1200,  # Adjust width as needed
    height=600,  # Adjust height as needed
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
    xaxis_title='Podcast',
    yaxis_title='Número de Chunks',
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),  # Adjust margins to remove white space
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
)

# Show the plot
fig.show()


In [9]:
# # Initialize the text splitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=8000,  # Adjust based on your language model's context window
#     chunk_overlap=0.2*8000,
#     length_function=tiktoken_len, 
#     separators=[". ", "!", "?", "\n\n", "\n", " ", ""] 
# )

# # Create chunks for each transcript while preserving DataFrame structure using list comprehension
# all_chunks = [
#     {**row, 'chunk_id': i, 'chunk_text': chunk.page_content,  'tokens': tiktoken_len(chunk.page_content)}
#     for row in podcasts_clean.iter_rows(named=True)
#     for i, chunk in enumerate(text_splitter.create_documents([row['cleaned_text']]))
# ]

# # Merge split sentences
# all_chunks = merge_split_sentences(all_chunks)

In [10]:
# # Create a new DataFrame with the chunks and original columns, excluding 'cleaned_text'
# chunks_df = pl.DataFrame(all_chunks).drop(['cleaned_text', 'chunk_id']) 

# # Verify the new DataFrame
# chunks_df

In [11]:
# # Extract 'chunk_text' as a list for batch processing
# texts = chunks_df['chunk_text'].to_list()

# # Create list of batches
# batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

# # Initialize list to store all entities
# entities = []

# # Use ThreadPoolExecutor for parallel processing
# with ThreadPoolExecutor(max_workers=num_workers) as executor:
#     for refined_entities in tqdm(executor.map(process_batch, batches), total=len(batches), desc="Extracting Entities"):
#         entities.extend(refined_entities)

# # Add the 'entities' column to the DataFrame
# chunks_df = chunks_df.with_columns([
#     pl.Series("entities", entities)
# ])

# # Verify the new DataFrame with entities
# chunks_df

In [12]:
import polars as pl
from openai import OpenAI
import logging
import json
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from openai import OpenAI
import json
from typing import Dict
import logging
import time
import os
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

client = OpenAI(api_key=OPEN_AI_API_KEY)

def create_chunks(text: str, chunk_size: int = 9000, overlap: int = 1800) -> List[str]:
    """
    Split text into overlapping chunks
    """
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks

def chunk_text(df: pl.DataFrame) -> pl.DataFrame:
    """
    Split cleaned_text into overlapping chunks while preserving context
    """
    exploded_df = df.with_columns([
        pl.col("cleaned_text")
        .map_elements(
            create_chunks,
            return_dtype=pl.List(pl.Utf8)
        )
        .alias("text_chunks")
    ]).explode("text_chunks")
    
    return exploded_df.with_columns([
        pl.Series(range(len(exploded_df))).alias("chunk_id")
    ])

def extract_podcast_entities(text: str, retries=3) -> Dict:
    prompt = f"""Extract entities from this text segment.
    Return a JSON object exactly like this example:
    {{"entities":[{{"entity":"Apple","type":"COMPANY"}},{{"entity":"New York","type":"LOCATION"}}]}}

    Text: {text}
    """

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are an entity extractor. Return ONLY compact JSON without formatting."},
                 {"role": "user", "content": prompt}
                ],
                temperature=0
                
            )

            try:
                response_text = response.choices[0].message.content.strip()
                result = json.loads(response_text)
                
                if isinstance(result, dict) and "entities" in result:
                    return result
                    
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error: {e}")
                if attempt < retries - 1:
                    time.sleep(2 ** attempt)
                    continue

        except Exception as e:
            logger.error(f"API call error: {e}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
                continue

    return {"entities": []}


def process_batch(texts: List[str]) -> List[Dict]:
    """
    Process a batch of text chunks
    """
    results = []
    for text in texts:
        try:
            result = extract_podcast_entities(text)
            results.append(result)
        except Exception as e:
            logger.error(f"Batch processing error: {str(e)}")
            results.append({"entities": []})
    return results

import uuid

def update_chunk_id(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
        pl.arange(0, pl.len()).alias("row_nr")
    )
    df = df.with_columns(
        (pl.col("file_name").rank(method="ordinal").over("file_name") -1).cast(pl.UInt32).alias("chunk_id")
    )

    df = df.with_columns(
        (pl.col("file_name") + "_" + pl.col("chunk_id").cast(pl.Utf8)).alias("chunk_id")
    )
    df = df.drop("row_nr")
    return df


def process_podcast_content(df: pl.DataFrame, batch_size: int = 5, num_workers: int = 3) -> pl.DataFrame:
    """
    Process all chunks in the DataFrame with parallel processing
    """
    # Get chunks for processing
    chunks = df["text_chunks"].to_list()
    
    # Create batches
    batches = [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]
    
    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for batch in batches:
            futures.append(executor.submit(process_batch, batch))
            
        for future in tqdm(futures, total=len(futures), desc="Processing chunks"):
            batch_results = future.result()
            all_results.extend(batch_results)
    
    # Add results and UUID to DataFrame
    return df.with_columns([
        pl.Series("entities", all_results),
        pl.Series("chunk_uuid", [str(uuid.uuid4()) for _ in range(len(df))])
    ])

# Usage
try:
    # First create chunks
    chunked_df = chunk_text(podcasts_clean)
    
    # Then process chunks with entities extraction
    processed_df = process_podcast_content(
        df=chunked_df,
        batch_size=3,  # Small batch size to avoid API rate limits
        num_workers=2  # Fewer workers to manage API calls
    )

    # Update chunk_id
    processed_df = update_chunk_id(processed_df)

    logger.info("Processing completed successfully")
    
except Exception as e:
    logger.error(f"Processing failed: {str(e)}")
    raise


Processing chunks:   0%|          | 0/39 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   3%|▎         | 1/39 [00:29<18:27, 29.14s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/cha

In [13]:
processed_df.write_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_entities_20241212.parquet")

In [14]:
# import polars as pl

# def update_chunk_id(df: pl.DataFrame) -> pl.DataFrame:
#     df = df.with_columns(
#         pl.arange(0, pl.len()).alias("row_nr")
#     )
#     df = df.with_columns(
#         (pl.col("file_name").rank(method="ordinal").over("file_name") -1).cast(pl.UInt32).alias("chunk_id")
#     )

#     df = df.with_columns(
#         (pl.col("file_name") + "_" + pl.col("chunk_id").cast(pl.Utf8)).alias("chunk_id")
#     )
#     df = df.drop("row_nr")
#     return df



In [15]:
processed_df["entities"][0]

{'entities': [{'entity': 'Charlie Munger', 'type': 'PERSON'},
  {'entity': 'Acquired', 'type': 'ORGANIZATION'},
  {'entity': 'Ben Gilbert', 'type': 'PERSON'},
  {'entity': 'David Rosenthal', 'type': 'PERSON'},
  {'entity': 'Costco', 'type': 'COMPANY'},
  {'entity': 'Kirkland Signature', 'type': 'BRAND'},
  {'entity': 'Nike', 'type': 'COMPANY'},
  {'entity': 'Seattle', 'type': 'LOCATION'},
  {'entity': 'Jim Sinegal', 'type': 'PERSON'},
  {'entity': 'Jeffrey Brotman', 'type': 'PERSON'},
  {'entity': 'Starbucks', 'type': 'COMPANY'},
  {'entity': 'Sol Price', 'type': 'PERSON'},
  {'entity': 'FedMart', 'type': 'COMPANY'},
  {'entity': 'Price Club', 'type': 'COMPANY'},
  {'entity': 'Fedco', 'type': 'COMPANY'},
  {'entity': 'New York City', 'type': 'LOCATION'},
  {'entity': 'Bronx', 'type': 'LOCATION'},
  {'entity': 'Belarus', 'type': 'LOCATION'},
  {'entity': 'Ellis Island', 'type': 'LOCATION'},
  {'entity': 'San Diego', 'type': 'LOCATION'},
  {'entity': 'USC', 'type': 'ORGANIZATION'},
  {'e

In [16]:
import polars as pl

def count_entities(df: pl.DataFrame) -> pl.DataFrame:
    # Extract the list of entities from the struct
    df = df.with_columns(
        pl.col("entities").struct.field("entities").alias("entities_list")
    )

    # Explode the list of entities
    df = df.explode("entities_list")

    # Extract 'entity' and 'type' from the 'entities_list' struct
    df = df.with_columns([
        pl.col("entities_list").struct.field("entity").alias("entity"),
        pl.col("entities_list").struct.field("type").alias("entity_type")
    ])

    # Count entities by file_name and entity_type
    entities_by_podcast = df.group_by(["file_name", "entity_type"]).agg(
        pl.len().alias("count")
    ).pivot(
        index="file_name",
        on="entity_type",
        values="count"
    ).fill_null(0)

    # Compute total counts for each entity_type
    total_counts = entities_by_podcast.select(
        pl.lit("Total").alias("file_name"),
        *[
            pl.col(col).sum().alias(col) for col in entities_by_podcast.columns if col != "file_name"
        ]
    )

    # Append the total row to entities_by_podcast
    full_table = entities_by_podcast.vstack(total_counts)

    return full_table

# Assuming processed_df is your DataFrame with columns 'entities' and 'file_name'
entities_count = count_entities(processed_df)

print("Entidades:")
entities_count

Entidades:


file_name,LAW,DATE,PROGRAM,ORGANIZATION,PERSON,STORY,LOCATION,INDUSTRY,PRODUCT,PLATFORM,DOCUMENT,MEDIA,FRANCHISE,CONSOLE,COMPANY,MOVIE,EVENT,WEBSITE,TEAM,CRYPTOCURRENCY,BOOK_SERIES,PODCAST,PUBLICATION,PROJECT,null,PROGRAMMING_LANGUAGE,COMMUNITY,FUND,BOOK,SYSTEM,WORK_OF_ART,BAND,TECHNOLOGY,CURRENCY,SHOW,CONCEPT,ENTERTAINMENT,FINANCIAL_PRODUCT,LEAGUE,MYTHICAL_CREATURE,AWARD,GROUP,SOCIAL_MEDIA_HANDLE,INDEX,TV SHOW,MONEY,TIMEZONE,WORK,TV_SHOW,TITLE,OTHER,STANDARD,CHARACTER,BRAND,URL,ETHNICITY,INSTITUTION,NATIONALITY,LITERATURE,SPORT,LEGISLATION,FINANCIAL_INSTRUMENT,ARTIST,SERVICE,COMPLIANCE,FINANCIAL_EVENT,ENTITY,RELIGIOUS_TEXT,EMAIL,LEGAL_EVENT,GAME
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""enron""",1,0,0,16,60,0,20,0,1,0,0,0,1,0,94,0,0,0,0,0,0,0,6,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,1,0,0,0
"""peloton""",0,2,0,34,50,0,14,0,2,1,0,0,0,0,69,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""the_nba""",0,0,3,66,90,0,30,2,9,0,0,0,0,0,33,0,1,0,28,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
"""nvidia_part_iii_the_dawn_of_th…",0,12,0,12,37,0,2,0,51,0,1,0,0,0,90,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,17,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""the_electronic_arts_ipo_with_t…",0,0,0,5,40,0,6,0,34,0,0,0,0,0,49,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""qualcomm""",0,0,0,11,29,0,20,0,5,0,0,0,0,0,78,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""amazoncom""",0,4,0,6,99,0,32,0,28,0,0,0,0,0,142,0,1,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""whatsapp""",0,0,0,21,25,0,17,0,15,0,0,0,0,0,45,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""nvidia_part_i_the_gpu_company_…",0,8,0,8,35,0,10,0,25,0,0,0,0,0,51,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
import polars as pl
import plotly.express as px

# Assuming 'processed_df' is your Polars DataFrame
# Extract the totals row
totals_row = entities_count.filter(pl.col('file_name') == 'Total')

# Convert to a dictionary for easier manipulation
totals_dict = totals_row.to_dicts()[0]

# Sort the totals in descending order and get the top 10
sorted_totals = sorted(
    [(k, v) for k, v in totals_dict.items() if k != 'file_name' and isinstance(v, int)],
    key=lambda x: x[1],
    reverse=True
)[:10]

# Create a DataFrame for plotting
top_10_df = pl.DataFrame({
    'column_name': [item[0] for item in sorted_totals],
    'total_count': [item[1] for item in sorted_totals]
})

# Calculate the median of the totals
median_total = top_10_df['total_count'].median()

# Create a bar chart using Plotly Express
fig = px.bar(top_10_df, 
             x='column_name', 
             y='total_count', 
             labels={'column_name': 'Tipo de Entidad', 'total_count': 'Número de Apariciones'},
             title='Top 10 Columns by Total Count',
             color='total_count',  # Color bars by total count for visual distinction
             color_continuous_scale='Viridis')

# Add a horizontal line for the median total count
fig.add_hline(y=median_total, line_dash="dash", line_color="red",
              annotation_text=f"Mediana Aparicionest: {median_total:.2f}", 
              annotation_position="top right")

# Adjust the layout for better readability
fig.update_layout(
    autosize=False,
    width=1200,  # Adjust width as needed
    height=600,  # Adjust height as needed
    xaxis_tickangle=0,  # Rotate x-axis labels for better visibility
    xaxis_title='Tipo de Entidad',
    yaxis_title='Número de Apariciones',
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),  # Adjust margins to remove white space
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
)

# Show the plot
fig.show()


In [18]:

import polars as pl
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# Function to get embeddings from OpenAI
def get_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding

# Generate embeddings for each file name
embeddings = np.array([get_embedding(name) for name in processed_df['file_name']])


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

In [19]:
# Compute cosine similarity matrix
cosine_sim_embeddings = cosine_similarity(embeddings)

# Create a DataFrame for the cosine similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim_embeddings, index=processed_df["file_name"], columns=processed_df["file_name"])

# Plot the heatmap using Plotly
fig = px.imshow(cosine_sim_df, 
                labels=dict(x="Podcast", y="Podcast", color="Similaridad (Coseno)"),
                x=processed_df["file_name"],
                y=processed_df["file_name"],
                color_continuous_scale='Viridis')

# Adjust the layout to make the graph larger and remove the white frame
fig.update_layout(
    title="Similaridad Semántica de los Títulos de los Podcasts (Embeddings)",
    autosize=False,
    width=1200,  # Adjust width as needed
    height=1000,  # Adjust height as needed
    margin=dict(l=50, r=50, t=100, b=50),  # Adjust margins to remove white space
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
    font=dict(size=10),  # Adjust font size for better readability
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
    yaxis_tickangle=0  # Keep y-axis labels horizontal
)

# Show the plot
fig.show()

In [27]:
def extract_relationships(text: str, retries=3) -> Dict:
    prompt = f"""Extract relationships between entities from this text segment.
    Return a JSON object exactly like this example:
    {{"relationships":[{{"source":"Steve Jobs","relationship":"founded","target":"Apple","context":"in 1976"}},{{"source":"Steve Jobs","relationship":"born","target":"San Francisco","context":"in 1955"}}]}}

    Only extract relationships that are explicitly mentioned in the text.
    Relationships should be specific and factual, not inferred.
    Include relevant context when available (dates, locations, or additional details).

    Text: {text}
    """ 
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    { "role": "system", "content": "You are a relationship extractor. Extract only explicit relationships between entities in the text."},
                    {"role": "user",  "content": prompt}
                ],
                temperature=0
            )
            
            # Log the raw response
            logger.info(f"Raw API response: {response}")
            
            # Check if the response contains a valid JSON
            if response.choices and response.choices[0].message.content:
                content = response.choices[0].message.content.strip()
                try:
                    result = json.loads(content)
                    if not isinstance(result, dict) or "relationships" not in result:
                        logger.warning(f"Response does not contain expected JSON structure: {content}")
                        return {"relationships": []}
                    return result
                except json.JSONDecodeError as e:
                    logger.error(f"JSON parsing error: {str(e)}. Content: {content}")
            else:
                logger.error(f"API response does not contain choices or content: {response}")
        
        except Exception as e:
            logger.error(f"Relationship extraction failed (attempt {attempt + 1}/{retries}): {str(e)}")
            if attempt == retries - 1:
                return {"relationships": []}
            time.sleep(2 ** attempt)  # Exponential backoff

    return {"relationships": []}



In [28]:
processed_df = pl.read_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_entities_20241212.parquet")

In [30]:
extract_relationships(processed_df["text_chunks"][0])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Raw API response: ChatCompletion(id='chatcmpl-AdqqICUJWgv9knpdjGmVwSqVqAC7o', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{\n    "relationships": [\n        {\n            "source": "Costco",\n            "relationship": "founded",\n            "target": "Seattle",\n            "context": "in 1983 by Jim Sinegal and Jeffrey Brotman"\n        },\n        {\n            "source": "Jim Sinegal",\n            "relationship": "co-founded",\n            "target": "Costco",\n            "context": "in 1983"\n        },\n        {\n            "source": "Jeffrey Brotman",\n            "relationship": "co-founded",\n            "target": "Costco",\n            "context": "in 1983"\n        },\n        {\n            "source": "Sol Price",\n            "relationship": "founded",\n            "target": "FedMart",\n            "c

{'relationships': []}

In [31]:
relationships = extract_relationships(processed_df)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Raw API response: ChatCompletion(id='chatcmpl-AdqqhQdfEVWcMQ2jt13U347QGTnDD', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{"relationships":[{"source":"Jim Sinegal","relationship":"founded","target":"Costco","context":""}]}\n```', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1734061307, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_a34b2727d6', usage=CompletionUsage(completion_tokens=30, prompt_tokens=2376, total_tokens=2406, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=2176)))
ERROR:__main__:JSON parsing error: Expecting value: line 1 column 1 (char 0).

In [33]:
relationships

{'relationships': []}

In [None]:
import polars as pl
import plotly.express as px

# Assuming 'processed_df' is your Polars DataFrame
# Extract the totals row
totals_row = entities_count.filter(pl.col('file_name') == 'Total')

# Convert to a dictionary for easier manipulation
totals_dict = totals_row.to_dicts()[0]

# Sort the totals in descending order and get the top 10
sorted_totals = sorted(
    [(k, v) for k, v in totals_dict.items() if k != 'file_name' and isinstance(v, int)],
    key=lambda x: x[1],
    reverse=True
)[:10]

# Create a DataFrame for plotting
top_10_df = pl.DataFrame({
    'column_name': [item[0] for item in sorted_totals],
    'total_count': [item[1] for item in sorted_totals]
})

# Calculate the median of the totals
median_total = top_10_df['total_count'].median()

# Create a bar chart using Plotly Express
fig = px.bar(top_10_df, 
             x='column_name', 
             y='total_count', 
             labels={'column_name': 'Tipo de Entidad', 'total_count': 'Número de Apariciones'},
             title='Top 10 Columns by Total Count',
             color='total_count',  # Color bars by total count for visual distinction
             color_continuous_scale='Viridis')

# Add a horizontal line for the median total count
fig.add_hline(y=median_total, line_dash="dash", line_color="red",
              annotation_text=f"Mediana Aparicionest: {median_total:.2f}", 
              annotation_position="top right")

# Adjust the layout for better readability
fig.update_layout(
    autosize=False,
    width=1200,  # Adjust width as needed
    height=600,  # Adjust height as needed
    xaxis_tickangle=0,  # Rotate x-axis labels for better visibility
    xaxis_title='Tipo de Entidad',
    yaxis_title='Número de Apariciones',
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),  # Adjust margins to remove white space
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
)

# Show the plot
fig.show()


In [None]:
# First create the chunked dataframe
chunked_df = chunk_text(podcasts_clean)

# Verify the chunks were created correctly
print(f"Original rows: {len(podcasts_clean)}")
print(f"Chunks created: {len(chunked_df)}")


In [None]:
chunked_df

In [None]:
chunked_df['text_chunks'][0]

In [None]:
extract_podcast_entities(chunked_df['text_chunks'][0])

In [None]:
extract_relationships(chunked_df['text_chunks'][0])

In [14]:
#chunks_df.write_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/chunks_df.parquet")