In [1]:
# Standard Library Imports
import sys
import logging
from pathlib import Path

# Third-Party Imports
import polars as pl
from tqdm import tqdm
from dotenv import load_dotenv
import plotly.express as px

# Local Imports
from grag.text_utils import load_and_process_podcasts

In [2]:
# Load environment variables
load_dotenv()

# Configure the logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
# Define paths
csv_path = "/Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired_metadata.csv"
text_folder = Path("/Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired-individual-transcripts/acquired-individual-transcripts")

In [4]:
import os
def read_text_files(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                data.append({"filename": filename, "content": content})
    return data

# Read text files
data = read_text_files(text_folder)

# Create a Polars DataFrame
df = pl.DataFrame(data)

# Display the DataFrame
df

filename,content
str,str
"""highperformance_hiring_intervi…","""Transcript: (disclaimer: may…"
"""the_2022_state_of_crypto_web3_…","""Transcript: (disclaimer: may…"
"""slack_salesforce_emergency_pod…","""Transcript: (disclaimer: may…"
"""the_uber_didi_chuxing_merger_w…","""Transcript: (disclaimer: may…"
"""nvidia_part_i_the_gpu_company_…","""Transcript: (disclaimer: may…"
…,…
"""andreessen_horowitz_part_i.txt""","""Transcript: (disclaimer: may…"
"""kindergarten_ventures_vanta_in…","""Transcript: (disclaimer: may…"
"""adapting_episode_1_canlis.txt""","""Transcript: (disclaimer: may…"
"""short_the_death_of_sega.txt""","""Transcript: (disclaimer: may…"


In [5]:
metadata = pl.read_csv(csv_path)

metadata.filter(pl.col("has_transcript") == False)  

post_url,post_title,series_number,blog_date,blog_title,file_name,has_transcript
str,str,str,str,str,str,bool
"""https://www.acquired.fm/episod…","""Visa Follow-Up and Today’s Pay…","""ACQ2 Episode""","""December 3, 2023""","""Related Episodes""","""visa_followup_and_todays_payme…",false
"""https://www.acquired.fm/episod…","""Crypto Self-Custody 101 (with …","""ACQ2 Episode""","""December 2, 2022""","""Related Episodes""","""crypto_selfcustody_101_with_au…",false
"""https://www.acquired.fm/episod…","""Reinsurance, Climate + Kinderg…","""ACQ2 Episode""","""September 8, 2022""","""Related Episodes""","""reinsurance_climate_kindergart…",false
"""https://www.acquired.fm/episod…","""Travel for the Creator Economy""","""ACQ2 Episode""","""April 15, 2022""","""Related Episodes""","""travel_for_the_creator_economy""",false
"""https://www.acquired.fm/episod…","""Mission-Driven Founders, Globa…","""ACQ2 Episode""","""December 30, 2021""","""Related Episodes""","""missiondriven_founders_global_…",false
…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""Pixar""","""Season 1, Episode 1""","""October 15, 2015""","""Related Episodes""","""pixar""",false
"""https://www.acquired.fm/episod…","""Instagram""","""Season 1, Episode 2""","""October 31, 2015""","""Related Episodes""","""instagram""",false
"""https://www.acquired.fm/episod…","""Twitch""","""Season 1, Episode 3""","""November 15, 2015""","""Related Episodes""","""twitch""",false
"""https://www.acquired.fm/episod…","""Bungie (with Xbox Co-Founder E…","""Season 1, Episode 4""","""November 29, 2015""","""Related Episodes""","""bungie_with_xbox_cofounder_ed_…",false


In [6]:
import os

# Specify the folder path
folder_path = "/Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired-individual-transcripts/acquired-individual-transcripts"

# List all files in the directory
files = os.listdir(folder_path)

# Count the number of files
file_count = len(files)

# Print the number of files
print(f"Number of files in the folder: {file_count}")

Number of files in the folder: 200


In [7]:
qa = pl.read_csv("/Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired-qa-evaluation.csv")

# Join the qa DataFrame with the metadata DataFrame on file_name
qa_with_metadata = qa.join(metadata, on="file_name", how="left")

# Ensure that the post_title from the metadata DataFrame is included in the qa DataFrame
qa_with_post_title = qa_with_metadata.select(["file_name", "question", "post_title"])

# Display the final DataFrame
qa_with_post_title

file_name,question,post_title
str,str,str
"""airbnb""","""When did Airbnb go public, wha…","""Airbnb"""
"""airbnb""","""Why did Wimdu unlike Airbnb no…","""Airbnb"""
"""airbnb""","""Why does market fragmentation …","""Airbnb"""
"""costco""","""How many hot dogs does Costco …","""Costco"""
"""costco""","""What store was created as ""the…","""Costco"""
…,…,…
"""enron""","""How did Enron remove investmen…","""Enron"""
"""enron""","""The fall of Enron is similar t…","""Enron"""
"""ftx_with_sam_bankmanfried_mari…","""What metrics did FTX judge its…","""FTX (with Sam Bankman-Fried & …"
"""the_electronic_arts_ipo_with_t…","""When was Trip Hawkins exposed …","""The Electronic Arts IPO (with …"


In [8]:
# Group by file_name and count the number of questions
question_counts = qa_with_post_title.group_by("post_title").agg(pl.count("question").alias("question_count"))

# Display the unique file names and their question counts
[question_counts["post_title"]]

[shape: (27,)
 Series: 'post_title' [str]
 [
 	"Disney, Plus"
 	"FTX (with Sam Bankman-Fried & …
 	"Enron"
 	"WhatsApp"
 	"Airbnb"
 	…
 	"Costco"
 	"Renaissance Technologies"
 	"Nvidia Part II: The Machine Le…
 	"Qualcomm"
 	"The NBA"
 ]]

In [9]:
# Define the updated titles as a dictionary
updated_titles = {
    "Porsche (with Doug DeMuro)": "Porsche",
    "The Electronic Arts IPO (with Trip Hawkins)": "EA IPO",
    "Nvidia Part I: The GPU Company (1993-2006)": "Nvidia Part I",
    "Nvidia Part II: The Machine Learning Company (2006-2022)": "Nvidia Part II",
    "Nvidia Part III: The Dawn of the AI Era (2022-2023)": "Nvidia Part III",
    "Arena Show Part II: Brooks Running (with CEO Jim Weber)": "Arena Show Part II",
    "Ethereum (with Packy McCormick)": "Ethereum",
    "FTX (with Sam Bankman-Fried & Mario Gabriele)": "FTX",
    "Amazon Web Services": "AWS",
    "Renaissance Technologies": "Renaissance Tech",
    "Berkshire Hathaway Part I": "B. Hathaway Part I"
}

# Apply the replacement using the `replace` method
question_counts = question_counts.with_columns(
    pl.col("post_title").replace(updated_titles).alias("post_title")
)

# Display the updated DataFrame
question_counts = question_counts.sort("question_count")

In [10]:
# Calculate the average number of questions
avg_questions = round(question_counts.select(pl.col("question_count").mean()).item(), 0)
avg_questions

3.0

In [120]:
import plotly.express as px

# Sort the DataFrame from less to more questions
question_counts = question_counts.sort("question_count")

# Calculate the average number of questions
avg_questions = question_counts.select(pl.col("question_count").mean()).item()

# Create the bar plot
fig = px.bar(
    question_counts.to_dict(as_series=False),  # Convert Polars DataFrame to list of dicts
    x="post_title",
    y="question_count",
    title="Distribución de Preguntas por Podcast",
    labels={"post_title": "Título del Post", "question_count": "Número de Preguntas"},
    hover_data=["post_title"]
)

# Customize the layout
fig.update_layout(
    # Make the background transparent
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',

    # Set y-axis to show all numbers from 0 to 6
    yaxis=dict(
        range=[0, 6],
        dtick=1
    ),

    # Rotate x-axis labels for better readability
    xaxis_tickangle=-45,

    # Update axis titles
    xaxis_title="Título del Post",
    yaxis_title="Número de Preguntas",

    # Remove the white frame (spikes)
    showlegend=False,
    margin=dict(l=40, r=40, t=60, b=150),
)

# Add the average line
fig.add_shape(
    type="line",
    x0=0,  # Start at the left of the plot
    x1=1,  # End at the right of the plot
    y0=avg_questions,
    y1=avg_questions,
    xref="paper",
    yref="y",
    line=dict(color="red", dash="dash")
)

# Add annotation for the average line
fig.add_annotation(
    x=1,  # Position at the far right of the x-axis
    y=avg_questions,
    text=f"Promedio: {avg_questions:.0f}",
    showarrow=False,
    xanchor="right",
    yanchor="bottom",
    font=dict(color="red")
)

# Show the plot
fig.show()

In [35]:
# Step 2: Count 'CORRECT' and 'INCORRECT' in 'ai_answer_with_the_transcript_correctness'
with_transcript_counts = qa.group_by('ai_answer_with_the_transcript_correctness').agg(
    pl.len().alias("Count")
).rename({
    'ai_answer_with_the_transcript_correctness': "Correctness"
}).with_columns(pl.lit("With Transcript").alias("Source"))

with_transcript_counts

Correctness,Count,Source
str,u32,str
"""INCORRECT""",14,"""With Transcript"""
"""CORRECT""",63,"""With Transcript"""
"""PARTIAL""",3,"""With Transcript"""


In [99]:
import polars as pl

# Paso 1: Mostrar filas con 'quality_rating_for_answer_with_transcript' == 4.5
filas_45 = qa.filter(
    pl.col('quality_rating_for_answer_with_transcript') == 4.5
)
print("Filas con calificación 4.5:")
print(filas_45)

Filas con calificación 4.5:
shape: (1, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ question  ┆ human_ans ┆ ai_answer ┆ ai_answer ┆ … ┆ ai_answer ┆ quality_r ┆ post_url  ┆ file_nam │
│ ---       ┆ wer       ┆ _without_ ┆ _without_ ┆   ┆ _with_the ┆ ating_for ┆ ---       ┆ e        │
│ str       ┆ ---       ┆ the_trans ┆ transcrip ┆   ┆ _transcri ┆ _answer_w ┆ str       ┆ ---      │
│           ┆ str       ┆ cri…      ┆ t_c…      ┆   ┆ pt_…      ┆ ith…      ┆           ┆ str      │
│           ┆           ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆           ┆          │
│           ┆           ┆ str       ┆ str       ┆   ┆ str       ┆ f64       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ What was  ┆ The Trump ┆ The       ┆ CORRECT   ┆ … ┆ CORRECT   ┆ 4.5       ┆ https://w ┆ qualcomm │
│ the role  ┆ administr ┆ attempted ┆           ┆

In [102]:
import polars as pl

# Paso 1: Mostrar filas con 'quality_rating_for_answer_with_transcript' == 4.5
filas_45 = qa.filter(
    pl.col('quality_rating_for_answer_with_transcript') == 4.5
)
print("Filas con calificación 4.5:")
print(filas_45)

# Paso 2: Reemplazar 4.5 por 5 en 'quality_rating_for_answer_with_transcript'
qa = qa.with_columns(
    pl.when(pl.col('quality_rating_for_answer_with_transcript') == 4.5)
      .then(5)
      .otherwise(pl.col('quality_rating_for_answer_with_transcript'))
      .alias('quality_rating_for_answer_with_transcript')
)

# Paso 3: Verificar que ya no existen valores 4.5
filas_45_despues = qa.filter(
    pl.col('quality_rating_for_answer_with_transcript') == 4.5
)
print("Filas con calificación 4.5 después del cambio:")
print(filas_45_despues)

Filas con calificación 4.5:
shape: (1, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ question  ┆ human_ans ┆ ai_answer ┆ ai_answer ┆ … ┆ ai_answer ┆ quality_r ┆ post_url  ┆ file_nam │
│ ---       ┆ wer       ┆ _without_ ┆ _without_ ┆   ┆ _with_the ┆ ating_for ┆ ---       ┆ e        │
│ str       ┆ ---       ┆ the_trans ┆ transcrip ┆   ┆ _transcri ┆ _answer_w ┆ str       ┆ ---      │
│           ┆ str       ┆ cri…      ┆ t_c…      ┆   ┆ pt_…      ┆ ith…      ┆           ┆ str      │
│           ┆           ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆           ┆          │
│           ┆           ┆ str       ┆ str       ┆   ┆ str       ┆ f64       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ What was  ┆ The Trump ┆ The       ┆ CORRECT   ┆ … ┆ CORRECT   ┆ 4.5       ┆ https://w ┆ qualcomm │
│ the role  ┆ administr ┆ attempted ┆           ┆

In [106]:
import polars as pl

# Paso 1: Mostrar filas con 'quality_rating_for_answer_with_transcript' == 4.5
filas_45 = qa.filter(
    pl.col('quality_rating_for_answer_with_transcript') == 0
)
print("Filas con calificación 4.5:")
print(filas_45)

# Paso 2: Reemplazar 4.5 por 5 en 'quality_rating_for_answer_with_transcript'
qa = qa.with_columns(
    pl.when(pl.col('quality_rating_for_answer_with_transcript') == 0)
      .then(1)
      .otherwise(pl.col('quality_rating_for_answer_with_transcript'))
      .alias('quality_rating_for_answer_with_transcript')
)

# Paso 3: Verificar que ya no existen valores 4.5
filas_45_despues = qa.filter(
    pl.col('quality_rating_for_answer_with_transcript') == 0
)
print("Filas con calificación 4.5 después del cambio:")
print(filas_45_despues)

Filas con calificación 4.5:
shape: (2, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ question  ┆ human_ans ┆ ai_answer ┆ ai_answer ┆ … ┆ ai_answer ┆ quality_r ┆ post_url  ┆ file_nam │
│ ---       ┆ wer       ┆ _without_ ┆ _without_ ┆   ┆ _with_the ┆ ating_for ┆ ---       ┆ e        │
│ str       ┆ ---       ┆ the_trans ┆ transcrip ┆   ┆ _transcri ┆ _answer_w ┆ str       ┆ ---      │
│           ┆ str       ┆ cri…      ┆ t_c…      ┆   ┆ pt_…      ┆ ith…      ┆           ┆ str      │
│           ┆           ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆           ┆          │
│           ┆           ┆ str       ┆ str       ┆   ┆ str       ┆ f64       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ What      ┆ IBM       ┆ Sam       ┆ INCORRECT ┆ … ┆ INCORRECT ┆ 0.0       ┆ https://w ┆ walmart  │
│ seminar   ┆ Computer  ┆ Walton    ┆           ┆

In [110]:
qa.shape

(80, 9)

In [108]:
# Paso 2: Filtrar las filas que tienen transcript
qa_with_transcript = qa.filter(pl.col('ai_answer_with_the_transcript_correctness').is_not_null())

In [117]:
import polars as pl
import plotly.express as px

# Paso 1: Verificar los nombres de las columnas para asegurar que son correctos
print("Available Columns in DataFrame:")
print(qa.columns)

# Paso 2: Filtrar las filas que tienen transcript
qa_with_transcript = qa.filter(
    pl.col('ai_answer_with_the_transcript_correctness').is_not_null()
)

# Paso 3: Cambiar los valores 4.5 a 5 y convertir a entero
qa_with_transcript = qa_with_transcript.with_columns(
    pl.when(pl.col("quality_rating_for_answer_with_transcript") == 4.5)
      .then(5)
      .otherwise(pl.col("quality_rating_for_answer_with_transcript"))
      .cast(pl.Int64)
      .alias("quality_rating_for_answer_with_transcript")
)

# Paso 4: Extraer las listas de calificaciones y correctitud
ratings = qa_with_transcript["quality_rating_for_answer_with_transcript"].to_list()
correctness = qa_with_transcript['ai_answer_with_the_transcript_correctness'].to_list()

# Paso 5: Determinar el número de bins basado en valores únicos
unique_ratings = sorted(set(ratings))
nbins = len(unique_ratings)

# Paso 6: Crear el histograma con colores basados en la correctitud
fig = px.histogram(
    x=ratings,
    color=correctness,
    nbins=nbins,  # Cada bin representa un valor único
    title="Distribución de Calificación de Calidad Basada en Corrección",
    labels={
        "x": "Calificación de Calidad",
        "y": "Frecuencia",
        "color": "Corrección"
    },
    color_discrete_map={
        "INCORRECT": "red",
        "PARTIAL": "yellow",
        "CORRECT": "green",
    },
    text_auto=True
)

# Paso 7: Personalizar el layout
fig.update_layout(
    xaxis_title="Calificación de Calidad",
    yaxis_title="Frecuencia",
    bargap=0.2,  # Espacio entre barras
    height=600,   # Altura del gráfico en píxeles
    plot_bgcolor='rgba(0,0,0,0)',      # Fondo de la trama transparente
    paper_bgcolor='rgba(0,0,0,0)',     # Fondo del papel transparente
    margin=dict(l=40, r=40, t=60, b=40),  # Márgenes alrededor del gráfico
    font=dict(
        size=14,                    # Tamaño de la fuente
        color="Black"               # Color de la fuente
    )
)

# Paso 8: Quitar el marco blanco y ajustar estilo de ejes, fijar los ticks en enteros
fig.update_xaxes(
    showgrid=False,
    zeroline=False,
    tickmode='linear',
    dtick=1  # Intervalo de 1 en el eje X
)
fig.update_yaxes(showgrid=False, zeroline=False)

# Paso 9: Mostrar el gráfico
fig.show()

Available Columns in DataFrame:
['question', 'human_answer', 'ai_answer_without_the_transcript', 'ai_answer_without_transcript_correctness', 'ai_answer_with_the_transcript', 'ai_answer_with_the_transcript_correctness', 'quality_rating_for_answer_with_transcript', 'post_url', 'file_name']


In [119]:
import polars as pl
import plotly.express as px

# Step 1: Verify the column names to ensure they are correct
print("Available Columns in DataFrame:")
print(qa.columns)

# Step 2: Ensure there are no leading/trailing whitespaces in column names
# (Adjust the column names here if there are any discrepancies)
# qa = qa.rename({
#     'ai_answer_without_the_transcript_correctness': 'ai_answer_without_transcript_correctness',
#     'ai_answer_with_the_transcript_correctness': 'ai_answer_with_the_transcript_correctness'
# })

# Step 3: Confirm the renaming
print("\nUpdated Columns in DataFrame:")
print(qa.columns)

# Step 4: Count 'CORRECT' and 'INCORRECT' in 'ai_answer_without_the_transcript_correctness'
without_transcript_counts = qa.group_by('ai_answer_without_transcript_correctness').agg(
    pl.len().alias("Count")
).rename({
    'ai_answer_without_transcript_correctness': "Correctness"
}).with_columns(pl.lit("Sin Transcripción").alias("Fuente"))

# Step 5: Count 'CORRECT' and 'INCORRECT' in 'ai_answer_with_the_transcript_correctness'
with_transcript_counts = qa.group_by('ai_answer_with_the_transcript_correctness').agg(
    pl.len().alias("Count")
).rename({
    'ai_answer_with_the_transcript_correctness': "Correctness"
}).with_columns(pl.lit("Con Transcripción").alias("Fuente"))

# Step 6: Combine both counts into a single DataFrame
combined_counts = pl.concat([without_transcript_counts, with_transcript_counts])

# Display the combined counts to verify
print("\nCombined Counts:")
print(combined_counts)

# Step 7: Calculate the total counts per Source for percentage calculation
total_per_source = combined_counts.group_by("Fuente").agg(
    pl.sum("Count").alias("Total")
)

# Step 8: Join the total counts back to the combined_counts and calculate percentages
combined_with_percentage = combined_counts.join(total_per_source, on="Fuente").with_columns(
    (pl.col("Count") / pl.col("Total") * 100).round(2).alias("Percentage")
)

# Display the DataFrame with percentages to verify
print("\nCombined Counts with Percentages:")
print(combined_with_percentage)

# Step 9: Sort the sources based on Correctness Percentage (from less to more)
# Extract sources sorted by 'CORRECT' percentage
source_order = combined_with_percentage.filter(
    pl.col("Correctness") == "CORRECT"
).sort("Percentage")["Fuente"].to_list()

# Step 10: Convert the combined Polars DataFrame with percentages to a dictionary for Plotly
data_dict = combined_with_percentage.select(["Fuente", "Correctness", "Percentage"]).to_dict(as_series=False)

# Step 11: Create a grouped bar plot to visualize the percentage distribution
fig = px.bar(
    data_dict,
    x="Fuente",
    y="Percentage",
    color="Correctness",
    barmode='group',
    title="Distribución de Respuestas Correctas e Incorrectas por Fuente (%)",
    labels={
        "Source": "Fuente de la Respuesta",
        "Percentage": "Porcentaje de Respuestas",
        "Correctness": "Corrección"
    },
    text="Percentage",  # Adding percentage labels on the bars
    #category_order={"Source": source_order}  # Sorting sources from less to more
    color_discrete_map={
        "CORRECT": "green",
        "PARTIAL": "yellow",
        "INCORRECT": "red",
    }
)

# Step 12: Customize the layout for better readability
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',          # Transparent paper background
    plot_bgcolor='rgba(0,0,0,0)',           # Transparent plot background
    xaxis_tickangle=0,                    # Rotate x-axis labels for better readability
    xaxis_title="Fuente de la Respuesta",    # X-axis title in Spanish
    yaxis_title="Porcentaje de Respuestas",   # Y-axis title in Spanish
    showlegend=True,                         # Show legend
    margin=dict(l=40, r=40, t=60, b=40),      # Adjust margins
    height=600                                # Set plot height
)

# Step 13: Add percentage labels on the bars for clarity
fig.update_traces(texttemplate='%{text}%', textposition='inside')

# Step 14: Show the plot
fig.show()

Available Columns in DataFrame:
['question', 'human_answer', 'ai_answer_without_the_transcript', 'ai_answer_without_transcript_correctness', 'ai_answer_with_the_transcript', 'ai_answer_with_the_transcript_correctness', 'quality_rating_for_answer_with_transcript', 'post_url', 'file_name']

Updated Columns in DataFrame:
['question', 'human_answer', 'ai_answer_without_the_transcript', 'ai_answer_without_transcript_correctness', 'ai_answer_with_the_transcript', 'ai_answer_with_the_transcript_correctness', 'quality_rating_for_answer_with_transcript', 'post_url', 'file_name']

Combined Counts:
shape: (6, 3)
┌─────────────┬───────┬───────────────────┐
│ Correctness ┆ Count ┆ Fuente            │
│ ---         ┆ ---   ┆ ---               │
│ str         ┆ u32   ┆ str               │
╞═════════════╪═══════╪═══════════════════╡
│ CORRECT     ┆ 33    ┆ Sin Transcripción │
│ PARTIAL     ┆ 5     ┆ Sin Transcripción │
│ INCORRECT   ┆ 42    ┆ Sin Transcripción │
│ INCORRECT   ┆ 14    ┆ Con Transcripci

In [12]:
# import plotly.express as px

# # Convert to dictionary for Plotly
# question_counts_dict = question_counts.to_dict(as_series=False)

# # Create the bar plot
# fig = px.bar(question_counts_dict, x="post_title", y="question_count", title="Distribución de Preguntas por Post")

# # Customize the layout
# fig.update_layout(
#     xaxis_title="Título del Post",
#     yaxis_title="Número de Preguntas",
#     xaxis_tickangle=-45
# )

# # Show the plot
# fig.show()

In [13]:
# Load and process podcasts
podcasts_clean = load_and_process_podcasts(csv_path, text_folder)

2024-12-12 12:59:58,657 - INFO - Loaded 275 podcasts from /Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired_metadata.csv
Processing Podcasts: 100%|██████████| 275/275 [00:20<00:00, 13.12it/s]
2024-12-12 13:00:19,660 - INFO - Processed 200 podcasts with transcripts


In [127]:
import polars as pl

# Paso 1: Verificar las columnas en ambos DataFrames
print("Columnas en 'podcasts_clean':")
print(podcasts_clean.columns)

print("\nColumnas en 'qa':")
print(qa.columns)

# Verificar si 'file_name' existe en ambos DataFrames
if 'file_name' not in podcasts_clean.columns:
    raise ValueError("La columna 'file_name' no existe en 'podcasts_clean'.")
if 'file_name' not in qa.columns:
    raise ValueError("La columna 'file_name' no existe en 'qa'.")

# Paso 2: Filtrar 'podcasts_clean' basado en 'file_name' presente en 'qa' utilizando Semi Join
podcasts_with_qa = podcasts_clean.join(
    qa.select('file_name').unique(),
    on='file_name',
    how='semi'
)

# Paso 3: Verificar el resultado del filtrado
count_filtered = podcasts_with_qa.height
print(f"\nNúmero de filas que tienen 'qa': {count_filtered}")

print("\nEjemplo de filas filtradas:")
podcasts_with_qa

Columnas en 'podcasts_clean':
['post_url', 'post_title', 'series_number', 'blog_date', 'blog_title', 'file_name', 'has_transcript', 'text', 'cleaned_text', 'tokens']

Columnas en 'qa':
['question', 'human_answer', 'ai_answer_without_the_transcript', 'ai_answer_without_transcript_correctness', 'ai_answer_with_the_transcript', 'ai_answer_with_the_transcript_correctness', 'quality_rating_for_answer_with_transcript', 'post_url', 'file_name']

Número de filas que tienen 'qa': 27

Ejemplo de filas filtradas:


post_url,post_title,series_number,blog_date,blog_title,file_name,has_transcript,text,cleaned_text,tokens
str,str,str,date,str,str,bool,str,str,i64
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""costco""",true,"""Transcript: (disclaimer: may…","""I don't think I have ever been…",37417
"""https://www.acquired.fm/episod…","""Nvidia Part III: The Dawn of t…","""Season 13, Episode 3""",2023-09-05,"""The Complete History & Strateg…","""nvidia_part_iii_the_dawn_of_th…",true,"""Transcript: (disclaimer: may…","""Do you like my Bucks T-shirt? …",35198
"""https://www.acquired.fm/episod…","""Visa""","""Season 13, Episode 4""",2023-11-26,"""The Complete History & Strateg…","""visa""",true,"""Transcript: (disclaimer: may…","""It's funny. When we picked thi…",45242
"""https://www.acquired.fm/episod…","""Renaissance Technologies""","""Season 14, Episode 3""",2024-03-17,"""The Complete History & Strateg…","""renaissance_technologies""",true,"""Transcript: (disclaimer: may…","""I always used to misspell Rena…",39313
"""https://www.acquired.fm/episod…","""Porsche (with Doug DeMuro)""","""Season 12, Episode 6""",2023-06-26,"""The Complete History & Strateg…","""porsche_with_doug_demuro""",true,"""Transcript: (disclaimer: may…","""It's definitely por-shuh. Por-…",43266
…,…,…,…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""Airbnb""","""Season 7, Episode 8""",2020-12-10,"""‍""","""airbnb""",true,"""Transcript: (disclaimer: may…","""Welcome to season 7, episode 8…",31354
"""https://www.acquired.fm/episod…","""SpaceX""","""Season 6, Episode 7""",2020-05-26,"""Related Episodes""","""spacex""",true,"""Transcript: (disclaimer: may…","""Welcome to Season Six, Episode…",29261
"""https://www.acquired.fm/episod…","""Disney, Plus""","""Season 5, Episode 7""",2019-11-25,"""Related Episodes""","""disney_plus""",true,"""Transcript: (disclaimer: may…","""Disney makes it approachable, …",25734
"""https://www.acquired.fm/episod…","""WhatsApp""","""Season 6, Episode 1""",2020-01-28,"""Related Episodes""","""whatsapp""",true,"""Transcript: (disclaimer: may…","""I do have to say, that based o…",21434


In [None]:
# Verify the DataFrame
podcasts_clean

post_url,post_title,series_number,blog_date,blog_title,file_name,has_transcript,text,cleaned_text,tokens
str,str,str,date,str,str,bool,str,str,i64
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""costco""",true,"""Transcript: (disclaimer: may…","""I don't think I have ever been…",37417
"""https://www.acquired.fm/episod…","""Generative AI in Video and the…","""ACQ2 Episode""",2023-08-29,"""Related Episodes""","""generative_ai_in_video_and_the…",true,"""Transcript: (disclaimer: may…","""Hello, Acquired listeners. Wel…",11939
"""https://www.acquired.fm/episod…","""Nvidia Part III: The Dawn of t…","""Season 13, Episode 3""",2023-09-05,"""The Complete History & Strateg…","""nvidia_part_iii_the_dawn_of_th…",true,"""Transcript: (disclaimer: may…","""Do you like my Bucks T-shirt? …",35198
"""https://www.acquired.fm/episod…","""Doug Demuro on Analyzing the C…","""ACQ2 Episode""",2023-09-17,"""Related Episodes""","""doug_demuro_on_analyzing_the_c…",true,"""Transcript: (disclaimer: may…","""Doug DeMuro, it's great to see…",21593
"""https://www.acquired.fm/episod…","""NVIDIA CEO Jensen Huang""","""ACQ2 Episode""",2023-10-15,"""Related Episodes""","""nvidia_ceo_jensen_huang""",true,"""Transcript: (disclaimer: may…","""I will say, David, I would lov…",18077
…,…,…,…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""ExactTarget (acquired by Sales…","""Season 1, Episode 15""",2016-07-05,"""Related Episodes""","""exacttarget_acquired_by_salesf…",true,"""Transcript: (disclaimer: may…","""This is going to be a great ep…",13883
"""https://www.acquired.fm/episod…","""Midroll + Stitcher (acquired b…","""Season 1, Episode 16""",2016-07-12,"""Related Episodes""","""midroll_stitcher_acquired_by_s…",true,"""Transcript: (disclaimer: may…","""We'd like to thank our one lis…",11059
"""https://www.acquired.fm/episod…","""Waze""","""Season 1, Episode 17""",2016-08-03,"""Related Episodes""","""waze""",true,"""Transcript: (disclaimer: may…","""Welcome to Episode 17 of Acqui…",11693
"""https://www.acquired.fm/episod…","""Special‚ An Acquirer's View in…","""Season 1, Episode 18""",2016-08-22,"""Related Episodes""","""special_an_acquirers_view_into…",true,"""Transcript: (disclaimer: may…","""Welcome to Episode of 18 of Ac…",13075


In [15]:
# Save dataframe to disk as Parquet
podcasts_clean.write_parquet("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_clean.parquet")

In [145]:
# Create a histogram of the token count distribution using Plotly
fig = px.histogram(
    podcasts_clean,
    x="tokens",
    nbins=25,  
    title="Distribución de tokens en Podcasts con QA",
    labels={"tokens": "Token count"},
)

# # Customize the layout
# fig.update_layout(
#     xaxis_title="Número de Tokens",
#     yaxis_title="Frecuencia",
#     bargap=0.2,
# )

fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',          # Transparent paper background
    plot_bgcolor='rgba(0,0,0,0)',           # Transparent plot background
    xaxis_tickangle=0,  
    bargap=0.1,                  # Rotate x-axis labels for better readability
    xaxis_title="Número de Tokens",    # X-axis title in Spanish
    yaxis_title="Frecuencia",   # Y-axis title in Spanish
    margin=dict(l=40, r=40, t=60, b=40),      # Adjust margins
    height=600                                # Set plot height
)
# Show the plot
fig.show()

In [164]:
import polars as pl
import plotly.express as px

# Paso 1: Verificar los nombres de las columnas para asegurar que son correctos
print("Available Columns in DataFrame:")
print(podcasts_clean.columns)

# Paso 2: Filtrar las filas que tienen QA
podcasts_with_qa = podcasts_clean.join(
    qa.select('file_name').unique(),
    on='file_name',
    how='semi'
)

# Paso 3: Calcular la media de los tokens
mean_tokens = podcasts_with_qa["tokens"].median()
print(f"Media de tokens: {mean_tokens:.0f}")

# Paso 4: Crear el histograma con colores basados en la correctitud
fig = px.histogram(
    podcasts_with_qa,
    x="tokens",
    nbins=5,  
    title="Distribución de tokens en podcasts con transcripción",
    labels={"tokens": "Token count"},
)

# Paso 5: Personalizar el layout
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',          # Fondo transparente
    plot_bgcolor='rgba(0,0,0,0)',           # Fondo de la trama transparente
    xaxis_tickangle=0,  
    bargap=0.1,                  # Espacio entre barras
    xaxis_title="Número de Tokens",    # Título eje X en español
    yaxis_title="Frecuencia",   # Título eje Y en español
    margin=dict(l=40, r=40, t=60, b=40),      # Márgenes alrededor del gráfico
    height=600                                # Altura del gráfico en píxeles
)

# Paso 6: Añadir línea vertical roja con guiones y etiqueta de la media
fig.add_vline(
    x=mean_tokens,
    line=dict(color='red', dash='dash'),
    annotation=dict(
        text=f"Mediana: {mean_tokens:.2f}",
        showarrow=False,
        x=mean_tokens,
        y=0.95,
        yref='paper',
        xanchor='left',
        font=dict(color='red')
    )
)

# Paso 7: Quitar el marco blanco y ajustar estilo de ejes
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)

# Paso 8: Mostrar el gráfico
fig.show()

Available Columns in DataFrame:
['post_url', 'post_title', 'series_number', 'blog_date', 'blog_title', 'file_name', 'has_transcript', 'text', 'cleaned_text', 'tokens']
Media de tokens: 34747
