In [1]:
# Standard Library Imports
import logging
import os
import sys
from pathlib import Path

# Third-Party Imports
import polars as pl
from tqdm import tqdm
from dotenv import load_dotenv
import plotly.express as px

# Local Imports
from grag.text_utils import load_and_process_podcasts

In [2]:
# Load environment variables (python-dotenv)
load_dotenv()

# Root logging setup: INFO and above, rendered as "<timestamp> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used for this notebook's own messages
logger = logging.getLogger(__name__)

In [3]:
# Define paths
# Every input lives under a single project root. The default is the original
# checkout location, so behaviour is unchanged there; set GRAG_PROJECT_ROOT
# (in the shell, or in the .env file read by load_dotenv()) to run elsewhere.
PROJECT_ROOT = Path(
    os.environ.get(
        "GRAG_PROJECT_ROOT",
        "/Users/borja/Documents/Somniumrema/projects/genai/grag",
    )
)
DATA_DIR = PROJECT_ROOT / "data"

# Metadata CSV, kept as a plain string exactly as before
csv_path = str(DATA_DIR / "acquired_metadata.csv")

# Transcript folder (the directory name is nested twice on disk), kept as a Path
text_folder = DATA_DIR / "acquired-individual-transcripts" / "acquired-individual-transcripts"

In [4]:
# Load and process podcasts
# Builds the podcast table from the metadata CSV and the transcript folder.
# This is the slow step of the notebook: the recorded run took roughly 24 s
# for 275 metadata rows, 200 of which were logged as processed with transcripts.
# NOTE(review): how rows without a transcript are handled is decided inside
# load_and_process_podcasts — confirm there before relying on the row count.
podcasts_clean = load_and_process_podcasts(csv_path, text_folder)

2024-11-18 15:29:42,536 - INFO - Loaded 275 podcasts from /Users/borja/Documents/Somniumrema/projects/genai/grag/data/acquired_metadata.csv
Processing Podcasts: 100%|██████████| 275/275 [00:23<00:00, 11.56it/s]
2024-11-18 15:30:06,366 - INFO - Processed 200 podcasts with transcripts


In [5]:
# Verify the DataFrame
# Fail fast if processing produced no rows or lost the column that the
# token-distribution histogram needs, rather than erroring in a later cell.
assert podcasts_clean.height > 0, "No podcasts were loaded; check csv_path and text_folder"
assert "tokens" in podcasts_clean.columns, "Expected a 'tokens' column in podcasts_clean"

# Display the table as the cell output for a visual check
podcasts_clean

post_url,post_title,series_number,blog_date,blog_title,file_name,has_transcript,text,cleaned_text,tokens
str,str,str,date,str,str,bool,str,str,i64
"""https://www.acquired.fm/episod…","""Costco""","""Season 13, Episode 2""",2023-08-20,"""The Complete History & Strateg…","""costco""",true,"""Transcript: (disclaimer: may…","""I don't think I have ever been…",37417
"""https://www.acquired.fm/episod…","""Generative AI in Video and the…","""ACQ2 Episode""",2023-08-29,"""Related Episodes""","""generative_ai_in_video_and_the…",true,"""Transcript: (disclaimer: may…","""Hello, Acquired listeners. Wel…",11939
"""https://www.acquired.fm/episod…","""Nvidia Part III: The Dawn of t…","""Season 13, Episode 3""",2023-09-05,"""The Complete History & Strateg…","""nvidia_part_iii_the_dawn_of_th…",true,"""Transcript: (disclaimer: may…","""Do you like my Bucks T-shirt? …",35198
"""https://www.acquired.fm/episod…","""Doug Demuro on Analyzing the C…","""ACQ2 Episode""",2023-09-17,"""Related Episodes""","""doug_demuro_on_analyzing_the_c…",true,"""Transcript: (disclaimer: may…","""Doug DeMuro, it's great to see…",21593
"""https://www.acquired.fm/episod…","""NVIDIA CEO Jensen Huang""","""ACQ2 Episode""",2023-10-15,"""Related Episodes""","""nvidia_ceo_jensen_huang""",true,"""Transcript: (disclaimer: may…","""I will say, David, I would lov…",18077
…,…,…,…,…,…,…,…,…,…
"""https://www.acquired.fm/episod…","""ExactTarget (acquired by Sales…","""Season 1, Episode 15""",2016-07-05,"""Related Episodes""","""exacttarget_acquired_by_salesf…",true,"""Transcript: (disclaimer: may…","""This is going to be a great ep…",13883
"""https://www.acquired.fm/episod…","""Midroll + Stitcher (acquired b…","""Season 1, Episode 16""",2016-07-12,"""Related Episodes""","""midroll_stitcher_acquired_by_s…",true,"""Transcript: (disclaimer: may…","""We'd like to thank our one lis…",11059
"""https://www.acquired.fm/episod…","""Waze""","""Season 1, Episode 17""",2016-08-03,"""Related Episodes""","""waze""",true,"""Transcript: (disclaimer: may…","""Welcome to Episode 17 of Acqui…",11693
"""https://www.acquired.fm/episod…","""Special‚ An Acquirer's View in…","""Season 1, Episode 18""",2016-08-22,"""Related Episodes""","""special_an_acquirers_view_into…",true,"""Transcript: (disclaimer: may…","""Welcome to Episode of 18 of Ac…",13075


In [6]:
# Save dataframe to disk as Parquet
output_path = Path("/Users/borja/Documents/Somniumrema/projects/genai/grag/pipeline_outcomes/podcasts_clean.parquet")

# Make sure the output folder exists first: a missing folder would otherwise
# fail the write after the slow processing step has already run.
output_path.parent.mkdir(parents=True, exist_ok=True)
podcasts_clean.write_parquet(output_path)

In [7]:
# Token-count distribution across the podcast transcripts, as a Plotly histogram.
# Axis titles and bar spacing are applied in the same chained expression.
fig = px.histogram(
    podcasts_clean,
    x="tokens",
    nbins=100,
    title="Distribution of tokens in the podcast transcripts",
    labels={"tokens": "Token count"},
).update_layout(
    xaxis_title="Token count",
    yaxis_title="Frequency",
    bargap=0.2,
)

# Render the figure
fig.show()