In [25]:
import json
import re
import os

import pandas as pd

def timestamps_to_seconds(timestamps_str):
    """Return each ``(HH:MM:SS)`` marker in the text as a number of seconds.

    Every parenthesised ``digits:digits:digits`` group found in
    ``timestamps_str`` is converted, in order of appearance, to
    ``hours * 3600 + minutes * 60 + seconds``. Text that does not match
    that pattern is ignored, so a string without markers yields ``[]``.
    """
    return [
        int(hh) * 3600 + int(mm) * 60 + int(ss)
        for hh, mm, ss in re.findall(r'\((\d+):(\d+):(\d+)\)', timestamps_str)
    ]

# Directory that receives one JSON file per processed episode.
directory = "transcript_data"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(directory, exist_ok=True)

# Load JSON data. The encoding is given explicitly so the result does not
# depend on the platform's default text encoding.
with open('RAW_UU2D2CMWXMOVWx7giW1n3LIg_youtube_videos.json', 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Load RSS data from CSV
rss_data = pd.read_csv('rss.csv')

def _chunk_transcript(transcript_entries, chunk_starts):
    """Group transcript entries into one text chunk per timestamp.

    Args:
        transcript_entries: Sequence of dicts, each with a numeric
            ``'start'`` and a string ``'text'``. The entries are assumed to
            be ordered by ``'start'``.
        chunk_starts: Chunk start times in seconds, assumed ascending.

    Returns:
        A list of ``{"timestamp": start, "text": text}`` dicts. An entry
        belongs to the chunk whose start is the latest one not after the
        entry's own ``'start'``; the last chunk takes everything that
        remains. Entries that begin before the first start are left out,
        and chunks that end up with no text are omitted.
    """
    if not chunk_starts:
        return []

    # Each chunk ends where the next one begins; the last one is open-ended.
    chunk_ends = list(chunk_starts[1:]) + [float('inf')]

    total = len(transcript_entries)
    cursor = 0

    # Skip whatever precedes the first timestamp. The bounds check keeps this
    # safe for an empty transcript or one that ends before the first timestamp.
    while cursor < total and transcript_entries[cursor]['start'] < chunk_starts[0]:
        cursor += 1

    chunks = []
    for chunk_start, chunk_end in zip(chunk_starts, chunk_ends):
        pieces = []
        # Test the entry that is about to be consumed (not the previous one),
        # so an entry on a boundary lands in the chunk it actually starts in.
        while cursor < total and transcript_entries[cursor]['start'] < chunk_end:
            pieces.append(transcript_entries[cursor]['text'].replace('\n', ' ').strip())
            cursor += 1

        # Every chunk, including the last, gets the same newline cleanup and
        # single-space separation.
        chunk_text = ' '.join(pieces)
        if chunk_text:
            chunks.append({
                "timestamp": chunk_start,
                "text": chunk_text
            })
    return chunks


# Iterate over each podcast entry
for index, row in rss_data.iterrows():

    # Skip episodes without a transcript or without a timestamp listing.
    if json_data['Transcript'][str(index)] == 'N/A' or pd.isna(row['Timestamps']):
        continue

    # Extract relevant information for the current podcast entry
    title = row['Title']
    episode_num = index  # Assuming the index serves as episode number
    link = json_data['Video URL'][str(index)]
    timestamps = timestamps_to_seconds(row['Timestamps'])

    episode_summary = row['Summary']

    # The transcript is stored as a JSON-encoded string inside the JSON data.
    transcript_data = json.loads(json_data['Transcript'][str(index)])

    # An empty timestamp list produces no chunks.
    chunks = _chunk_transcript(transcript_data, timestamps)

    # Create JSON object for the current podcast entry
    podcast_entry = {
        "title": title,
        "ep_num": str(episode_num),
        "link": link,
        "chunks": chunks,
        "episode_summary": episode_summary
    }
    # Build the file name from the title: spaces become underscores and
    # slashes are dropped so the title cannot introduce a path separator.
    dataTitle = title.replace(" ", "_").replace('/', '')
    # Save the JSON object to a separate file
    with open(os.path.join(directory, f'{dataTitle}.json'), 'w') as output_file:
        json.dump(podcast_entry, output_file, indent=4)
