## Transcribe and summarize voice messages

In [2]:
from pathlib import Path

In [3]:
directory = Path(r'C:\Users\alexe\Downloads\Telegram Desktop\ChatExport_2024-09-02\voice_messages')

In [26]:
# All exported voice messages, ordered by file name
files = sorted(directory.glob('*.ogg'))

In [27]:
f = files[0]

In [4]:
from openai import OpenAI
client = OpenAI()

In [31]:
results = {}

In [5]:
from tqdm.auto import tqdm

In [34]:
# Transcribe every voice message with whisper-1 and store the text by file name.
for f in tqdm(files):
    # Resume support: a name already present in `results` was transcribed by an
    # earlier (possibly interrupted) run of this cell, so don't send it again.
    if f.name in results:
        continue

    with f.open('rb') as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
        )

    text = transcription.text
    print(f.name, text[:100])  # short preview to follow progress
    results[f.name] = text

  0%|          | 0/200 [00:00<?, ?it/s]

audio_100@06-08-2024_11-40-47.ogg Another thing we should improve is, so right now all our process documents focus on how to do someth
audio_101@06-08-2024_11-41-16.ogg So first, we want to understand why we need to do this. So why is our community members want to have
audio_102@06-08-2024_11-41-41.ogg Then another thing is when to do this, so you do this every month when there is a reminder and this 
audio_103@06-08-2024_11-42-09.ogg And also what I just said about that process documents should be complete and that the process docum
audio_104@06-08-2024_11-43-24.ogg So I don't do to-do for this one. I don't know if you want me to do to edit to-do or maybe you want 
audio_105@06-08-2024_12-01-20.ogg Yeah, that's right try to use something very close to what I said
audio_106@06-08-2024_12-10-34.ogg One of the things you can do is download the voice messages that I sent. I don't know if you can do 
audio_107@06-08-2024_12-11-58.ogg Jā, tas ir ļoti kļūti treneris. Vēlētos vēlētos vēlētos.

In [36]:
import pickle

In [37]:
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

In [40]:
import re

def sort_audio_logs(log_dict):
    """Return the items of ``log_dict`` ordered by the number in the file name.

    The number is taken from the ``audio_<number>@`` part of each key and
    compared numerically, so ``audio_2@...`` comes before ``audio_10@...``.

    Args:
        log_dict: mapping of file name -> transcribed text.

    Returns:
        List of ``(filename, text)`` tuples. Entries whose key does not contain
        ``audio_<number>@`` are placed after all numbered entries, in their
        original relative order, instead of raising ``AttributeError``.
    """
    def sort_key(item):
        match = re.search(r'audio_(\d+)@', item[0])
        if match is None:
            # (True, ...) sorts after every (False, n): unnumbered names go last
            return (True, 0)
        return (False, int(match.group(1)))

    # sorted() is stable, so entries with equal keys keep their input order
    return sorted(log_dict.items(), key=sort_key)

In [41]:
sorted_logs = sort_audio_logs(results)

In [61]:
import re
import pandas as pd
from datetime import datetime, timedelta


In [62]:

def create_audio_log_dataframe(log_list):
    """Build a DataFrame with one row per transcribed voice message.

    Args:
        log_list: iterable of ``(filename, text)`` pairs. File names that
            contain ``audio_<number>@`` and end in
            ``@<dd-mm-YYYY_HH-MM-SS>.ogg`` have those parts parsed.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``number``,
        ``datetime`` and ``text`` (whitespace-stripped). ``number`` and
        ``datetime`` are ``None`` for file names that do not match the
        expected pattern. The columns are present even for empty input, so
        later steps can rely on them.
    """
    columns = ['filename', 'number', 'datetime', 'text']
    data = []
    for filename, text in log_list:
        # Running number of the message, e.g. 100 in "audio_100@..."
        number_match = re.search(r'audio_(\d+)@', filename)
        number = int(number_match.group(1)) if number_match else None

        # Recording timestamp encoded in the file name after the "@"
        datetime_match = re.search(r'@(\d{2}-\d{2}-\d{4}_\d{2}-\d{2}-\d{2})\.ogg', filename)
        if datetime_match:
            datetime_str = datetime_match.group(1)
            dt = datetime.strptime(datetime_str, '%d-%m-%Y_%H-%M-%S')
        else:
            dt = None

        data.append({
            'filename': filename,
            'number': number,
            'datetime': dt,
            'text': text.strip()
        })

    # Passing the columns explicitly keeps the schema stable when `data` is empty
    return pd.DataFrame(data, columns=columns)

In [63]:
df = create_audio_log_dataframe(results.items())

In [64]:
df = df.sort_values(by='number')

In [65]:
df

Unnamed: 0,filename,number,datetime,text
110,audio_1@04-07-2023_10-06-31.ogg,1,2023-07-04 10:06:31,Maria from YLABS is asking about the USD accou...
122,audio_2@23-08-2023_13-35-55.ogg,2,2023-08-23 13:35:55,"Hey, I just added, I just closed one of the ne..."
133,audio_3@20-11-2023_15-54-31.ogg,3,2023-11-20 15:54:31,"So what I was going to write here is, no, we u..."
144,audio_4@20-11-2023_15-55-06.ogg,4,2023-11-20 15:55:06,But you can tell here that we will tag weights...
155,audio_5@27-11-2023_15-30-31.ogg,5,2023-11-27 15:30:31,"So, I know that right now for Luma we use squa..."
...,...,...,...,...
105,audio_196@29-08-2024_20-19-30.ogg,196,2024-08-29 20:19:30,I just noticed that you added the ParadeDB vid...
106,audio_197@31-08-2024_09-52-31.ogg,197,2024-08-31 09:52:31,yeah when you download the files so you don't ...
107,audio_198@31-08-2024_09-52-35.ogg,198,2024-08-31 09:52:35,Is it not mentioned in the process documents?
108,audio_199@02-09-2024_09-58-02.ogg,199,2024-09-02 09:58:02,"So the process is, you need to create a copy, ..."


In [66]:
df.to_csv('data/transcribed.csv', index=False)

In [67]:
df.text.str.len()

110    1026
122     236
133     638
144     178
155     591
       ... 
105     768
106     161
107      45
108     550
111     163
Name: text, Length: 200, dtype: int64

In [68]:
def group_close_recordings(df, max_gap=timedelta(minutes=1)):
    """Merge voice messages that were recorded shortly after one another.

    Rows are ordered by ``datetime``. A row joins the current group when it was
    recorded at most ``max_gap`` after the previous row of that group, so a
    chain of closely spaced messages ends up in a single group.

    Args:
        df: DataFrame with a ``datetime`` column plus the ``filename``,
            ``number`` and ``text`` columns that ``process_group`` reads.
        max_gap: largest pause allowed between two consecutive recordings of
            the same group. Defaults to one minute.

    Returns:
        pandas.DataFrame with one row per group, built from the dicts returned
        by ``process_group``.
    """
    df = df.sort_values('datetime')
    grouped_data = []
    current_group = []

    for _, row in df.iterrows():
        # The gap is measured against the most recent row of the group,
        # not against the row that opened it.
        if not current_group or (row['datetime'] - current_group[-1]['datetime'] <= max_gap):
            current_group.append(row)
        else:
            grouped_data.append(process_group(current_group))
            current_group = [row]

    # Flush the group that was still open when the loop ended
    if current_group:
        grouped_data.append(process_group(current_group))

    return pd.DataFrame(grouped_data)


def process_group(group):
    """Collapse a list of rows into a single record.

    File names and numbers are comma-joined, texts are joined with a single
    space, and the datetime of the first row stands for the whole group.
    """
    filenames = ','.join(entry['filename'] for entry in group)
    numbers = ','.join(str(entry['number']) for entry in group)
    first_datetime = group[0]['datetime']
    combined_text = ' '.join(entry['text'] for entry in group)

    merged = {}
    merged['filename'] = filenames
    merged['number'] = numbers
    merged['datetime'] = first_datetime
    merged['text'] = combined_text
    return merged


In [69]:
df_grouped = group_close_recordings(df)

In [71]:
df_grouped.to_csv('data/transcribed-groups.csv', index=False)

In [73]:
df_grouped

Unnamed: 0,filename,number,datetime,text
0,audio_1@04-07-2023_10-06-31.ogg,1,2023-07-04 10:06:31,Maria from YLABS is asking about the USD accou...
1,audio_2@23-08-2023_13-35-55.ogg,2,2023-08-23 13:35:55,"Hey, I just added, I just closed one of the ne..."
2,"audio_3@20-11-2023_15-54-31.ogg,audio_4@20-11-...",34,2023-11-20 15:54:31,"So what I was going to write here is, no, we u..."
3,audio_5@27-11-2023_15-30-31.ogg,5,2023-11-27 15:30:31,"So, I know that right now for Luma we use squa..."
4,audio_6@09-12-2023_17-35-55.ogg,6,2023-12-09 17:35:55,"Hey, Francis, what would be convenient if you ..."
...,...,...,...,...
144,audio_194@27-08-2024_13-25-05.ogg,194,2024-08-27 13:25:05,I just forwarded you a message for a newslette...
145,audio_195@27-08-2024_13-34-49.ogg,195,2024-08-27 13:34:49,"Also, I want you to focus this week on getting..."
146,audio_196@29-08-2024_20-19-30.ogg,196,2024-08-29 20:19:30,I just noticed that you added the ParadeDB vid...
147,"audio_197@31-08-2024_09-52-31.ogg,audio_198@31...",197198,2024-08-31 09:52:31,yeah when you download the files so you don't ...


In [6]:
def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply content."""
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model='gpt-4o-mini', messages=messages)

    first_choice = response.choices[0]
    return first_choice.message.content

In [100]:
prompt_template = """
You are a professional transcript editor. We give you a voice note transcript,
and you need to edit it, so it reads naturally, clearly, and it's structured and concise. 

For each new thing discussed in the transcript, create a new paragraph.

Start the text with a title. Don't include any markup.

Transcript: {transcript}

Output format:

Title

Edited text
""".strip()

In [101]:
transcript = df_grouped.iloc[-1].text

In [102]:
prompt = prompt_template.format(transcript=transcript)

In [103]:
prompt

"You are a professional transcript editor. We give you a voice note transcript,\nand you need to edit it, so it reads naturally, clearly, and it's structured and concise. \n\nFor each new thing discussed in the transcript, create a new paragraph.\n\nStart the text with a title. Don't include any markup.\n\nTranscript: So the process is, you need to create a copy, like a special, we have a template, a document. You create a copy and then you send them and they fill it in. I think I forwarded you an email a while ago from Francis, so check how he did, so he created a copy and then he also included a PDF document and I asked you to upload this PDF document somewhere and instead of sending the PDF, send the link. Right now you can just send the PDF. Do this as soon as possible because we should have done this last week and they're still waiting for our response. and so in these cases when you are not sure what to answer please let me know as soon as possible do not wait for one week when I

In [104]:
result = llm(prompt)

In [105]:
print(result)

Process Overview for Document Submission

To initiate the process, you need to create a copy of a specific template document. After creating the copy, send it to the recipients so that they can fill it in. I previously forwarded you an email from Francis as an example, which you should review to understand his approach. He not only created the copy but also included a PDF document.

Once you have the PDF ready, please upload it to a designated location and share the link instead of sending the PDF directly. It's important that you complete this task as soon as possible since we should have addressed this last week, and the recipients are still waiting for our response.

In cases where you're uncertain about how to proceed or what to answer, please reach out to me immediately. Do not delay for an entire week; I expect you to be proactive in resolving these issues.


In [109]:
# NOTE(review): this rebinds `results`, which earlier held the
# {filename: transcription} dict; from here on it is a list aligned with the
# rows of df_grouped.
results = []

for transcript in tqdm(df_grouped.text):
    if len(transcript) > 80:
        prompt = prompt_template.format(transcript=transcript)
        result = llm(prompt)
        results.append(result)
    else:
        # Transcripts of 80 characters or fewer are not sent to the model;
        # None keeps the list aligned with df_grouped.
        results.append(None)

  0%|          | 0/149 [00:00<?, ?it/s]

In [117]:
# Split every model reply into its first line (title) and the rest (edited text).
titles = []
edited = []

for result in results:
    if result is None:
        # Transcript was skipped by the editing loop; keep the row aligned
        titles.append(None)
        edited.append(None)
        continue
    # Strip first so a reply that starts with blank lines still yields its
    # title. partition() never fails to unpack: a one-line reply becomes a
    # title with an empty body instead of raising ValueError.
    title, _, text = result.strip().partition('\n')
    titles.append(title.strip())
    edited.append(text.strip())

In [120]:
df_grouped['title'] = titles
df_grouped['edited'] = edited

In [123]:
df_grouped.to_csv('data/edited-transcripts.csv', index=False)

In [124]:
df_sorted = df_grouped.sort_values(by='datetime', ascending=False)

In [125]:
df_sorted.head()

Unnamed: 0,filename,number,datetime,text,title,edited
148,"audio_199@02-09-2024_09-58-02.ogg,audio_200@02...",199200,2024-09-02 09:58:02,"So the process is, you need to create a copy, ...",Process for Document Submission,"To begin, you need to create a copy using our ..."
147,"audio_197@31-08-2024_09-52-31.ogg,audio_198@31...",197198,2024-08-31 09:52:31,yeah when you download the files so you don't ...,Downloading Files: Important Information,"When downloading the files, it's important to ..."
146,audio_196@29-08-2024_20-19-30.ogg,196,2024-08-29 20:19:30,I just noticed that you added the ParadeDB vid...,Open Source Spotlight Video Process,I noticed that you added the ParadeDB video to...
145,audio_195@27-08-2024_13-34-49.ogg,195,2024-08-27 13:34:49,"Also, I want you to focus this week on getting...",Podcast Priorities and Upcoming Tasks,"This week, I want you to focus on completing a..."
144,audio_194@27-08-2024_13-25-05.ogg,194,2024-08-27 13:25:05,I just forwarded you a message for a newslette...,Newsletter Promotion Process,I just forwarded you a message regarding a new...


In [136]:
# Export all notes, newest first, as "title / datetime / text" separated by "---".
with open('data/voice-notes-transcripts.txt', 'w', encoding='utf-8') as f_out:
    for _, row in df_sorted.iterrows():
        # Short recordings were never sent to the LLM, so their title/edited
        # cells are empty; write the raw transcript for them instead of the
        # literal string "None".
        has_title = pd.notna(row['title'])
        body = row['edited'] if pd.notna(row['edited']) else row['text']

        if has_title:
            f_out.write(f"{row['title']}\n")
        f_out.write(f"{row['datetime']}\n\n")

        f_out.write(f"{body}\n")
        f_out.write("\n---\n\n")

## Edit and summarize Loom transcripts

In [41]:
# Create the output folder in pure Python: works on every OS and is safe to
# re-run when the folder already exists.
Path('data/processes').mkdir(parents=True, exist_ok=True)

In [7]:
transcripts_folder = Path('data/loom_transcripts')

In [10]:
transcript_files = sorted(transcripts_folder.glob('*.srt'))

In [11]:
transcript_files

[WindowsPath('data/loom_transcripts/A few comments about the podcast document.srt'),
 WindowsPath('data/loom_transcripts/A Guide to Free Online Courses at Data Talks Club.srt'),
 WindowsPath('data/loom_transcripts/Accessing Airtable and updating data there.srt'),
 WindowsPath("data/loom_transcripts/Accessing Alexey's calendar and seeing when to schedule events.srt"),
 WindowsPath('data/loom_transcripts/Add them to the Sponsorship CRM.srt'),
 WindowsPath('data/loom_transcripts/Adding an author to book of the week pages.srt'),
 WindowsPath('data/loom_transcripts/Adding conference sponsors to Sponsorship CRM.srt'),
 WindowsPath('data/loom_transcripts/Adding event hosts to meetup events.srt'),
 WindowsPath('data/loom_transcripts/Adding Events to Course Calendars 📅.srt'),
 WindowsPath('data/loom_transcripts/Adding guest bio to the podcast document.srt'),
 WindowsPath('data/loom_transcripts/Adding Images to Articles on Our Website.srt'),
 WindowsPath('data/loom_transcripts/Adding influencer 

In [17]:
import re

def extract_text_from_srt(file_path):
    """Return the spoken text of an SRT subtitle file, one caption line per line.

    Subtitle counters and ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamp lines are
    removed, every remaining line is stripped, and empty lines are dropped.

    Args:
        file_path: path of the ``.srt`` file (anything ``open`` accepts).

    Returns:
        The remaining lines joined with ``\\n``.
    """
    # 'utf-8-sig' reads plain UTF-8 as usual but also drops a leading BOM,
    # which would otherwise end up glued to the first caption line.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    # Remove subtitle numbers and timestamps
    pattern = r'\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n'
    content_without_timecodes = re.sub(pattern, '', content)

    # Split the remaining content into lines and remove empty lines
    lines = [line.strip() for line in content_without_timecodes.split('\n') if line.strip()]

    return '\n'.join(lines)

In [43]:
f = transcript_files[1]
text = extract_text_from_srt(f)

In [74]:
prompt_template = """
You are a professional transcript editor. We give you a transcript of a video,
where we typically show a process, and your task is to create a process
document

In the document

- give a title (or use existing from the filename)
- describe the context:
  - what do we do
  - why do we do it
  - when should we do it
- in the context, be as laconic as possible. Don't include things like
    "in this process document...", "this should be executed...", "this document outlines".
    Go straight to the point. 
- describe the steps to execute the taks
- edit the content so it makes sense, it's concise and structured
- don't include markdown in the output

Output format:

Title

What: <WHAT>

Why: <WHY>

When: <WHEN>

1. Step 1

Details

2. Step 2

Details

...

If the transcript doesn't describe the process, simply output an edited concise text that reads naturally

Filename: {filename}

Transcript:

{transcript}
""".strip()

In [75]:
prompt = prompt_template.format(filename=f.name, transcript=text)

In [76]:
print(prompt)

You are a professional transcript editor. We give you a transcript of a video,
where we typically show a process, and your task is to create a process
document

In the document

- give a title (or use existing from the filename)
- describe the context:
  - what do we do
  - why do we do it
  - when should we do it
- in the context, be as laconic as possible. Don't include things like
    "in this process document...", "this should be executed...", "this document outlines".
    Go straight to the point. 
- describe the steps to execute the taks
- edit the content so it makes sense, it's concise and structured
- don't include markdown in the output

Output format:

Title

What: <WHAT>

Why: <WHY>

When: <WHEN>

1. Step 1

Details

2. Step 2

Details

...

If the transcript doesn't describe the process, simply output an edited concise text that reads naturally

Filename: A Guide to Free Online Courses at Data Talks Club.srt

Transcript:

uh so we have this new article by Valeria,
a guide 

In [77]:
result = llm(prompt)

In [78]:
print(result)

A Guide to Free Online Courses at Data Talks Club

What: Updating course information on the Data Talks Club website.

Why: To ensure accurate and current details about free online courses, particularly regarding starting dates.

When: When course information changes, especially before new cohorts start.

1. Access the repository

Navigate to the website repository for the Data Talks Club and locate the appropriate section (e.g., Boston).

2. Locate the article

Find the article titled "Guide to Free Online Courses at Data Talks Club."

3. Edit the article

Click the "edit" button to make changes to the content. Locate the section with the course details, particularly the starting dates.

4. Update course information

Update the starting date if necessary (e.g., changing from 2024 to 2025). 

5. Save changes

After making the required updates, save the changes to ensure the information is current.


In [80]:
new_file = f.parent.parent / 'processes' / (f.name[:-3] + 'txt')
new_file

WindowsPath('data/processes/A few comments about the podcast document.txt')

In [81]:
# Turn every Loom transcript into a process document under data/processes.
for f in tqdm(transcript_files):
    text = extract_text_from_srt(f)
    prompt = prompt_template.format(filename=f.name, transcript=text)
    result = llm(prompt)

    # Same base name as the transcript, with the extension swapped for .txt
    new_file = f.parent.parent / 'processes' / (f.stem + '.txt')
    # Make sure the target folder exists, so this loop does not depend on a
    # separate directory-creation cell having been run first.
    new_file.parent.mkdir(parents=True, exist_ok=True)

    with new_file.open('wt', encoding='utf-8') as f_out:
        f_out.write(result)

  0%|          | 0/218 [00:00<?, ?it/s]