## Clean and Convert Chats to '.TXT'

In [1]:
import re
import pandas as pd

# Regular expressions used while cleaning the exported chat.

# Prefix that opens every new message, e.g. "8/3/2025, 9:41 pm - ".
timestamp_pattern = r'^\d{1,2}/\d{1,2}/\d{4},\s\d{1,2}:\d{2}\s[ap]m\s-\s'
# A URL: "http://", "https://" or "www." followed by non-whitespace characters.
link_pattern = r'(https?://|www\.)\S+'
# Marker text (leading space included) that gets stripped out of messages.
edited_message_pattern = r' <This message was edited>'
# Placeholder texts; lines containing either of them are discarded.
deleted_message_pattern = r'This message was deleted'
you_deleted_this_message_pattern = r'You deleted this message'
# Shortest prefix that is followed by a colon and a whitespace character;
# group 1 is treated as the candidate speaker name.
speaker_detection_pattern = r'^(.*?):\s'

# Location of the raw chat export.
CHAT_EXPORT_PATH = '/Users/aloshdenny/Downloads/WhatsApp Chat with Joachii🐧/WhatsApp Chat with Joachii🐧.txt'

# Load the whole export into memory, one list entry per physical line
# (line endings are kept).
with open(CHAT_EXPORT_PATH, encoding='utf-8') as chat_file:
    lines = list(chat_file)

# First pass: work out who the two participants are.
potential_speakers = []
for raw_line in lines:
    if re.match(timestamp_pattern, raw_line) is None:
        # No timestamp: this is the continuation of a multi-line message
        # and carries no speaker prefix.
        continue

    body = re.sub(timestamp_pattern, '', raw_line).strip()
    name_match = re.match(speaker_detection_pattern, body)
    if name_match is None:
        continue

    candidate = name_match.group(1)
    # Accept only short, colon-free prefixes as names, so that things like
    # "Ps:" inside a longer message are less likely to be taken for a speaker.
    if ":" in candidate or len(candidate) >= 30:
        continue
    potential_speakers.append(candidate)

# Tally how often each candidate name opened a message.
speaker_counts = {}
for candidate in potential_speakers:
    speaker_counts[candidate] = speaker_counts.get(candidate, 0) + 1

# Rank candidates by frequency (most frequent first) and keep the top two.
# sorted() is stable, so candidates with equal counts stay in first-seen order.
sorted_speakers = sorted(speaker_counts.items(), key=lambda pair: pair[1], reverse=True)
valid_speakers = [name for name, _ in sorted_speakers[:2]]

print(f"Identified speakers: {valid_speakers}")

# Second pass: extract messages
#
# Walk the export line by line. A line that starts with a timestamp opens a
# new message; any other line is a continuation of the message currently being
# built. Each kept message is a dict {'speaker': ..., 'text': ...}.
messages = []
current_message = None  # message being assembled, or None while skipping

for line in lines:
    if re.match(timestamp_pattern, line):
        # A new timestamp closes whatever message was in progress.
        if current_message is not None:
            messages.append(current_message)
        # Treat the new line as rejected until it passes every check below.
        # While this stays None, its continuation lines are ignored as well.
        current_message = None

        # Remove timestamp
        line_without_timestamp = re.sub(timestamp_pattern, '', line).strip()

        # Skip lines with unwanted content. These are substring / regex
        # searches over the whole line, speaker prefix included.
        if ('Media omitted' in line_without_timestamp or
                'null' in line_without_timestamp or
                re.search(link_pattern, line_without_timestamp) or
                re.search(deleted_message_pattern, line_without_timestamp) or
                re.search(you_deleted_this_message_pattern, line_without_timestamp)):
            continue

        # Remove edited message indicator
        line_without_timestamp = re.sub(edited_message_pattern, '', line_without_timestamp).strip()

        # Drop very long messages: more than 100 whitespace-separated words,
        # counted on the first line only and with the speaker prefix included.
        if len(line_without_timestamp.split()) > 100:
            continue

        # Identify speaker. The pattern captures the shortest prefix before the
        # first colon followed by whitespace, and the names in valid_speakers
        # contain no ':' themselves, so a line that starts with
        # "<valid speaker>: " always yields exactly that speaker here. A
        # separate startswith() fallback could never trigger and is not needed.
        speaker_match = re.match(speaker_detection_pattern, line_without_timestamp)
        if speaker_match is None:
            continue
        speaker = speaker_match.group(1)
        if speaker not in valid_speakers:
            continue

        # +2 skips the colon and the whitespace character after the name.
        message_text = line_without_timestamp[len(speaker) + 2:].strip()
        current_message = {'speaker': speaker, 'text': message_text}

    # Continuation line (no timestamp) of a message that is being kept
    elif current_message is not None:
        current_message['text'] += " " + line.strip()

# Add the last message if there is one
if current_message is not None:
    messages.append(current_message)

# Merge runs of consecutive messages by the same speaker into a single turn,
# joining the individual texts with ". ".
concatenated_messages = []

for msg in messages:
    same_speaker_as_last = (
        bool(concatenated_messages)
        and concatenated_messages[-1]['speaker'] == msg['speaker']
    )
    if same_speaker_as_last:
        # Same speaker keeps talking: extend the turn that is already open.
        concatenated_messages[-1]['text'] += ". " + msg['text']
    else:
        # Speaker changed (or this is the very first message): open a new turn.
        # A fresh dict is used so the entries in `messages` stay untouched.
        concatenated_messages.append({'speaker': msg['speaker'], 'text': msg['text']})

# Write the merged turns to disk, one "speaker: text" line per turn.
with open('/Users/aloshdenny/Downloads/WhatsApp Chat with Joachii🐧/updated.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        f"{turn['speaker']}: {turn['text']}\n" for turn in concatenated_messages
    )

print(f"Processed {len(concatenated_messages)} messages")

Identified speakers: ['Joachii🐧', 'joe']
Processed 10651 messages


## Convert to CSV

In [2]:
import pandas as pd
import re

# Cleaned transcript to read, and the CSV file to produce from it
input_file = '/Users/aloshdenny/Downloads/WhatsApp Chat with Joachii🐧/updated.txt'
output_file = '/Users/aloshdenny/Downloads/WhatsApp Chat with Joachii🐧/conversation.csv'

# Load the cleaned transcript, one entry per line
with open(input_file, encoding='utf-8') as transcript:
    lines = transcript.readlines()

# Split each "speaker: text" line at the first colon that is followed by
# whitespace. Lines that do not have this shape are ignored.
line_pattern = re.compile(r'^(.*?):\s(.*)$')
parsed_lines = (line_pattern.match(raw_line.strip()) for raw_line in lines)
messages = [
    {'speaker': parsed.group(1), 'text': parsed.group(2)}
    for parsed in parsed_lines
    if parsed
]

# Identify the two speakers.
# dict.fromkeys() de-duplicates while keeping first-appearance order, so
# speaker1 / speaker2 (and with them the CSV column order) come out the same on
# every run. Going through a set would not guarantee that: the iteration order
# of a set of strings can differ between interpreter sessions.
speakers = list(dict.fromkeys(msg['speaker'] for msg in messages))

# A conversation table needs two participants; fail with a clear message
# instead of an IndexError further down.
if len(speakers) < 2:
    raise ValueError(
        f"Need at least 2 speakers to build a conversation, found {len(speakers)}: {speakers}"
    )

if len(speakers) > 2:
    print(f"Warning: Found {len(speakers)} speakers instead of 2. Using the first two speakers found.")
    speakers = speakers[:2]

speaker1, speaker2 = speakers

print(f"Speaker 1: {speaker1}")
print(f"Speaker 2: {speaker2}")

# Build the conversation table: one row per exchange, one column per speaker.
main_speakers = (speaker1, speaker2)
conversation = []
current_row = dict.fromkeys(main_speakers, "")
last_speaker = None

for msg in messages:
    current_speaker = msg['speaker']

    # Anything said by someone other than the two main speakers is ignored.
    if current_speaker not in main_speakers:
        continue

    if current_speaker == last_speaker:
        # Same speaker again: extend what they have already said in this row.
        current_row[current_speaker] += " " + msg['text']
    else:
        # Speaker changed (or this is the first message). Once both columns of
        # the current row are filled, the row is finished and a new one starts.
        if all(current_row.values()):
            conversation.append(current_row)
            current_row = dict.fromkeys(main_speakers, "")
        current_row[current_speaker] = msg['text']

    last_speaker = current_speaker

# Keep the trailing row as long as at least one side said something.
if any(current_row.values()):
    conversation.append(current_row)

# Convert to a DataFrame and save as CSV (utf-8-sig writes a BOM first).
df = pd.DataFrame(conversation)
df.to_csv(output_file, index=False, encoding='utf-8-sig')

Speaker 1: joe
Speaker 2: Joachii🐧


## Test Gemini STT on a Single Voice Note

In [4]:
import os

from google import genai

# Read the API key from the environment instead of embedding it in the
# notebook: a key stored in a notebook is exposed to everyone who can read the
# file. Set GEMINI_API_KEY before starting the kernel; a missing variable
# raises KeyError here.
# NOTE: any key that was ever saved inside this notebook should be revoked.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Upload a single voice note so it can be referenced in the request below.
myfile = client.files.upload(file=r"C:\Users\alosh\Downloads\WhatsApp Chat with Joachii\PTT-20250308-WA0020.opus")

# Ask the model for an English, word-for-word rendering of the clip.
response = client.models.generate_content(
  model='gemini-2.0-flash',
  contents=['Translate this audio clip to English, word for word', myfile]
)

print(response.text)

The session just got over and everyone is so happy with it, including me. So fun. Honey, I loved it, man. They all came up to me saying enjoy, Chechi. I have never had such a fun session. There was no such a mental session. I'm so happy. I'm so happy.



## Bulk STT for All Audio Files

In [None]:
import os
import json
from google import genai

# Initialize the client. The API key is read from the environment instead of
# being embedded in the notebook; set GEMINI_API_KEY before starting the
# kernel (a missing variable raises KeyError here).
# NOTE: any key that was ever saved inside this notebook should be revoked.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Define the directory where .opus files are located
directory = r"C:\Users\alosh\Downloads\WhatsApp Chat with Joachii"

# JSON file that receives the {filename: transcription} mapping
output_path = 'joanne.json'

# Instruction sent to the model together with every audio file
prompt = 'Translate this audio clip to English, word for word'

# Initialize a dictionary to store the transcriptions
transcriptions_dict = {}

# Collect the .opus files up front; sorting makes the processing order (and
# the key order in the JSON file) deterministic.
opus_files = sorted(name for name in os.listdir(directory) if name.endswith(".opus"))

try:
    for filename in opus_files:
        file_path = os.path.join(directory, filename)

        # Upload the .opus file
        myfile = client.files.upload(file=file_path)

        # Send the file to the model for transcription
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[prompt, myfile]
        )

        # Save the transcription in the dictionary using the filename as the key
        transcriptions_dict[filename] = response.text

        # Print the result to the console (optional)
        print(f"Transcription for {filename}:")
        print(response.text)
        print("\n")
finally:
    # Write whatever has been transcribed so far, even when an upload or model
    # call raised part-way through; the error itself still propagates. The
    # file is opened only now, so a failed run no longer leaves an empty,
    # truncated output file behind.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(transcriptions_dict, json_file, indent=4, ensure_ascii=False)