In [1]:
import ray.data
import ray
import ray.cloudpickle as pickle
from collections import defaultdict
from dataclasses import dataclass
from typing import List
import requests
from pprint import pprint
import os

In [2]:
call_ids = [3166028376916322699]

In [3]:
ray.init()

2023-03-03 14:23:46,106	INFO worker.py:1360 -- Connecting to existing Ray cluster at address: 10.0.3.202:6379...
2023-03-03 14:23:46,113	INFO worker.py:1548 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_kb3ste2ly7ykmwplrp9lcbkrra/services?redirect_to=dashboard [39m[22m
2023-03-03 14:23:46,117	INFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_7edc2ae5aff4f6292846244b76d85f65.zip' (0.29MiB) to Ray cluster...
2023-03-03 14:23:46,121	INFO packaging.py:343 -- Successfully pushed file package 'gcs://_ray_pkg_7edc2ae5aff4f6292846244b76d85f65.zip'.


0,1
Python version:,3.10.9
Ray version:,3.0.0.dev0
Dashboard:,http://console.anyscale-staging.com/api/v2/sessions/ses_kb3ste2ly7ykmwplrp9lcbkrra/services?redirect_to=dashboard


In [5]:
from api import get_call_data, get_transcript_data, Monologue, Sentence

calls_data = get_call_data(call_ids).get("calls")

call_summary = defaultdict(dict)
for call_data in calls_data:
    call_id = call_data["metaData"]["id"]
    call_title = call_data["metaData"]["title"]
    media_data = call_data.get("media")
    if media_data:
        call_summary[call_id].update({
            "call_id": call_id,
            "title": call_title,
            "audio": media_data.get("audioUrl"),
            "video": media_data.get("videoUrl"),
        })

transcripts_data = get_transcript_data(call_ids).get("callTranscripts")
transcript_monologues = []
transcript_text = ""
for t_data in transcripts_data:
    call_id = t_data["callId"]
    call_transcript = []

    for mono_data in t_data["transcript"]:
        speaker_id = mono_data["speakerId"]
        topic = mono_data["topic"]
        sentences = []
        monologue_str = []
        for s in mono_data["sentences"]:
            sentences.append(
                Sentence(s["text"], s["start"], s["end"])
            )
            monologue_str.append(s["text"])
            
        mono = Monologue(sentences, speaker_id, topic, call_id=int(call_id))
        transcript_monologues.append(mono)

        monologue_str = " ".join(monologue_str)
        call_transcript.append(str(mono))
        # transcript_text += "\n".join(monologue_str)
        # transcript_text += "------\n\n"
    
    call_summary[call_id].update({
        "transcript": "\n".join(call_transcript)
    })
    transcript_text += "\n".join(call_transcript)

In [6]:
data = ray.data.read_parquet("s3://antoni-test/gong-calls/3166028376916322699.parquet")
df = data.to_pandas()

(_get_read_tasks pid=182690)   pq_ds.pieces, **prefetch_remote_args
(_get_read_tasks pid=182690)   num_files = len(self._pq_ds.pieces)
(_get_read_tasks pid=182690)   self._pq_ds.pieces[idx]
Parquet Files Sample:   0%|          | 0/1 [00:00<?, ?it/s]
Parquet Files Sample: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]s pid=182690) 
(_get_read_tasks pid=182690)   np.array_split(self._pq_ds.pieces, parallelism),
Read progress: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


In [7]:
segments = pickle.loads(df["word_segments"].iloc[0])

In [8]:
from typing import Optional

In [9]:
def to_ms_int(segment):
    segment["start"] = round(segment["start"] * 1000)
    segment["end"] = round(segment["end"] * 1000)
    return Sentence(text=segment["text"], start_ts=segment["start"], end_ts=segment["end"])

segments = [to_ms_int(segment) for segment in segments]

In [10]:
def modify_ts(segment, delta):
    segment.start_ts -= delta
    segment.end_ts -= delta
    return segment

def align_timestamps(segments, gong_monologues):
    delta_start = segments[0].start_ts - gong_monologues[0].start_ts
    return [modify_ts(segment, delta_start) for segment in segments]
    

In [11]:
aligned_segments = align_timestamps(segments, transcript_monologues)

In [179]:
import itertools
from copy import deepcopy
import re

def pairwise(iterable):
    "s -> (s0, s1), (s1, s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

end_of_sentence_regex = r"[\.\!\?\-\–]$"
start_of_sentence_regex = r"^[A-Z]"

def reverse_enumerate(data: list):
    for i in range(len(data)-1, -1, -1):
        yield (i, data[i])

def merge_speakers(transcript_monologues):
    original_transcript_monologues = transcript_monologues
    transcript_monologues = deepcopy(transcript_monologues)
    for i in range(3):
        merged_monologues = []
        last_monologue = transcript_monologues[0]
        for monologue, next_monologue in pairwise(transcript_monologues):

            if monologue.sentences and last_monologue.sentences and last_monologue.speaker == next_monologue.speaker and last_monologue.speaker != monologue.speaker:
                if not re.match(start_of_sentence_regex, monologue[0].text) and not re.search(end_of_sentence_regex, monologue[-1].text):
                    merged_monologues.pop()
                    monologue.sentences = last_monologue.sentences + monologue.sentences + next_monologue.sentences
                    next_monologue.sentences = []

            if monologue.speaker == next_monologue.speaker:
                monologue.sentences = monologue.sentences + next_monologue.sentences
                next_monologue.sentences = []
            if monologue.sentences:
                merged_monologues.append(monologue)
            last_monologue = monologue
        if next_monologue.sentences:
            merged_monologues.append(next_monologue)
        transcript_monologues = merged_monologues

    assert "".join(["".join([y.text for y in x]) for x in original_transcript_monologues]) == "".join(["".join([y.text for y in x]) for x in transcript_monologues])
    return transcript_monologues

def assign_gong_speaker(segments, transcript_monologues):
    whisper_monologues = []
    segments_in_monologue = []
    it = iter(transcript_monologues)
    monologue = next(it)
    next_monologue = next(it)
    for segment in segments:
        if segment.start_ts >= monologue.end_ts:
            whisper_monologues.append(
                Monologue(
                    segments_in_monologue,
                    monologue.speaker_id,
                    "None",
                    monologue.call_id
                )
            )
            segments_in_monologue = []
            monologue = next_monologue
            try:
                next_monologue = next(it)
            except StopIteration:
                pass
        segments_in_monologue.append(segment)
    if segments_in_monologue:
        whisper_monologues.append(
            Monologue(
                segments_in_monologue,
                monologue.speaker_id,
                "None",
                monologue.call_id
            )
        )
    return whisper_monologues

def fix_sentences(monologues):
    monologues = deepcopy(monologues)
    for monologue, next_monologue in pairwise(monologues):
        if not next_monologue:
            continue
        if not re.match(start_of_sentence_regex, next_monologue.sentences[0].text) or not re.search(end_of_sentence_regex, monologue.sentences[-1].text):
            delta_front = -1
            delta_back = -1
            num_words_back = -1
            num_words_front = -1
            found_capital = None
            index_front = None
            index_back = None
            for i, sentence in reverse_enumerate(monologue):
                num_words_back += 1
                if found_capital is not None and re.search(end_of_sentence_regex, sentence.text):
                    index_back = i+1
                    delta_back = found_capital - sentence.end_ts
                    break
                if re.match(start_of_sentence_regex, sentence.text):
                    found_capital = sentence.start_ts

            found_end = None

            for i, sentence in enumerate(next_monologue):
                num_words_front += 1
                if found_end is not None and re.match(start_of_sentence_regex, sentence.text):
                    index_front = i
                    delta_front = sentence.start_ts - found_end
                    break

                if re.search(end_of_sentence_regex, sentence.text):
                    found_end = sentence.end_ts

           # print(f"delta_front {delta_front} delta_back {delta_back}")
           # print(f"num_words_front {num_words_front} num_words_back {num_words_back}")
           # print(str(monologue))
           # print(str(next_monologue))

            if num_words_back == 1:
                delta_back = float("inf")

            if num_words_front == 1:
                delta_front = float("inf")

            if delta_front > delta_back and index_front is not None:
                index = index_front
                monologue.sentences = monologue.sentences + next_monologue.sentences[:index] 
                next_monologue.sentences = next_monologue.sentences[index:]
            elif index_back is not None:
                index = index_back
                next_monologue.sentences = monologue.sentences[index:] + next_monologue.sentences
                monologue.sentences = monologue.sentences[:index]
    return monologues

In [180]:
len(transcript_monologues)

139

In [181]:
merged_transcript_monologues = merge_speakers(transcript_monologues)

In [182]:
len(merged_transcript_monologues)

108

In [183]:
whisper_monologues = assign_gong_speaker(aligned_segments, transcript_monologues)

In [184]:
whisper_monologues_merged = merge_speakers(whisper_monologues)

In [185]:
whisper_monologues_fixed = merge_speakers(fix_sentences(whisper_monologues))

In [186]:
len(whisper_monologues_fixed)

104

In [187]:
whisper_monologues_fixed_merged = merge_speakers(whisper_monologues_fixed)

In [188]:
len(whisper_monologues_fixed_merged)

103

In [189]:
with open("fixed.txt", "w") as f:
    f.write("".join([str(m) for m in whisper_monologues_fixed_merged]))

In [190]:
with open("whisper_um.txt", "w") as f:
    f.write("".join([str(m) for m in whisper_monologues]))

In [191]:
with open("whisper.txt", "w") as f:
    f.write("".join([str(m) for m in whisper_monologues_merged]))

In [192]:
with open("gong.txt", "w") as f:
    f.write("".join([str(m) for m in merged_transcript_monologues]))

In [74]:
print(" ".join([str(m) for m in transcript_monologues[:11]]))

(1230) SCOTT CECIL: Hey, Alex. Good to see you.
 (2620) ALEX POST POST: Hey, you too. How's it going?
 (4740) SCOTT CECIL: Doing well. Doing well. I'm up today first time in three months. So, all right. I'm feeling it looks like you're in your, you're at your house?
 (12130) ALEX POST POST: Yep. Yep. So a, last week, but we're back now.
 (19370) SCOTT CECIL: Okay, very nice. Yeah, I'll travel this morning where Alex remind me where are you located again?
 (25170) ALEX POST POST: I am in green rapids Michigan. Okay. Nice. Cool. All right.
 (29660) SCOTT CECIL: It looks like we got David on David welcome. Nice to meet you. I don't think we met yet. Nice to me too. Cool. We have some new faces on our side. So maybe we can start with introductions today. Agenda is pretty straightforward today. Really wanna dive into Rllib, the training side of things as well. Even show you a demo. And on the call today of a, is one of our lead sales engineers who will perform a demo today on the Anyscale p

In [None]:
transcript_monologues[:2]

In [None]:
segments[:10]