In [1]:
import ray.data
import ray
import ray.cloudpickle as pickle
from collections import defaultdict
from dataclasses import dataclass
from typing import List
import requests
from pprint import pprint
import os

In [2]:
# IDs of the calls to process; passed to get_call_data() and get_transcript_data() below.
call_ids = [3166028376916322699]

In [3]:
# Attach to (or start) Ray. ignore_reinit_error makes this cell safe to re-run in a
# live kernel, where a second plain ray.init() call raises instead of reusing the
# existing connection.
ray.init(ignore_reinit_error=True)

2023-03-03 13:01:02,477	INFO worker.py:1360 -- Connecting to existing Ray cluster at address: 10.0.3.202:6379...
2023-03-03 13:01:02,484	INFO worker.py:1548 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_kb3ste2ly7ykmwplrp9lcbkrra/services?redirect_to=dashboard [39m[22m
2023-03-03 13:01:02,488	INFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_987ad3bfc6c80bd19a712fd06485b82b.zip' (0.14MiB) to Ray cluster...
2023-03-03 13:01:02,490	INFO packaging.py:343 -- Successfully pushed file package 'gcs://_ray_pkg_987ad3bfc6c80bd19a712fd06485b82b.zip'.


0,1
Python version:,3.10.9
Ray version:,3.0.0.dev0
Dashboard:,http://console.anyscale-staging.com/api/v2/sessions/ses_kb3ste2ly7ykmwplrp9lcbkrra/services?redirect_to=dashboard


In [5]:
from api import get_call_data, get_transcript_data, Monologue, Sentence

# Fetch the metadata for every requested call.
calls_data = get_call_data(call_ids).get("calls")

# One summary dict per call id. Calls without a "media" entry are skipped here,
# so they get no summary entry from this loop.
call_summary = defaultdict(dict)
for call_data in calls_data:
    meta_data = call_data["metaData"]
    call_id, call_title = meta_data["id"], meta_data["title"]
    media_data = call_data.get("media")
    if not media_data:
        continue
    call_summary[call_id].update(
        call_id=call_id,
        title=call_title,
        audio=media_data.get("audioUrl"),
        video=media_data.get("videoUrl"),
    )

# Fetch the transcripts for the same calls and rebuild them as Monologue / Sentence objects.
transcripts_data = get_transcript_data(call_ids).get("callTranscripts")
transcript_monologues = []  # every Monologue across all calls, in the order returned
transcript_text = ""  # rendered transcripts of all calls, concatenated without a separator
for t_data in transcripts_data:
    call_id = t_data["callId"]
    call_transcript = []  # str(Monologue) for each monologue of this call

    for mono_data in t_data["transcript"]:
        speaker_id = mono_data["speakerId"]
        topic = mono_data["topic"]
        sentences = [
            Sentence(s["text"], s["start"], s["end"])
            for s in mono_data["sentences"]
        ]

        # call_id is passed through int(); presumably the API returns it as a string — TODO confirm.
        mono = Monologue(sentences, speaker_id, topic, call_id=int(call_id))
        transcript_monologues.append(mono)
        call_transcript.append(str(mono))

    # NOTE(review): this uses the raw callId as the key, while the metadata loop keys
    # call_summary by metaData["id"]. If the two differ in type (str vs int) the
    # transcript lands in a separate entry — confirm against the API response.
    call_summary[call_id].update({
        "transcript": "\n".join(call_transcript)
    })
    transcript_text += "\n".join(call_transcript)

In [6]:
# Load the parquet file stored for the first call in call_ids. The path is derived
# from call_ids so the file read here always matches the call fetched above.
data = ray.data.read_parquet(f"s3://antoni-test/gong-calls/{call_ids[0]}.parquet")
df = data.to_pandas()

(_get_read_tasks pid=129460)   pq_ds.pieces, **prefetch_remote_args
(_get_read_tasks pid=129460)   num_files = len(self._pq_ds.pieces)
(_get_read_tasks pid=129460)   self._pq_ds.pieces[idx]
Parquet Files Sample:   0%|          | 0/1 [00:00<?, ?it/s]
Parquet Files Sample: 100%|██████████| 1/1 [00:02<00:00,  2.41s/it]s pid=129460) 
(_get_read_tasks pid=129460)   np.array_split(self._pq_ds.pieces, parallelism),
Read progress: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


In [7]:
# Unpickle the "word_segments" value of the first row.
# NOTE(security): unpickling can execute arbitrary code embedded in the payload;
# only run this against parquet files from a trusted source.
segments = pickle.loads(df["word_segments"].iloc[0])

In [8]:
from typing import Optional

In [9]:
def to_ms_int(segment):
    """Build a ``Sentence`` from a segment mapping, scaling its timestamps by 1000.

    Args:
        segment: Mapping with ``"text"``, ``"start"`` and ``"end"`` keys.
            ``"start"`` / ``"end"`` are presumably seconds (the name suggests a
            conversion to milliseconds) — TODO confirm.

    Returns:
        A ``Sentence`` whose ``start_ts`` / ``end_ts`` are the rounded, scaled values.

    The input mapping is not modified, so converting the same raw segment twice
    yields the same result instead of scaling the timestamps a second time.
    """
    start_ms = round(segment["start"] * 1000)
    end_ms = round(segment["end"] * 1000)
    return Sentence(text=segment["text"], start_ts=start_ms, end_ts=end_ms)

# Rebind `segments` from the raw mappings to the objects returned by to_ms_int.
segments = list(map(to_ms_int, segments))

In [10]:
def modify_ts(segment, delta):
    """Shift ``segment`` earlier by ``delta``, in place, and return the same object.

    Both ``start_ts`` and ``end_ts`` are reduced by ``delta``.
    """
    shifted_start = segment.start_ts - delta
    shifted_end = segment.end_ts - delta
    segment.start_ts, segment.end_ts = shifted_start, shifted_end
    return segment

def align_timestamps(segments, gong_monologues):
    """Shift all segments so the first one starts when the first Gong monologue starts.

    Args:
        segments: Sequence of objects with ``start_ts`` / ``end_ts`` attributes.
        gong_monologues: Sequence whose first item exposes ``start_ts``; only
            that first item is used as the reference.

    Returns:
        A list of the same segment objects, each shifted in place by
        ``modify_ts``. An empty ``segments`` returns an empty list.

    Raises:
        IndexError: if ``segments`` is non-empty but ``gong_monologues`` is an
            empty list, since there is no reference timestamp to align to.
    """
    # Nothing to align; avoids indexing segments[0] on an empty input.
    if len(segments) == 0:
        return []
    delta_start = segments[0].start_ts - gong_monologues[0].start_ts
    return [modify_ts(segment, delta_start) for segment in segments]
    

In [11]:
# Shift the segments onto the timeline of the API transcript.
# The shift is applied in place, so aligned_segments holds the same objects as segments.
aligned_segments = align_timestamps(segments, transcript_monologues)

In [73]:
import itertools
from copy import deepcopy
import re

def pairwise(iterable):
    """Return an iterator over overlapping consecutive pairs of ``iterable``.

    ``s0, s1, s2, s3`` produces ``(s0, s1), (s1, s2), (s2, s3)``; an input with
    fewer than two items produces nothing.
    """
    trailing, leading = itertools.tee(iterable, 2)
    # Advance the second copy by one item so it runs one step ahead of the first.
    next(leading, None)
    return zip(trailing, leading)

def reverse_enumerate(data: list):
    """Yield ``(index, item)`` pairs from the last element of ``data`` down to the first.

    Only ``len(data)`` and ``data[index]`` are used, so any object supporting
    both works.
    """
    position = len(data)
    while position > 0:
        position -= 1
        yield (position, data[position])

def assign_gong_speaker(segments, transcript_monologues):
    """Group ``segments`` into monologues using the speaker turns of ``transcript_monologues``.

    Segments are consumed in order. A segment belongs to the current turn until
    its ``start_ts`` reaches that turn's ``end_ts``; then the collected segments
    are emitted as a ``Monologue`` carrying the turn's ``speaker_id`` and
    ``call_id`` (topic is the string ``"None"``) and the next turn becomes current.

    Compared with a single-step advance this version:
      * emits the segments collected for the final turn (they were previously dropped),
      * works with zero or one reference monologue instead of raising ``StopIteration``,
      * skips over several turns at once when a segment starts after more than one
        of them has ended,
      * keeps every segment that starts after the last turn's end in that last turn
        rather than emitting one monologue per segment,
      * never emits a monologue with no segments.

    Args:
        segments: Iterable of objects with a ``start_ts`` attribute, in time order.
        transcript_monologues: Iterable of reference monologues exposing
            ``end_ts``, ``speaker_id`` and ``call_id``.

    Returns:
        List of ``Monologue`` objects built from ``segments``; empty when there
        are no reference monologues or no segments.
    """
    whisper_monologues = []
    it = iter(transcript_monologues)
    monologue = next(it, None)
    if monologue is None:
        # No speaker turns to borrow a speaker from.
        return whisper_monologues
    next_monologue = next(it, None)

    segments_in_monologue = []
    for segment in segments:
        # Move forward past every turn that ended before this segment starts.
        # Once next_monologue is None the current turn is the last one and
        # absorbs all remaining segments.
        while next_monologue is not None and segment.start_ts >= monologue.end_ts:
            if segments_in_monologue:
                whisper_monologues.append(
                    Monologue(
                        segments_in_monologue,
                        monologue.speaker_id,
                        "None",
                        monologue.call_id
                    )
                )
                segments_in_monologue = []
            monologue = next_monologue
            next_monologue = next(it, None)
        segments_in_monologue.append(segment)

    # Flush whatever was collected for the turn that was current when input ran out.
    if segments_in_monologue:
        whisper_monologues.append(
            Monologue(
                segments_in_monologue,
                monologue.speaker_id,
                "None",
                monologue.call_id
            )
        )
    return whisper_monologues

def fix_sentences(monologues):
    """Move a sentence that straddles a speaker change into the later monologue.

    Works on a deep copy; the input is not modified. For each pair of adjacent
    monologues the boundary is considered broken when the later monologue's first
    entry does not start with ``A-Z`` or the earlier one's last entry does not end
    with a terminator (``. ! ? -`` or an en dash). In that case the earlier
    monologue is scanned backwards for an entry starting with a capital letter and
    then for a preceding entry ending with a terminator; everything after that
    terminator is moved to the front of the later monologue. If no such split
    point exists the pair is left as is.

    Changes relative to the previous version:
      * both boundary checks use the same terminator set (the en dash was only
        accepted by the backward scan),
      * pairs where either monologue has no sentences are skipped instead of
        failing on ``sentences[-1]`` / ``sentences[0]``,
      * the backward scan walks ``monologue.sentences`` directly, the same list
        the resulting index is used to slice.

    Args:
        monologues: Sequence of objects with a list-valued ``sentences``
            attribute whose items expose ``text``.

    Returns:
        The deep-copied monologues with sentences redistributed.
    """
    starts_with_capital = re.compile(r"^[A-Z]")
    ends_with_terminator = re.compile(r"[\.\!\?\-\–]$")

    monologues = deepcopy(monologues)
    for monologue, next_monologue in pairwise(monologues):
        if not monologue.sentences or not next_monologue.sentences:
            continue
        boundary_is_clean = (
            starts_with_capital.match(next_monologue.sentences[0].text)
            and ends_with_terminator.search(monologue.sentences[-1].text)
        )
        if boundary_is_clean:
            continue

        found_capital = False
        index = None
        for i, sentence in reverse_enumerate(monologue.sentences):
            # The terminator test runs before the capital test on purpose: the entry
            # that starts the dangling sentence must not also count as the entry
            # that ends the previous one.
            if found_capital and ends_with_terminator.search(sentence.text):
                index = i + 1
                break
            if starts_with_capital.match(sentence.text):
                # Trace every candidate sentence start found during the scan.
                print(f"found_capital {sentence.text}")
                found_capital = True
        if index is not None:
            next_monologue.sentences = monologue.sentences[index:] + next_monologue.sentences
            monologue.sentences = monologue.sentences[:index]
    return monologues

In [74]:
# Regroup the aligned segments into monologues using the speaker turns of the API transcript.
whisper_monologues = assign_gong_speaker(aligned_segments, transcript_monologues)

In [75]:
# Repair sentence boundaries on the first 11 monologues only; fix_sentences works on a
# deep copy, so whisper_monologues itself is left unchanged.
whisper_monologues_fixed = fix_sentences(whisper_monologues[:11])

found_capital All
found_capital And
found_capital Yeah,
found_capital No,


In [76]:
# Show the first 11 monologues after the sentence-boundary repair.
print(" ".join(map(str, whisper_monologues_fixed[:11])))

(1230) SCOTT CECIL: Hey Alex, good to see you.
 (2583) ALEX POST POST: Hey, you too. How's it going?
 (4684) SCOTT CECIL: Doing well, doing well. I'm in office today, first time in three months. It's a feeling. It looks like you're at your house?
 (12427) ALEX POST POST: Yep. Very nice. Yep. So, I traveled last week, but we're back now.
 (19401) SCOTT CECIL: Okay, very nice. Yeah, I traveled this morning. Alex, remind me, where are you located again?
 (25248) ALEX POST POST: I am in Grand Rapids, Michigan. Okay, nice. Cool.
 (29042) SCOTT CECIL: All right, looks like we got David on. David, welcome. Nice to meet you. I don't think we met yet. Nice to meet you. Yeah, cool. And we have some new faces on our side, so maybe we can start with introductions today. The agenda is pretty straightforward today. Really want to dive into RL, the training side of things as well, even show you guys a demo. And on the call I have today, I have Uday. Uday is one of our lead sales engineers who will pe

In [19]:
# Show the first 11 monologues before the sentence-boundary repair, for comparison.
print(" ".join(map(str, whisper_monologues[:11])))

(1230) SCOTT CECIL: Hey Alex, good to see you.
 (2583) ALEX POST POST: Hey, you too. How's it going?
 (4684) SCOTT CECIL: Doing well, doing well. I'm in office today, first time in three months. It's a feeling. It looks like you're at your house?
 (12427) ALEX POST POST: Yep. Very nice. Yep. So, I traveled last week, but we're back now.
 (19401) SCOTT CECIL: Okay, very nice. Yeah, I traveled this morning. Alex, remind me, where are you located again?
 (25248) ALEX POST POST: I am in Grand Rapids, Michigan. Okay, nice. Cool. All right,
 (29766) SCOTT CECIL: looks like we got David on. David, welcome. Nice to meet you. I don't think we met yet. Nice to meet you. Yeah, cool. And we have some new faces on our side, so maybe we can start with introductions today. The agenda is pretty straightforward today. Really want to dive into RL, the training side of things as well, even show you guys a demo. And on the call I have today, I have Uday. Uday is one of our lead sales engineers who will pe

In [74]:
# Show the first 11 monologues of the transcript fetched from the API.
print(" ".join(map(str, transcript_monologues[:11])))

(1230) SCOTT CECIL: Hey, Alex. Good to see you.
 (2620) ALEX POST POST: Hey, you too. How's it going?
 (4740) SCOTT CECIL: Doing well. Doing well. I'm up today first time in three months. So, all right. I'm feeling it looks like you're in your, you're at your house?
 (12130) ALEX POST POST: Yep. Yep. So a, last week, but we're back now.
 (19370) SCOTT CECIL: Okay, very nice. Yeah, I'll travel this morning where Alex remind me where are you located again?
 (25170) ALEX POST POST: I am in green rapids Michigan. Okay. Nice. Cool. All right.
 (29660) SCOTT CECIL: It looks like we got David on David welcome. Nice to meet you. I don't think we met yet. Nice to me too. Cool. We have some new faces on our side. So maybe we can start with introductions today. Agenda is pretty straightforward today. Really wanna dive into Rllib, the training side of things as well. Even show you a demo. And on the call today of a, is one of our lead sales engineers who will perform a demo today on the Anyscale p

In [None]:
# Inspect the first two monologues built from the API transcript.
transcript_monologues[:2]

In [None]:
# Inspect the first ten segments.
segments[:10]