In [1]:
import json
import os

In [39]:
def get_account_data(record):
    """Collect the CRM fields attached to a call record.

    Walks ``record["context"]`` (when that key is present), keeps only the
    entries whose ``system`` is ``"Salesforce"``, and concatenates the
    ``fields`` lists of every object whose ``objectType`` is ``"Account"`` or
    ``"Opportunity"``.

    Args:
        record: Call dictionary, optionally carrying a ``"context"`` list.

    Returns:
        A flat list of the matching field entries, in encounter order. The list
        is empty when there is no ``"context"`` key or nothing matches.
    """
    if "context" not in record:
        return []

    collected = []
    for context_entry in record["context"]:
        if context_entry["system"] != "Salesforce":
            continue
        for crm_object in context_entry["objects"]:
            if crm_object["objectType"] in ("Account", "Opportunity"):
                collected.extend(crm_object["fields"])
    return collected

In [88]:
def _merge_speaker_turns(transcript):
    """Merge consecutive transcript segments spoken by the same speaker.

    Segments whose ``sentences`` list is empty are skipped entirely: they have
    no text and no timestamps to contribute, and they do not interrupt a run of
    segments by the same speaker.

    Returns:
        A list of turn dicts with keys ``speakerId``, ``speakerName``,
        ``sentences`` (list of sentence texts), ``start``, ``end`` and
        ``topics`` (every truthy ``topic`` of the merged segments, in order).
    """
    turns = []
    current = None
    for segment in transcript:
        sentences = segment['sentences']
        if not sentences:
            # Indexing [0] / [-1] below would raise IndexError on an empty segment.
            continue

        texts = [sent['text'] for sent in sentences]
        if current is not None and segment['speakerId'] == current['speakerId']:
            # Same speaker keeps talking: extend the open turn.
            current['sentences'] += texts
            current['end'] = sentences[-1]['end']
        else:
            current = {
                "speakerId": segment['speakerId'],
                "speakerName": segment['speakerName'],
                "sentences": texts,
                "start": sentences[0]['start'],
                "end": sentences[-1]['end'],
                "topics": [],
            }
            turns.append(current)

        if segment['topic']:
            current['topics'].append(segment['topic'])
    return turns


def _close_timing_gaps(turns):
    """Make adjacent turns contiguous in time (modifies ``turns`` in place).

    The end of each turn is moved halfway towards the original start of the
    following turn, and the following turn then starts exactly at that point.
    """
    for earlier, later in zip(turns, turns[1:]):
        earlier['end'] += (later['start'] - earlier['end']) / 2
        later['start'] = earlier['end']


def get_new_call(call):
    """Convert a raw call record into the merged-by-speaker output format.

    Args:
        call: Raw call dict. Reads ``call['transcript']`` (segments with
            ``speakerId``, ``speakerName``, ``topic`` and ``sentences``, each
            sentence having ``text``, ``start`` and ``end``),
            ``call['parties']`` (``speakerId``, ``name``, ``affiliation``) and
            ``call['metaData']`` (``id``, ``title``, ``duration``).

    Returns:
        A dict with three keys:

        * ``metadata``: ``id``, ``title`` and ``duration`` copied from
          ``call['metaData']``.
        * ``company_info``: the result of ``get_account_data(call)``.
        * ``segments``: one entry per merged speaker turn, with the speaker's
          name and affiliation taken from ``call['parties']``, the joined text,
          the collected topics, and ``start`` / ``end`` truncated to ``int``.
          Turns whose speaker is not listed in ``call['parties']`` are dropped.

    Raises:
        KeyError: If a required key is missing from the call, a party, a
            segment or a sentence.
    """
    turns = _merge_speaker_turns(call['transcript'])
    _close_timing_gaps(turns)

    # Parties with a falsy speakerId cannot be matched to any transcript segment.
    parties = {party['speakerId']: party for party in call['parties'] if party['speakerId']}

    output_segments = [
        {
            "speakerId": turn['speakerId'],
            "speakerName": parties[turn['speakerId']]['name'],
            "speakerAffiliation": parties[turn['speakerId']]['affiliation'],
            "topics": turn['topics'],
            "text": " ".join(turn['sentences']),
            "start": int(turn['start']),
            "end": int(turn['end']),
        }
        for turn in turns
        if turn['speakerId'] in parties
    ]

    output_metadata = {
        "id": call['metaData']['id'],
        "title": call['metaData']['title'],
        "duration": call['metaData']['duration'],
    }

    return {
        "metadata": output_metadata,
        "company_info": get_account_data(call),
        "segments": output_segments,
    }

In [89]:
# Load the raw call export (a JSON list with one entry per call).
# The encoding is given explicitly so the result does not depend on the
# machine's locale; JSON text is UTF-8.
with open("/home/ubuntu/speech_to_text/data/intro.json", "r", encoding="utf-8") as f:
    calls = json.load(f)

len(calls)

428

In [90]:
# Convert every raw call into the merged-by-speaker output format.
new_calls = [get_new_call(call) for call in calls]

In [91]:
# Total number of output segments across all converted calls.
sum(len(converted['segments']) for converted in new_calls)

37204

In [92]:
# Spot-check the first converted call.
new_calls[0]

{'metadata': {'id': '956393886453945479',
  'title': 'Saras | Hightouch <> Introductions',
  'duration': 1419},
 'company_info': [{'name': 'Industry',
   'value': 'Internet Software & Services'},
  {'name': 'Website', 'value': 'sarasanalytics.com'},
  {'name': 'NumberOfEmployees', 'value': 150.0},
  {'name': 'Name', 'value': 'Saras Analytics'},
  {'name': 'Account_Level__c', 'value': 'COMM'}],
 'segments': [{'speakerId': '1223525977315950398',
   'speakerName': 'Ashraf',
   'speakerAffiliation': 'External',
   'topics': ['Small Talk'],
   'text': 'Done.',
   'start': 240,
   'end': 1540},
  {'speakerId': '8370954879572051754',
   'speakerName': 'John Alderman',
   'speakerAffiliation': 'Internal',
   'topics': ['Small Talk'],
   'text': 'Hey, good evening, Ashraf. How you doing?',
   'start': 1540,
   'end': 3820},
  {'speakerId': '1223525977315950398',
   'speakerName': 'Ashraf',
   'speakerAffiliation': 'External',
   'topics': ['Small Talk'],
   'text': 'Yeah. All good, John morning

In [93]:
# Persist all converted calls as a single JSON file.
preprocessed_path = "/home/ubuntu/speech_to_text/data/intro_calls_preprocessed_v2.json"
with open(preprocessed_path, mode="w") as out_file:
    json.dump(new_calls, fp=out_file)

In [94]:
# Write each converted call to its own JSON file, named after the call id.
call_dir = "/home/ubuntu/speech_to_text/data/intro_calls_original"
# Create the output directory up front; open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError on a fresh machine.
os.makedirs(call_dir, exist_ok=True)
for call in new_calls:
    with open(f"{call_dir}/{call['metadata']['id']}.json", "w") as f:
        json.dump(call, f)

In [95]:
# Durations of the converted calls, excluding the first 39.
# NOTE(review): the notebook does not say why the first 39 calls are left out — confirm.
durations = [call['metadata']['duration'] for call in new_calls[39:]]

In [98]:
# Arithmetic mean of the selected call durations.
sum(durations) / len(durations)

1662.7300771208227

In [99]:
# Number of calls that went into the mean above.
len(durations)

389

### Run the cells below after the calls have been retranscribed

In [6]:
# Reload the preprocessed calls written earlier in this notebook.
preprocessed_path = "/home/ubuntu/speech_to_text/data/intro_calls_preprocessed_v2.json"
with open(preprocessed_path) as in_file:
    intro_calls = json.load(in_file)

len(intro_calls)

428

In [11]:
# Collect the retranscribed version of every preprocessed call.
# A call is skipped (and reported) when its retranscribed file is missing,
# when that file has no segments, or when its id is listed in bad_call_ids.
bad_call_ids = ["6781923412519057638"]
new_calls = []
for call in intro_calls:
    call_id = call['metadata']['id']
    new_call_filename = f"/home/ubuntu/speech_to_text/data/intro_calls_retranscribed_v2/{call_id}.json"
    try:
        # Open directly instead of checking for existence first: this avoids the
        # check-then-open race and the dependency on the `os` module.
        with open(new_call_filename, "r") as f:
            new_call = json.load(f)
    except FileNotFoundError:
        # No retranscription on disk: report the missing file name.
        print(new_call_filename.rsplit("/", 1)[-1])
        continue

    if len(new_call['segments']) > 0 and call_id not in bad_call_ids:
        new_calls.append(new_call)
    else:
        # Empty or listed as bad: report the id stored in the retranscribed file.
        print(new_call['metadata']['id'])

1765591959477928931
6781923412519057638
1088054306404649799.json
7635408987259473649.json


In [12]:
# Persist the retranscribed calls that were kept, then report how many there are.
retranscribed_path = "/home/ubuntu/speech_to_text/data/intro_calls_retranscribed_v2.json"
with open(retranscribed_path, mode="w") as out_file:
    json.dump(new_calls, fp=out_file)

len(new_calls)

424