In [407]:
import pickle
import pandas as pd
import random
import numpy as np
import itertools

# import data 

In [2]:
# Load the pickled transcript collection from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open files you trust.
with open("transcripts.pickle", mode="rb") as transcript_file:
    transcripts = pickle.load(transcript_file)
        

In [3]:
# Corpus summary: transcript count, total sentences, and total
# whitespace-delimited words, printed one per line.
print(
    f"Number of transcripts: {len(transcripts)}",
    f"Number of sentencess: {sum(len(doc) for doc in transcripts.values())}",
    f"Number of words: {sum(len(line.split()) for doc in transcripts.values() for line in doc)}",
    sep="\n",
)

Number of transcripts: 412
Number of sentencess: 328733
Number of words: 4843941


In [21]:
# Peek at one transcript: take the first key and display its closing sentence.
ex = list(transcripts)[0]
transcripts[ex][-1]
 

'Thank you.'

# clean data

In [422]:
# One row per transcript: its id, its list of sentences, and the sentence count.
t = (
    pd.DataFrame({"transcript_id": list(transcripts)})
    .assign(sentences=lambda frame: frame["transcript_id"].apply(transcripts.__getitem__))
    .assign(length=lambda frame: frame["sentences"].apply(len))
)



# generate segment

In [480]:
def generate_segment(t: pd.DataFrame, doc_count_limit: int = 10, sentence_min: int = 20) -> tuple:
    '''
    Build a synthetic transcript by stitching together contiguous sentence
    runs taken from between 1 and doc_count_limit source transcripts.

    Args:
        t: DataFrame with one row per transcript; must contain a 'sentences'
            column (sequence of sentences) and a 'length' column (its size).
        doc_count_limit: maximum number of docs to pull from
        sentence_min: minimum number of sentences requested per source doc

    Returns:
        tuple of three items:
            results (list): the sentences of the generated segment, in order
            labels (list[int]): for every entry of results, the 0-based
                position of the source document it was pulled from
            doc_count (int): number of source documents actually used

    Raises:
        ValueError: if no transcript has at least
            doc_count_limit * sentence_min sentences (or doc_count_limit < 1).
    '''

    # Filter for sufficiently long texts.
    t_long = t[t['length'] >= (doc_count_limit * sentence_min)]
    if t_long.empty:
        raise ValueError(
            "no transcript has at least doc_count_limit * sentence_min "
            f"= {doc_count_limit * sentence_min} sentences"
        )

    # Number of source documents; capped at the number of eligible rows so
    # that sample() below can never be asked for more rows than exist.
    doc_count = random.randint(1, min(doc_count_limit, len(t_long)))

    # Pull the documents and expose their 0-based position as column 'index'.
    text = t_long.sample(doc_count).reset_index(drop=True).reset_index()

    if doc_count > 1:
        last = doc_count - 1

        # The length of the last document is used as the target total length.
        # NOTE: this makes output length independent of doc_count
        length_sequence = int(text.at[last, 'length'])

        # Distribute sentences between documents by random percentage shares.
        percents = np.random.randint(1, 100, size=doc_count)
        text['sentences_to_use'] = (
            (length_sequence * percents / percents.sum()).astype(int) + sentence_min
        )

        # Shortfall (<= 0) of documents that are shorter than their share;
        # their request is clamped to what they actually contain.
        # NOTE: this lets us incorporate smaller documents, rather than
        # oversampling from larger documents
        text['reallocate'] = (text['length'] - text['sentences_to_use']).clip(upper=0)
        text['sentences_to_use'] = text['sentences_to_use'] + text['reallocate']
        text['reallocate_cum'] = text['reallocate'].cumsum()

        # Random start offset such that the run fits inside each document.
        text['sentences_start'] = [
            random.randint(0, int(size) - int(used))
            for size, used in zip(text['length'], text['sentences_to_use'])
        ]

        # The last document absorbs the accumulated shortfall and always runs
        # to its final sentence. The start is clamped at 0 (a negative start
        # would slice from the wrong end of the list) and the count is
        # recomputed so that start + count lands exactly on the end.
        last_length = int(text.at[last, 'length'])
        wanted = int(text.at[last, 'sentences_to_use']) - int(text.at[last, 'reallocate_cum'])
        last_start = max(0, last_length - wanted)
        text.at[last, 'sentences_start'] = last_start
        text.at[last, 'sentences_to_use'] = last_length - last_start

        # The first document always starts at its first sentence.
        text.at[0, 'sentences_start'] = 0

    else:
        # Single source document: use it in full.
        text['sentences_to_use'] = text['length']
        text['sentences_start'] = 0

    # Collect the sentences and, in lock-step, the label of their source doc.
    # Labels are sized from the slice actually obtained so both lists always
    # have the same length.
    results = []
    labels = []
    for doc_no, sentences, start, count in zip(
        text['index'], text['sentences'], text['sentences_start'], text['sentences_to_use']
    ):
        chunk = sentences[int(start):int(start) + int(count)]
        results.extend(chunk)
        labels.extend([int(doc_no)] * len(chunk))

    return (results, labels, doc_count)
    
# Smoke-run the generator with the default limits and keep the returned tuple.
hold = generate_segment(t, doc_count_limit=10, sentence_min=20)

