In [1]:
import pandas as pd
import create_test_data
import boto3
from unsupervised_topic_segmentation import dataset, core


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# AMI data cleaning

In [6]:
# Filler words/phrases handed to the cleaning helpers so they can be removed
FILLERS = [
    "um",
    "uh",
    "oh",
    "hmm",
    "mm-hmm",
    "uh-uh",
    "you know",
]


In [7]:
# Load the AMI topic annotations from the corpus output directory
topic_path = 'data/ami-and-icsi-corpora-master/ami-corpus/output/topics/'
topic_jsons = create_test_data.jsons_to_dict_list(topic_path)

# One synthetic transcript id per loaded JSON: AMI_0, AMI_1, ...
ami_ids = [f"AMI_{position}" for position in range(len(topic_jsons))]

# Turn every topic JSON into a cleaned DataFrame tagged with its transcript id
topic_dfs = [
    create_test_data.clean_topic_json(topic_json, ami_id, fillers=FILLERS)
    for topic_json, ami_id in zip(topic_jsons, ami_ids)
]

# Sentence-level frame: all cleaned per-transcript frames stacked together
df_ami = pd.concat(topic_dfs)

# Transcript-level frame: one row per transcript, with the per-sentence
# columns collapsed into tuples plus two summary statistics
t_ami = pd.DataFrame(
    {
        'transcript_id': ami_ids,
        'sentences': [tuple(frame['sentences']) for frame in topic_dfs],
        'length': [len(frame['sentences']) for frame in topic_dfs],
        'topic_count': [tuple(frame['topic_count']) for frame in topic_dfs],
        # average number of rows per distinct topic_count value
        'mean_topic_length': [
            frame.groupby(['topic_count']).size().mean() for frame in topic_dfs
        ],
        'topic_desc': [tuple(frame['topic_desc']) for frame in topic_dfs],
        'has_topic_desc': [tuple(frame['has_topic_desc']) for frame in topic_dfs],
    }
)


In [35]:
# Persist the sentence-level AMI frame (df_ami) as Parquet at data/ami.parquet;
# this is the local file the S3 upload cell sends to the bucket.
df_ami.to_parquet('data/ami.parquet')


# Transcript Data Loading

In [23]:
# Load the transcripts through the project helper and keep the result as t_cd.
# The next cell reads its 'transcript_id' and 'sentences' columns.
# NOTE(review): the helper takes no arguments here, so the input location is
# decided inside create_test_data (presumably a pickle file, going by the
# helper's name) — confirm before relying on it.
t_cd = create_test_data.transcript_pickle_to_pd()


In [42]:
# Pull the ids and the per-transcript sentence collections out of t_cd
transcript_ids = list(t_cd['transcript_id'])
sentences = list(t_cd['sentences'])

# Sentence-level frame: build one small frame per transcript (the scalar id is
# broadcast across that transcript's sentences) and stack them
df_transcripts = pd.concat(
    pd.DataFrame({'transcript_id': single_id, 'sentences': single_sentences})
    for single_id, single_sentences in zip(transcript_ids, sentences)
)

# Clean the 'sentences' column with the shared FILLERS list, then keep only
# the id and sentence columns
df_transcripts = dataset.preprocessing(
    df_transcripts,
    'sentences',
    FILLERS,
    min_caption_len=20,
)[["transcript_id", "sentences"]]

In [46]:
# Persist the cleaned sentence-level transcript frame as Parquet.
# NOTE: the file name is plural ("transcripts"); any later step that reads or
# uploads this file has to use exactly this path.
df_transcripts.to_parquet('data/transcripts.parquet')

# Upload to S3 - requires credential access

In [None]:
# Destination S3 bucket for the Parquet files produced by this notebook
bucket_name = 'decoding-democracy-embed'
# NOTE(review): file_key is not used anywhere in this cell; it is kept only in
# case another cell reads it.
file_key = 'ami'

# Create an S3 client (needs valid AWS credentials in the environment)
s3 = boto3.client('s3')

# Upload the Parquet files written by the "save to parquet" cells.
# upload_file(local_path, bucket, object_key)
s3.upload_file('data/ami.parquet', bucket_name, 'ami.parquet')
# The transcripts are saved locally as data/transcripts.parquet (plural). The
# previous local path, data/transcript.parquet, is never written by this
# notebook, so the upload would fail on a missing file. The object key is left
# as 'transcript.parquet' so anything already reading that key keeps working.
s3.upload_file('data/transcripts.parquet', bucket_name, 'transcript.parquet')


# Load Parquet

In [3]:
# Read a single snappy-compressed Parquet part file from the AMI embedding
# results directory into a DataFrame using the project helper.
filename = 'data/ami_embed_results/part-00000-aa8e8005-9cf8-48a2-a561-e03cf4754b85-c000.snappy.parquet'
df = create_test_data.snappy_parquet_to_df(filename)
# Display the frame: one row per transcript, with list-like sentences,
# topic_count, topic_desc, has_topic_desc and embeddings columns.
df

Unnamed: 0,transcript_id,sentences,topic_count,topic_desc,has_topic_desc,embeddings
0,AMI_0,"[ Well , let's start ., Okay . Okay . Not doi...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.0949), tensor(-0.3398), tensor(0.09..."
1,AMI_1,"[This is our third meeting already ., I hope y...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.1004), tensor(-0.0374), tensor(0.03..."
2,AMI_10,"[welcome everyone to our next meeting ., and b...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(-0.0851), tensor(-0.1397), tensor(0.0..."
3,AMI_100,"[ Wouldn't wanna be Project Manager ., , wha...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.1179), tensor(-0.0844), tensor(0.03..."
4,AMI_101,[let's start our second me meeting on concept...,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.0908), tensor(0.0328), tensor(0.088..."
...,...,...,...,...,...,...
131,AMI_95,[And then you have to place your laptop exactl...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[agenda/equipment issues, agenda/equipment iss...","[True, True, True, True, True, True, True, Tru...","[[tensor(-0.0273), tensor(0.0039), tensor(0.03..."
132,AMI_96,"[Sorry I'm a little late ., Got stuck in the t...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[opening, opening, opening, opening, opening, ...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.0498), tensor(-0.1051), tensor(0.16..."
133,AMI_97,"[One , two , three , four ,, Welcome to this s...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[agenda/equipment issues, opening, opening, op...","[True, True, True, True, True, True, True, Tru...","[[tensor(0.1273), tensor(-0.1799), tensor(0.13..."
134,AMI_98,[being as a Marketing Exper Expert I will like...,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[discussion, discussion, discussion, discussio...","[True, True, True, True, True, True, True, Tru...","[[tensor(-0.0048), tensor(0.0419), tensor(0.03..."


In [16]:
# Record, per row, how many embeddings and how many sentences are stored
df['embeddings_len'] = df['embeddings'].apply(len)
df['length'] = df['sentences'].apply(len)

# Sanity check: count the rows whose sentence count differs from their
# embedding count
int(df['length'].ne(df['embeddings_len']).sum())
