In [37]:
%pip install transformers


369277.02s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [38]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from time import ctime
import tracemalloc

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [39]:
# Create embeddings 
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [40]:
import nltk.data
import pandas as pd
tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')

In [42]:
def split_paragraph(filename):
    df = pd.read_csv(filename+'.csv')
    col_names = list(df.columns)
    df_new = pd.DataFrame()
    rows = len(df)

    for row in range(rows):
        df_entry = pd.DataFrame()
        transcript_list = []

        transcript = df.iloc[row].transcripts
        transcript_list=tokenizer_nltk.tokenize(transcript)
        n_sentences = len(transcript_list)

        for col in range(len(col_names)):
            if col_names[col] != "transcripts":
                df_entry[col_names[col]] = [df.iloc[row][col_names[col]]]*n_sentences
            else:
                df_entry["transcripts"] = transcript_list

        df_new = pd.concat([df_new,df_entry])

    df_new = df_new.reset_index(drop=True)

    return df_new

def find_max_token (df):
    max_len = 0
    transcripts = df['transcripts'].values

    for transcript in transcripts:
        #transcript = "[SEP]".join(sent_detector.tokenize(transcript.strip())) #for each sentence we need a sentence seperator operator 
        input_ids = tokenizer.encode(transcript,add_special_tokens=True)
        max_len = max(max_len,len(input_ids))
    
    return max_len

df_whisper = split_paragraph("spontaneousDialogueOnly_whisper")
max_len_whisper = find_max_token(df_whisper)
df_google = split_paragraph("spontaneousDialogueOnly_google")
max_len_google = find_max_token(df_google)

display('Max token whisper:',max_len_whisper)
display('Max token google:',max_len_google)

'Max token whisper:'

140

'Max token google:'

84

In [44]:
## Create tokens 
def create_tokens(df,max_token_len):
    print("creating tokens")
    print(ctime())
    input_ids = []
    attention_masks = []

    transcripts = df.transcripts.values

    for transcript in transcripts:
        encoded_dict = tokenizer.encode_plus(
                        transcript,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_token_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    # Convert the lists into tensors.
    df["input_ids"] = input_ids
    df["attention_masks"] = attention_masks   

    return df

def create_embeddings(input_id,mask):
    outputs = model(input_id, mask)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # `token_vecs` is a tensor with shape [430 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the sentence token
    return torch.mean(token_vecs, dim=0) 


def split_paragraph(filename):
    df = pd.read_csv(filename+'.csv')
    col_names = list(df.columns)
    df_new = pd.DataFrame()
    rows = len(df)

    for row in range(rows):
        df_entry = pd.DataFrame()
        transcript_list = []

        transcript = df.iloc[row].transcripts
        transcript_list=tokenizer_nltk.tokenize(transcript)
        n_sentences = len(transcript_list)

        for col in range(len(col_names)):
            if col_names[col] != "transcripts":
                df_entry[col_names[col]] = [df.iloc[row][col_names[col]]]*n_sentences
            else:
                df_entry["transcripts"] = transcript_list

        df_new = pd.concat([df_new,df_entry])

    df_new = df_new.reset_index(drop=True)

    return df_new

def collect_embeddings(filename:str,save_csv:bool,max_token_len:int):
    df = split_paragraph(filename)
    df = create_tokens(df,max_token_len)

    print("creating embeddings")
    print(ctime())
    embeddings = []
    rows = len(df)
    
    tracemalloc.start()
    for row in range(rows):
        print(f"{row} of {rows}")
        input_id = df.iloc[row].input_ids.reshape(1,max_token_len)
        mask = df.iloc[row].attention_masks.reshape(1,max_token_len)
        embeddings.append(create_embeddings(input_id,mask).detach().numpy())
        print("done")
    current, peak = tracemalloc.get_traced_memory()
    print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
    tracemalloc.stop()
    
    df["embeddings"] = embeddings
    if save_csv:
        out_filename = filename+'_bert_sentence_embeddings.csv'
        df.to_csv(out_filename)
    
    return df




In [45]:
df_embeddings_whisper = collect_embeddings("spontaneousDialogueOnly_whisper",False,150)
df_embeddings_whisper.head()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


creating tokens
Mon Jun 19 16:15:33 2023




creating embeddings
Mon Jun 19 16:15:34 2023
0 of 720
done
1 of 720
done
2 of 720
done
3 of 720
done
4 of 720
done
5 of 720
done
6 of 720
done
7 of 720
done
8 of 720
done
9 of 720
done
10 of 720
done
11 of 720
done
12 of 720
done
13 of 720
done
14 of 720
done
15 of 720
done
16 of 720
done
17 of 720
done
18 of 720
done
19 of 720
done
20 of 720
done
21 of 720
done
22 of 720
done
23 of 720
done
24 of 720
done
25 of 720
done
26 of 720
done
27 of 720
done
28 of 720
done
29 of 720
done
30 of 720
done
31 of 720
done
32 of 720
done
33 of 720
done
34 of 720
done
35 of 720
done
36 of 720
done
37 of 720
done
38 of 720
done
39 of 720
done
40 of 720
done
41 of 720
done
42 of 720
done
43 of 720
done
44 of 720
done
45 of 720
done
46 of 720
done
47 of 720
done
48 of 720
done
49 of 720
done
50 of 720
done
51 of 720
done
52 of 720
done
53 of 720
done
54 of 720
done
55 of 720
done
56 of 720
done
57 of 720
done
58 of 720
done
59 of 720
done
60 of 720
done
61 of 720
done
62 of 720
done
63 of 720
done
64 of

Unnamed: 0,id,transcripts,classification,noPersonalQ,personalQ,input_ids,attention_masks,embeddings
0,ID00_hc_0_0_0.wav,"Yeah, in London you can go to Oxford Street, ...",0,0,0,"[[tensor(101), tensor(3398), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.4941955, -0.18511416, 0.46687454, -0.095507..."
1,ID00_hc_0_0_0.wav,So it's a good place to see when you come to L...,0,0,0,"[[tensor(101), tensor(2061), tensor(2009), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.25287876, -0.32395035, 0.529815, -0.0971847..."
2,ID00_hc_0_0_0.wav,That's the Royal Family lives so you can come ...,0,0,0,"[[tensor(101), tensor(2008), tensor(1005), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.18737839, -0.5163338, 0.41265774, 0.4367547..."
3,ID00_hc_0_0_0.wav,"And there's other, there's Big Ben, the houses...",0,0,0,"[[tensor(101), tensor(1998), tensor(2045), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.38002455, -0.02911241, 0.37295663, 0.010036..."
4,ID00_hc_0_0_0.wav,So that's a good place to go and there's the L...,0,0,0,"[[tensor(101), tensor(2061), tensor(2008), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.55974483, -0.28615302, 0.37134624, -0.25334..."


In [46]:
df_embeddings_google = collect_embeddings("spontaneousDialogueOnly_google",True,90)
df_embeddings_google.head()

creating tokens
Mon Jun 19 16:19:17 2023




creating embeddings
Mon Jun 19 16:19:17 2023
0 of 650
done
1 of 650
done
2 of 650
done
3 of 650
done
4 of 650
done
5 of 650
done
6 of 650
done
7 of 650
done
8 of 650
done
9 of 650
done
10 of 650
done
11 of 650
done
12 of 650
done
13 of 650
done
14 of 650
done
15 of 650
done
16 of 650
done
17 of 650
done
18 of 650
done
19 of 650
done
20 of 650
done
21 of 650
done
22 of 650
done
23 of 650
done
24 of 650
done
25 of 650
done
26 of 650
done
27 of 650
done
28 of 650
done
29 of 650
done
30 of 650
done
31 of 650
done
32 of 650
done
33 of 650
done
34 of 650
done
35 of 650
done
36 of 650
done
37 of 650
done
38 of 650
done
39 of 650
done
40 of 650
done
41 of 650
done
42 of 650
done
43 of 650
done
44 of 650
done
45 of 650
done
46 of 650
done
47 of 650
done
48 of 650
done
49 of 650
done
50 of 650
done
51 of 650
done
52 of 650
done
53 of 650
done
54 of 650
done
55 of 650
done
56 of 650
done
57 of 650
done
58 of 650
done
59 of 650
done
60 of 650
done
61 of 650
done
62 of 650
done
63 of 650
done
64 of

Unnamed: 0,id,transcripts,classification,noPersonalQ,personalQ,input_ids,attention_masks,embeddings
0,ID00_hc_0_0_0.flac,"Yeah, I'm in London.",0,0,0,"[[tensor(101), tensor(3398), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.32835236, -0.010593282, 0.32343817, -0.1283..."
1,ID00_hc_0_0_0.flac,"You can go to Oxford Street, which is famous f...",0,0,0,"[[tensor(101), tensor(2017), tensor(2064), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.23485234, -0.93995064, 0.49484923, 0.388636..."
2,ID00_hc_0_0_0.flac,And the Selfridge is there and a lot of touris...,0,0,0,"[[tensor(101), tensor(1998), tensor(1996), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.58746594, -0.122157216, 0.3925981, -0.27175..."
3,ID00_hc_0_0_0.flac,So it's a good place to see me come to London ...,0,0,0,"[[tensor(101), tensor(2061), tensor(2009), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.120439835, -0.10013776, 0.6384494, -0.02844..."
4,ID00_hc_0_0_0.flac,That's the way the royal family lives.,0,0,0,"[[tensor(101), tensor(2008), tensor(1005), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.2735068, -0.24023503, 0.20387809, 0.1197098..."
