In [2]:
%pip install transformers


20.08s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from time import ctime

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Create embeddings 
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
## Create tokens 
def create_tokens(filename:str):
    print("creating tokens")
    print(ctime())
    input_ids = []
    attention_masks = []
    csv_file = filename+'.csv'
    df = pd.read_csv(csv_file)
    transcripts = df.transcripts.values

    for transcript in transcripts:
        encoded_dict = tokenizer.encode_plus(
                        transcript,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 430,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    # Convert the lists into tensors.
    df["input_ids"] = input_ids
    df["attention_masks"] = attention_masks   

    return df

def create_embeddings(input_id,mask):
    outputs = model(input_id, mask)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # `token_vecs` is a tensor with shape [430 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the sentence token
    return torch.mean(token_vecs, dim=0) 

def collect_embeddings(filename:str,save_csv:bool):

    df = create_tokens(filename)

    print("creating embeddings")
    print(ctime())
    embeddings = []
    rows = len(df)
    for row in range(rows):
        print(f"{row} of {rows}")
        input_id = df.iloc[row].input_ids.reshape(1,430)
        mask = df.iloc[row].attention_masks.reshape(1,430)
        embeddings.append(create_embeddings(input_id,mask).detach().numpy())
        print("done")
    print(embeddings)
    df["embeddings"] = embeddings
    if save_csv:
        out_filename = filename+'_bert_embeddings.csv'
        df.to_csv(out_filename)
    
    return df

df_embeddings_whisper = collect_embeddings("spontaneousDialougeOnly_whisper",True)


creating tokens
Wed Jun 14 21:44:25 2023




creating embeddings
Wed Jun 14 21:44:26 2023
0 of 57
done
1 of 57
done
2 of 57
done
3 of 57
done
4 of 57
done
5 of 57
done
6 of 57
done
7 of 57
done
8 of 57
done
9 of 57
done
10 of 57
done
11 of 57
done
12 of 57
done
13 of 57
done
14 of 57
done
15 of 57
done
16 of 57
done
17 of 57
done
18 of 57
done
19 of 57
done
20 of 57
done
21 of 57
done
22 of 57
done
23 of 57
done
24 of 57
done
25 of 57
done
26 of 57
done
27 of 57
done
28 of 57
done
29 of 57
done
30 of 57
done
31 of 57
done
32 of 57
done
33 of 57
done
34 of 57
done
35 of 57
done
36 of 57
done
37 of 57
done
38 of 57
done
39 of 57
done
40 of 57
done
41 of 57
done
42 of 57
done
43 of 57
done
44 of 57
done
45 of 57
done
46 of 57
done
47 of 57
done
48 of 57
done
49 of 57
done
50 of 57
done
51 of 57
done
52 of 57
done
53 of 57
done
54 of 57
done
55 of 57
done
56 of 57
done
[array([ 7.40067810e-02, -4.08791363e-01,  4.05946165e-01, -5.56644388e-02,
        8.64366740e-02, -3.60120893e-01,  1.84547957e-02,  1.01755142e+00,
       -1.484556

In [7]:
df_embeddings_whisper.head()

Unnamed: 0,id,transcripts,classification,noPersonalQ,personalQ,input_ids,attention_masks,embeddings
0,ID00_hc_0_0_0.wav,"Yeah, in London you can go to Oxford Street, ...",0,0,0,"[[tensor(101), tensor(3398), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.07400678, -0.40879136, 0.40594617, -0.05566..."
1,ID01_hc_0_0_0.wav,Okay. Okay. Yes. Okay. So you want to know wh...,0,0,0,"[[tensor(101), tensor(3100), tensor(1012), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.13023636, -0.5625998, 0.4663836, -0.0846741..."
2,ID02_pd_2_0_0.wav,"So this is your first time in London, you've ...",1,0,0,"[[tensor(101), tensor(2061), tensor(2023), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.00093209464, -0.27498776, 0.28973624, -0.1..."
3,ID03_hc_0_0_0_noPersonalQ.wav,"Okay, so I'm much very long than you're here....",0,1,0,"[[tensor(101), tensor(3100), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.09951415, -0.39558145, 0.8188276, -0.263874..."
4,ID03_hc_0_0_0.wav,"Okay, so I'm much very long than you're here....",0,0,1,"[[tensor(101), tensor(3100), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.053875744, -0.3047915, 0.7414313, -0.202025..."


In [8]:
df_embeddings_wav2vec = collect_embeddings("spontaneousDialougeOnly_wav2vec",True)
df_embeddings_wav2vec.head()

creating tokens
Wed Jun 14 21:51:43 2023




creating embeddings
Wed Jun 14 21:51:44 2023
0 of 57
done
1 of 57
done
2 of 57
done
3 of 57
done
4 of 57
done
5 of 57
done
6 of 57
done
7 of 57
done
8 of 57
done
9 of 57
done
10 of 57
done
11 of 57
done
12 of 57
done
13 of 57
done
14 of 57
done
15 of 57
done
16 of 57
done
17 of 57
done
18 of 57
done
19 of 57
done
20 of 57
done
21 of 57
done
22 of 57
done
23 of 57
done
24 of 57
done
25 of 57
done
26 of 57
done
27 of 57
done
28 of 57
done
29 of 57
done
30 of 57
done
31 of 57
done
32 of 57
done
33 of 57
done
34 of 57
done
35 of 57
done
36 of 57
done
37 of 57
done
38 of 57
done
39 of 57
done
40 of 57
done
41 of 57
done
42 of 57
done
43 of 57
done
44 of 57
done
45 of 57
done
46 of 57
done
47 of 57
done
48 of 57
done
49 of 57
done
50 of 57
done
51 of 57
done
52 of 57
done
53 of 57
done
54 of 57
done
55 of 57
done
56 of 57
done
[array([-1.76623672e-01, -8.22509266e-03,  6.40769362e-01, -2.62627602e-01,
       -1.00536637e-01, -3.91643584e-01,  3.13549697e-01,  4.62017268e-01,
        2.748162

Unnamed: 0,id,transcripts,classification,noPersonalQ,personalQ,input_ids,attention_masks,embeddings
0,ID00_hc_0_0_0.wav,Ye m in london ygoi to oxford streets which i...,0,0,0,"[[tensor(101), tensor(6300), tensor(1049), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.17662367, -0.008225093, 0.64076936, -0.262..."
1,ID01_hc_0_0_0.wav,O k k yes o k am am so you want you want to kn...,0,0,0,"[[tensor(101), tensor(1051), tensor(1047), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.40270266, 0.045286078, 0.857207, -0.373066..."
2,ID02_pd_2_0_0.wav,So this is your your first time in london you'...,1,0,0,"[[tensor(101), tensor(2061), tensor(2023), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.3046138, 0.037615173, 0.39422023, -0.38926..."
3,ID03_hc_0_0_0_noPersonalQ.wav,E i so i am not very londoner here i camp her...,0,1,0,"[[tensor(101), tensor(1041), tensor(1045), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.19965193, -0.20022212, 0.645765, -0.282061..."
4,ID03_hc_0_0_0.wav,I iso i am not very londoner here i came here ...,0,0,1,"[[tensor(101), tensor(1045), tensor(11163), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.24024455, -0.1889159, 0.81587195, -0.39491..."


In [9]:
df_embeddings_google = collect_embeddings("spontaneousDialogueOnly_google",True)
df_embeddings_google.head()

creating tokens
Wed Jun 14 21:52:16 2023




creating embeddings
Wed Jun 14 21:52:17 2023
0 of 57
done
1 of 57
done
2 of 57
done
3 of 57
done
4 of 57
done
5 of 57
done
6 of 57
done
7 of 57
done
8 of 57
done
9 of 57
done
10 of 57
done
11 of 57
done
12 of 57
done
13 of 57
done
14 of 57
done
15 of 57
done
16 of 57
done
17 of 57
done
18 of 57
done
19 of 57
done
20 of 57
done
21 of 57
done
22 of 57
done
23 of 57
done
24 of 57
done
25 of 57
done
26 of 57
done
27 of 57
done
28 of 57
done
29 of 57
done
30 of 57
done
31 of 57
done
32 of 57
done
33 of 57
done
34 of 57
done
35 of 57
done
36 of 57
done
37 of 57
done
38 of 57
done
39 of 57
done
40 of 57
done
41 of 57
done
42 of 57
done
43 of 57
done
44 of 57
done
45 of 57
done
46 of 57
done
47 of 57
done
48 of 57
done
49 of 57
done
50 of 57
done
51 of 57
done
52 of 57
done
53 of 57
done
54 of 57
done
55 of 57
done
56 of 57
done
[array([ 8.30302760e-02, -2.12607518e-01,  5.46353042e-01, -1.37734458e-01,
        1.40883535e-01, -3.42835635e-01,  1.90243032e-02,  9.43496704e-01,
       -2.137524

Unnamed: 0,id,transcripts,classification,noPersonalQ,personalQ,input_ids,attention_masks,embeddings
0,ID00_hc_0_0_0.flac,"Yeah, I'm in London. You can go to Oxford Stre...",0,0,0,"[[tensor(101), tensor(3398), tensor(1010), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.083030276, -0.21260752, 0.54635304, -0.1377..."
1,ID01_hc_0_0_0.flac,"Okay. Okay. Yes. Okay, so you want you want t...",0,0,0,"[[tensor(101), tensor(3100), tensor(1012), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.07697948, -0.41965404, 0.55615896, 0.061804..."
2,ID02_pd_2_0_0.flac,"So this is your your first time in London, you...",1,0,0,"[[tensor(101), tensor(2061), tensor(2023), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.037038386, -0.2024736, 0.45683667, -0.0652..."
3,ID03_hc_0_0_0_noPersonalQ.flac,Okay. So I'm a Londoner fewer. I came here for...,0,1,0,"[[tensor(101), tensor(3100), tensor(1012), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.1356038, -0.57575417, 0.6849178, -0.0581971..."
4,ID03_hc_0_0_0.flac,Okay. So I'm a Londoner fewer. I came here for...,0,0,1,"[[tensor(101), tensor(3100), tensor(1012), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.032855537, -0.6075742, 0.65772057, -0.02078..."
