In [1]:
# A dependency of the preprocessing for BERT inputs
#!pip install -q -U "tensorflow-text==2.8.*"

In [2]:
#!pip install -q tf-models-official==2.7.0

In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [3]:
#@title Choose a BERT model to fine-tune

bert_model_name = 'small_bert/bert_en_uncased_L-2_H-512_A-8' #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [4]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [5]:
text_test = ['this is such \n an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_results = bert_model(text_preprocessed)
print(bert_results["pooled_output"])

Keys       : ['input_mask', 'input_word_ids', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]
tf.Tensor(
[[-2.59698272e-01 -5.23643434e-01  9.03614283e-01 -5.51340282e-01
  -8.87709260e-01  9.88629103e-01 -3.03575844e-01 -7.45860068e-03
  -9.58391428e-01  6.63808286e-02  2.39761353e-01  6.13346547e-02
  -4.17112745e-02  9.93107080e-01 -5.16154282e-02  9.90459442e-01
   9.52714443e-01  8.48922133e-01 -3.54832113e-01 -1.67718247e-01
   2.86204964e-02 -2.46147841e-01 -9.95749533e-01 -5.12744844e-01
  -2.61097908e-01  9.89429533e-01  3.56763780e-01  7.39698231e-01
  -9.87034678e-01  4.17846799e-01  8.07003528e-02  6.45090222e-01
  -3.34005922e-01  6.02466643e-01 -9.37048197e-01 -3.06564301e-01
  -9.60181415e-01  2.20385909e-01 -8.45490769e-02 -9.94282722e-01
  -3.47029448e-01  3.22469831e-01 -1.73914790e-01 -1.03525810e-01
  -9.04495120e-01  1.586

In [6]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [7]:
import csv
class useBERT:
    def __init__(self, session):
        self.session = session
        self.bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
    def get_sesspaths(self):
        with open(self.session) as pathlist:
            sesspaths = pathlist.read().splitlines()
            print(sesspaths)
        return sesspaths
        
    def bert_sess(self, pathlist):
        #pathlist = './all_session_paths.txt'
        sesspaths = self.get_sesspaths()
        for sess in sesspaths:
            sess = sess.replace('\\', '/')
            sess.replace('\n', '')
            group = sess.split('/')[-1]
            group = group.split('-')[0]
            print("Running ", group)
            #audio = sess + "/" + group + "-audio_PCM"
            asrdir = sess + "/segments_oracle_google-asr"
            bertdir = sess + "/bert"
            bertReps = []

            if not os.path.isdir(bertdir):
                os.mkdir(bertdir)

            seglist = os.listdir(asrdir)
            for segASR in seglist:
                with open(asrdir + '/' + segASR) as asr:
                    segASRtxt = asr.read()#.splitlines()
            # for asr in segASRtxt:
            #     print(asr)
                segASRtxt = segASRtxt.replace("\n", '')
                segASRtxt = segASRtxt.replace(",", '')
                segASRtxt = segASRtxt.replace('.', '')
                segASRtxt = [segASRtxt]
                print(segASRtxt)
                asr_preprocessed = bert_preprocess_model(segASRtxt)
                bert_results = bert_model(asr_preprocessed)
                bertReps.append([segASR.split("_")[-1].split('.')[0],bert_results["pooled_output"]])
                print(bert_results["pooled_output"])
            with open(bertdir + '/' + group + "_oracle-seg_google-asr_bert.csv", "w", newline = '') as f:
                writer = csv.writer(f)
                writer.writerows(bertReps)
        return pathlist
mybert = useBERT('./all_session_paths.txt')
sesspaths = mybert.get_sesspaths()
mybert.bert_sess(sesspaths)
    #print(f'Word Ids   : {sample_text_preprocessed["input_word_ids"][0, :12]}')


['C:\\Users\\Bbykitty\\OneDrive - Colostate\\Research\\Initial Observations for Fib Weights\\Data\\Segment Analysis\\Group_05']
['C:\\Users\\Bbykitty\\OneDrive - Colostate\\Research\\Initial Observations for Fib Weights\\Data\\Segment Analysis\\Group_05']
Running  Group_05
['how great my cameras']
tf.Tensor(
[[ 1.17168665e-01  8.54863346e-01 -7.12035835e-01 -7.20619038e-02
  -8.98049653e-01 -2.59429626e-02 -4.63671118e-01  4.49945666e-02
  -9.97152030e-01 -2.59099722e-01  3.01087469e-01  2.81049192e-01
   3.30718607e-01  8.42198730e-01 -2.56673187e-01  9.90619183e-01
   8.72909248e-01  4.55173552e-01 -6.85322061e-02 -9.07233730e-02
   3.66787873e-02 -9.20740142e-02 -9.82850492e-01  7.94459701e-01
  -3.85434274e-03  9.98136401e-01  2.81561077e-01  9.82588530e-01
  -9.94767487e-01 -9.83799756e-01  4.83014971e-01  9.22739685e-01
  -1.13159604e-01 -3.14641535e-01 -8.93992007e-01 -1.10755622e-01
  -9.75291193e-01  1.63527951e-01  8.44683945e-02 -9.98921633e-01
  -6.08139224e-02 -1.00607313e

['C:\\Users\\Bbykitty\\OneDrive - Colostate\\Research\\Initial Observations for Fib Weights\\Data\\Segment Analysis\\Group_05']

: 

In [12]:
with open(r"D:\Research\Weights_Task\Weights_Task_Audio\Group_07-audio_PCM\asr_segwise\Group_07-audio_PCM_0.txt") as file:
  sample_text = file.readlines()
  
sample_text_preprocessed = bert_preprocess_model(sample_text)
print(f'Word Ids   : {sample_text_preprocessed["input_word_ids"][0, :12]}')

Word Ids   : [  101  1045  2342  2000 10250 12322 11657  2026  8629  2034   102     0]


In [26]:
bert_results = bert_model(sample_text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.9889083   0.66865206 -0.05987267  0.2204324   0.73371845  0.92651063
  0.8043837  -0.99912757 -0.2367327  -0.99491245  0.16186891 -0.9901075 ]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[ 0.32648575 -0.5899734   0.26809376 ... -1.016001    0.6544438
  -0.3896049 ]
 [ 0.00251958  0.28074425  0.12274555 ... -0.7195308   0.12034444
  -0.20153977]
 [ 0.65652174 -0.4991601  -1.4501407  ... -0.3326539   0.03342106
  -0.25389194]
 ...
 [ 0.5591129  -0.5986114  -0.20513763 ... -0.01014173  0.6426295
   0.5044961 ]
 [ 0.57038784 -0.5277952   0.78075784 ... -0.5439537   0.6570256
   0.08065124]
 [-0.04800163 -0.69249517  0.31327167 ... -0.03270943  1.239066
   0.3046651 ]]


each utterance should give probabilities for each possible class

In [None]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
bert_results = bert_model(sample_text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(101, 512)
Pooled Outputs Values:[ 0.9875273   0.28981873  0.00305407  0.4162918   0.22037777  0.8776239
  0.5865048  -0.9997581  -0.210161   -0.9997121   0.24876355 -0.9950486 ]
Sequence Outputs Shape:(101, 128, 512)
Sequence Outputs Values:[[ 8.0405876e-02 -1.9367257e-01  4.1003567e-01 ... -9.9298942e-01
   1.5285434e-01 -2.3450622e-01]
 [ 2.8769374e-01  7.7893382e-01  2.8492278e-01 ... -8.6487424e-01
   1.0535428e-01 -5.0569636e-01]
 [ 7.1708655e-01  2.5725734e-01  6.4034379e-01 ... -9.2086869e-01
  -5.2321923e-01 -3.4298849e-01]
 ...
 [-2.0108055e-01  1.0630233e+00  7.1559560e-01 ...  7.4237958e-03
   7.6636881e-01  2.4056776e-01]
 [ 7.8114495e-02  5.2257454e-01  6.3745415e-01 ... -6.4362621e-01
  -1.5558220e-01  8.1215632e-01]
 [-2.1598308e-01 -4.4722298e-01  1.4205417e-01 ... -9.3096495e-04
   4.1325310e-01  6.7909801e-01]]
