# LAB-2 - Paolo Teta & Ralfs Zangis
---
usefull:
https://www.sbert.net/examples/training/sts/README.html#training-data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [2]:
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("filename", StringType(), True),
    StructField("year", StringType(), True),
    StructField("year_id", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("sentence1", StringType(), True),
    StructField("sentence2", StringType(), True)])

train = spark.read.csv("stsbenchmark/sts-train.csv", sep ='\t', header=False, schema=schema)
test = spark.read.csv("stsbenchmark/sts-test.csv", sep ='\t', header=False, schema=schema)
dev = spark.read.csv("stsbenchmark/sts-dev.csv", sep ='\t', header=False, schema=schema)

train.show()
test.show()
dev.show()

+-------------+--------+--------+-------+-----+--------------------+--------------------+
|        genre|filename|    year|year_id|score|           sentence1|           sentence2|
+-------------+--------+--------+-------+-----+--------------------+--------------------+
|main-captions|  MSRvid|2012test|      1|  5.0|A plane is taking...|An air plane is t...|
|main-captions|  MSRvid|2012test|      4|  3.8|A man is playing ...|A man is playing ...|
|main-captions|  MSRvid|2012test|      5|  3.8|A man is spreadin...|A man is spreadin...|
|main-captions|  MSRvid|2012test|      6|  2.6|Three men are pla...|Two men are playi...|
|main-captions|  MSRvid|2012test|      9| 4.25|A man is playing ...|A man seated is p...|
|main-captions|  MSRvid|2012test|     11| 4.25|Some men are figh...|Two men are fight...|
|main-captions|  MSRvid|2012test|     12|  0.5|   A man is smoking.|   A man is skating.|
|main-captions|  MSRvid|2012test|     13|  1.6|The man is playin...|The man is playin...|
|main-capt

## Normalize score

In [26]:
train = train.withColumn("score", col("score"))
test = test.withColumn("score", col("score"))
dev = dev.withColumn("score", col("score"))

dev.select("score").describe().show()

+-------+--------------------+
|summary|               score|
+-------+--------------------+
|  count|                1500|
|   mean|-1.03096793323211...|
| stddev|7.682486504435653E-6|
|    min|-1.15199999999999...|
|    max|-8.95999999999999...|
+-------+--------------------+



In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import functools
import numpy as np

In [27]:
text_test = ['this is such an amazing movie!']

tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')

text_preprocessed = preprocessing_layer(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_type_ids', 'input_mask', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [46]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

bert_results = encoder(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.7626287   0.9928098  -0.18611833  0.3667385   0.1523368   0.65504456
  0.9681154  -0.9486271   0.00216172 -0.987773    0.06842685 -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946376  0.3432126   0.3323155  ...  0.21300781  0.7102078
  -0.05771124]
 [-0.2874204   0.31981033 -0.23018603 ...  0.58455133 -0.21329767
   0.7269206 ]
 [-0.66157013  0.68876886 -0.8743305  ...  0.10877283 -0.26173198
   0.47855327]
 ...
 [-0.22561157 -0.28925607 -0.07064402 ...  0.47566003  0.83277047
   0.40025362]
 [-0.29824257 -0.2747315  -0.05450503 ...  0.4884973   1.0955352
   0.18163386]
 [-0.44378242  0.00930739  0.07223728 ...  0.17290103  1.1833243
   0.07897934]]


In [57]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_inputs = preprocessing_layer(text_input)
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
#et = tf.keras.layers.GlobalAveragePooling1D()(net)
#et = tf.keras.layers.Dense(1, activation=None, name='classifier')(outputs)
model = tf.keras.Model(outputs, outputs=net)

#model = tf.keras.layers.Concatenate()([get_x()])

tf.keras.utils.plot_model(model)
print(model)

bert_raw_result = model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

model.summary()

#https://www.sbert.net/docs/training/overview.html

ValueError: Found input tensor cannot be reached given provided output tensors. Please make sure the tensor KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='text'), name='text', description="created by layer 'text'") is included in the model inputs when building functional model.

In [8]:
from transformers import BertTokenizer, TFBertModel

def siamese_bert():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFBertModel.from_pretrained('bert-base-uncased')
    text = "Replace me by any text you'd like."
    encoded_input = tokenizer(text, return_tensors='tf')
    
    embedding1 = model(encoded_input)[0]
    embedding2 = model(encoded_input)[0]
    print(embedding1)
    print(model(encoded_input))
    
    x1 = tf.keras.layers.GlobalAveragePooling1D()(embedding1)
    x2 = tf.keras.layers.GlobalAveragePooling1D()(embedding2)
    
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(10, activation='softmax')(x)

    model = tf.keras.layers.Model(inputs=[id1, mask1, atn1, id2, mask2, atn2], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)
    
    return model

siamese_bert()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


tf.Tensor(
[[[ 0.13862708  0.15826827 -0.29666483 ... -0.27084935 -0.28436315
    0.45808426]
  [ 0.53636384 -0.2326964   0.17542008 ...  0.5540252   0.49807143
   -0.0024078 ]
  [ 0.30023718 -0.34751156  0.12084446 ... -0.45624873  0.32880202
    0.87728184]
  ...
  [ 0.37985942  0.12028778  0.828293   ... -0.8623725  -0.5956968
    0.04711557]
  [-0.02524214 -0.7176749  -0.6950481  ...  0.0757422  -0.6667816
   -0.34007484]
  [ 0.7535385   0.23910874  0.07174447 ...  0.24671492 -0.6458058
   -0.32129776]]], shape=(1, 12, 768), dtype=float32)
TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 12, 768), dtype=float32, numpy=
array([[[ 0.13862708,  0.15826827, -0.29666483, ..., -0.27084935,
         -0.28436315,  0.45808426],
        [ 0.53636384, -0.2326964 ,  0.17542008, ...,  0.5540252 ,
          0.49807143, -0.0024078 ],
        [ 0.30023718, -0.34751156,  0.12084446, ..., -0.45624873,
          0.32880202,  0.87728184],
        ...,
        [ 0.

AttributeError: module 'keras.api._v2.keras.layers' has no attribute 'Model'

In [None]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

print(output)