# LAB-2 - Paolo Teta & Ralfs Zangis
---
useful:
https://www.sbert.net/examples/training/sts/README.html#training-data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [2]:
# !pip install --upgrade tensorflow_hub
# !pip install --upgrade tensorflow_text

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import functools

import numpy as np

In [3]:
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("filename", StringType(), True),
    StructField("year", StringType(), True),
    StructField("year_id", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("sentence1", StringType(), True),
    StructField("sentence2", StringType(), True)])

train = spark.read.csv("stsbenchmark/sts-train.csv", sep ='\t', header=False, schema=schema)
test = spark.read.csv("stsbenchmark/sts-test.csv", sep ='\t', header=False, schema=schema)
dev = spark.read.csv("stsbenchmark/sts-dev.csv", sep ='\t', header=False, schema=schema)

train.show()
test.show()
dev.show()

+-------------+--------+--------+-------+-----+--------------------+--------------------+
|        genre|filename|    year|year_id|score|           sentence1|           sentence2|
+-------------+--------+--------+-------+-----+--------------------+--------------------+
|main-captions|  MSRvid|2012test|      1|  5.0|A plane is taking...|An air plane is t...|
|main-captions|  MSRvid|2012test|      4|  3.8|A man is playing ...|A man is playing ...|
|main-captions|  MSRvid|2012test|      5|  3.8|A man is spreadin...|A man is spreadin...|
|main-captions|  MSRvid|2012test|      6|  2.6|Three men are pla...|Two men are playi...|
|main-captions|  MSRvid|2012test|      9| 4.25|A man is playing ...|A man seated is p...|
|main-captions|  MSRvid|2012test|     11| 4.25|Some men are figh...|Two men are fight...|
|main-captions|  MSRvid|2012test|     12|  0.5|   A man is smoking.|   A man is skating.|
|main-captions|  MSRvid|2012test|     13|  1.6|The man is playin...|The man is playin...|
|main-capt

## Normalize score column

In [4]:
train = train.withColumn("score", col("score"))
test = test.withColumn("score", col("score"))
dev = dev.withColumn("score", col("score"))

dev.select("score").describe().show()

+-------+------------------+
|summary|             score|
+-------+------------------+
|  count|              1500|
|   mean|2.3639075540602206|
| stddev|1.5004856453975888|
|    min|               0.0|
|    max|               5.0|
+-------+------------------+



In [5]:
text_test = ['this is such an amazing movie!']

tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')

text_preprocessed = preprocessing_layer(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

bert_results = encoder(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.76262885  0.99280983 -0.1861186   0.3667385   0.15233707  0.65504473
  0.9681154  -0.948627    0.00216191 -0.9877732   0.06842708 -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946298  0.3432125   0.3323147  ...  0.2130084   0.710207
  -0.0577119 ]
 [-0.28741992  0.3198101  -0.23018652 ...  0.5845503  -0.21329783
   0.7269202 ]
 [-0.66157013  0.68876815 -0.87432945 ...  0.10877208 -0.2617318
   0.47855297]
 ...
 [-0.22561139 -0.2892564  -0.07064369 ...  0.4756601   0.83277094
   0.40025324]
 [-0.2982423  -0.27473155 -0.05450507 ...  0.48849753  1.0955356
   0.18163389]
 [-0.44378242  0.00930706  0.07223725 ...  0.17290062  1.1833242
   0.0789794 ]]


## Input layer

In [7]:
text_input_u = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_u')
text_input_v = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_v')

## Pre-processing layer

In [8]:
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing_u')
encoder_input_u = preprocessing_layer(text_input_u)
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing_v')
encoder_input_v = preprocessing_layer(text_input_v)

## BERT encoder layer

In [9]:
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder_u')
outputs_u = encoder(encoder_input_u)
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder_v')
outputs_v = encoder(encoder_input_v)

## Pooled output

In [10]:
u = tf.keras.layers.Dense(16, name='u')(outputs_u['pooled_output'])
v = tf.keras.layers.Dense(16, name='v')(outputs_v['pooled_output'])

## Concatenate layer

In [11]:
concat_layer = tf.keras.layers.Concatenate(name='concat')
output_concat = concat_layer([u, v])

## Output layer

In [12]:
final_layer = tf.keras.layers.Dense(1, name='classifier')
final = final_layer(output_concat)

## Model

In [15]:
model = tf.keras.Model(inputs=[text_input_u, text_input_v], outputs=[final])

tf.keras.utils.plot_model(model)
print(model)

model.summary()

#https://www.sbert.net/docs/training/overview.html

<keras.engine.functional.Functional object at 0x0000024630722CD0>
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_u (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 text_v (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 preprocessing_u (KerasLayer)   {'input_mask': (Non  0           ['text_u[0][0]']                 
                                e, 128),                                                          
                                 'input_type_ids':                                                
                          

In [16]:
optimizer = tf.optimizers.Adam(0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

df = test.select("sentence1","sentence2","score").toPandas()

print(df.isna().sum())
df = df.fillna('')

#import numpy as np
#df["score"] = np.asarray(df["score"]).astype(np.float32)

results = model.evaluate([df["sentence1"],df["sentence2"]],df["score"])
print("Results: ", results)

sentence1    0
sentence2    3
score        0
dtype: int64
Results:  [3.4427084922790527, 0.055837564170360565]
