# LAB-2 - Paolo Teta & Ralfs Zangis
---
usefull:
https://www.sbert.net/examples/training/sts/README.html#training-data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [2]:
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("filename", StringType(), True),
    StructField("year", StringType(), True),
    StructField("year_id", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("sentence1", StringType(), True),
    StructField("sentence2", StringType(), True)])

train = spark.read.csv("stsbenchmark/sts-train.csv", sep ='\t', header=False, schema=schema)
test = spark.read.csv("stsbenchmark/sts-test.csv", sep ='\t', header=False, schema=schema)
dev = spark.read.csv("stsbenchmark/sts-dev.csv", sep ='\t', header=False, schema=schema)

train.show()
test.show()
dev.show()

+-------------+--------+--------+-------+-----+--------------------+--------------------+
|        genre|filename|    year|year_id|score|           sentence1|           sentence2|
+-------------+--------+--------+-------+-----+--------------------+--------------------+
|main-captions|  MSRvid|2012test|      1|  5.0|A plane is taking...|An air plane is t...|
|main-captions|  MSRvid|2012test|      4|  3.8|A man is playing ...|A man is playing ...|
|main-captions|  MSRvid|2012test|      5|  3.8|A man is spreadin...|A man is spreadin...|
|main-captions|  MSRvid|2012test|      6|  2.6|Three men are pla...|Two men are playi...|
|main-captions|  MSRvid|2012test|      9| 4.25|A man is playing ...|A man seated is p...|
|main-captions|  MSRvid|2012test|     11| 4.25|Some men are figh...|Two men are fight...|
|main-captions|  MSRvid|2012test|     12|  0.5|   A man is smoking.|   A man is skating.|
|main-captions|  MSRvid|2012test|     13|  1.6|The man is playin...|The man is playin...|
|main-capt

## Normalize score

In [3]:
train = train.withColumn("score", col("score"))
test = test.withColumn("score", col("score"))
dev = dev.withColumn("score", col("score"))

dev.select("score").describe().show()

+-------+------------------+
|summary|             score|
+-------+------------------+
|  count|              1500|
|   mean|2.3639075540602206|
| stddev|1.5004856453975888|
|    min|               0.0|
|    max|               5.0|
+-------+------------------+



In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import functools
import numpy as np

In [5]:
text_test = ['this is such an amazing movie!']

tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')

text_preprocessed = preprocessing_layer(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_type_ids', 'input_mask', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

bert_results = encoder(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.7626287   0.9928098  -0.18611833  0.3667385   0.1523368   0.65504456
  0.9681154  -0.9486271   0.00216172 -0.987773    0.06842685 -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946376  0.3432126   0.3323155  ...  0.21300781  0.7102078
  -0.05771124]
 [-0.2874204   0.31981033 -0.23018603 ...  0.58455133 -0.21329767
   0.7269206 ]
 [-0.66157013  0.68876886 -0.8743305  ...  0.10877283 -0.26173198
   0.47855327]
 ...
 [-0.22561157 -0.28925607 -0.07064402 ...  0.47566003  0.83277047
   0.40025362]
 [-0.29824257 -0.2747315  -0.05450503 ...  0.4884973   1.0955352
   0.18163386]
 [-0.44378242  0.00930739  0.07223728 ...  0.17290103  1.1833243
   0.07897934]]


In [7]:
text_input_u = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_u')
text_input_v = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_v')

preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing_u')
encoder_input_u = preprocessing_layer(text_input_u)
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing_v')
encoder_input_v = preprocessing_layer(text_input_v)

encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder_u')
outputs_u = encoder(encoder_input_u)
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder_v')
outputs_v = encoder(encoder_input_v)

u = tf.keras.layers.Dense(16, name='u')(outputs_u['pooled_output'])
v = tf.keras.layers.Dense(16, name='v')(outputs_v['pooled_output'])

concat_layer = tf.keras.layers.Concatenate(name='concatenate')
output_concat = concat_layer([u, v])

final = tf.keras.layers.Dense(1, name='classifier')
finaly = final(output_concat)

model = tf.keras.Model(inputs=[text_input_u, text_input_v], outputs=[finaly])

tf.keras.utils.plot_model(model)
print(model)

model.summary()

#https://www.sbert.net/docs/training/overview.html

<keras.engine.functional.Functional object at 0x7f7a3d0c9070>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_u (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 text_v (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 preprocessing_u (KerasLayer)   {'input_word_ids':   0           ['text_u[0][0]']                 
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                

In [None]:
optimizer = tf.optimizers.Adam(0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

df = test.select("sentence1","sentence2","score").toPandas()

print(df.isna().sum())
df = df.fillna('')

#import numpy as np
#df["score"] = np.asarray(df["score"]).astype(np.float32)

results = model.evaluate([df["sentence1"],df["sentence2"]],df["score"])
print("Results: ", results)

sentence1    0
sentence2    3
score        0
dtype: int64