In [2]:
# load data and write out sentence and target
import pandas as pd

loaded_set = pd.read_excel("Dataset/"+"training.xlsx")
loaded_set['Sentence']

0      Weitergehende Sicherungsmaßnahmen können eine ...
1      In der großen Kasernenanlage im Norden Kiels k...
2      Premierminister David Lloyd George honorierte ...
3      Der Beitrag der Truppen dieser Dominions währe...
4      Eine Balance zwischen verschiedenen Lebensbere...
                             ...                        
895    Zwischen 1901 und 2010 ist er um ca 1,7 cm pro...
896    Aus Furcht vor einem Bürgerkrieg wollte sie – ...
897    In den meisten Politikfeldern gilt dafür seit ...
898    Aufgrund der Wärmekapazität des Gesteins, und ...
899    Die Klinge ist zumeist aus nicht rostfreiem Ko...
Name: Sentence, Length: 900, dtype: object

In [3]:

from transformers import AutoModel, AutoTokenizer
# german tokens for bert
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
#model = AutoModel.from_pretrained("dbmdz/bert-base-german-cased")



tokens_num=[]
for sen in loaded_set['Sentence']:
    tokenized = (tokenizer.tokenize(sen)) 
    tokens_num.append( ['[CLS]'] + tokenized + ['[SEP]']) 
    
# get max_seq length    
lens = [len(i) for i in tokens_num]
max_seq_length = max(lens)
max_seq_length = int(1.5*max_seq_length)
#max_seq_length = 256
tokens_num[0]

['[CLS]',
 'Weiter',
 '##gehende',
 'Sicherungs',
 '##maßnahmen',
 'können',
 'eine',
 'Video',
 '##überwachung',
 'und',
 'eine',
 'Zugangs',
 '##kontrolle',
 'durch',
 'einen',
 'Tür',
 '##öff',
 '##ner',
 'sein',
 ',',
 'denn',
 'viele',
 'G',
 '##AA',
 'befinden',
 'sich',
 'in',
 'Vor',
 '##räumen',
 'der',
 'Geschäfts',
 '##stellen',
 'der',
 'Banken',
 ',',
 'sodass',
 'sie',
 'auch',
 'außerhalb',
 'der',
 'Schalter',
 '##öffnungs',
 '##zeiten',
 'zugänglich',
 'sind',
 '.',
 '[SEP]']

In [4]:
tokenizer.convert_tokens_to_ids(tokens_num[0])

[102,
 1784,
 13183,
 28847,
 4686,
 618,
 261,
 4770,
 20815,
 136,
 261,
 21093,
 11600,
 387,
 397,
 2451,
 706,
 432,
 290,
 818,
 1398,
 1358,
 159,
 10695,
 3857,
 251,
 153,
 445,
 7721,
 125,
 2484,
 984,
 125,
 8232,
 818,
 7415,
 307,
 313,
 5729,
 125,
 28802,
 17893,
 4083,
 10370,
 341,
 566,
 103]

In [5]:
def manual_features(x):
    letter_count = []
    avarange_letter_per_word = []
    num_words = []
    num_letters_array = []
    longest_word_length = []
    shortest_word_length = []
    genitiv = []
    akkusativ = []
    dativ = []
    dass = []

    for sen in x:
        current_sen_split = sen.split()
        num_words.append(len(current_sen_split))
        num_letters = []
            
        if "des" in sen:
            genitiv.append(1)
        else:
            genitiv.append(0)

        if "dem" in sen:
            akkusativ.append(1)
        else:
            akkusativ.append(0)

        if "den" in sen:
            dativ.append(1)
        else:
            dativ.append(0)

        if "dass" in sen:
            dass.append(1)
        else:
                dass.append(0)

        for y in range(len(current_sen_split)):
            current_word = current_sen_split[y]
    
            num_letters.append(len(current_word))
    
        current_lettercount = sum(num_letters)
        letter_count.append(current_lettercount) 
        avarange_letter_per_word.append(current_lettercount/len(current_sen_split))
        longest_word_length.append(max(num_letters)) 
        shortest_word_length.append(min(num_letters)) 
    



    feature_dict = {
        'dativ':dativ, 
        'akkusativ': akkusativ, 
        'genitiv': genitiv, 
        'dass': dass,
        'num_words':num_words,
        'letter_count':letter_count,
        'avarange_letter_per_word':avarange_letter_per_word,
        'longest_word_length':longest_word_length,
        'shortest_word_length':shortest_word_length, 
        }

    feature_dataframe = pd.DataFrame(data=feature_dict)
    scaler = StandardScaler()

    feature_dataframe[['num_words', 'longest_word_length', 'shortest_word_length', 'letter_count', 'avarange_letter_per_word']] = scaler.fit_transform(feature_dataframe[['num_words', 'longest_word_length', 'shortest_word_length', 'letter_count', 'avarange_letter_per_word']])

    feature_dataframe[['num_words', 'longest_word_length', 'shortest_word_length', 'letter_count', 'avarange_letter_per_word']] = scaler.transform(feature_dataframe[['num_words', 'longest_word_length', 'shortest_word_length', 'letter_count', 'avarange_letter_per_word']])


    tensorX = tf.constant(feature_dataframe.values)

    return tensorX

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
def encode_names(n, tokenizer):
   tokens = list(tokenizer.tokenize(n))
   tokens.append('[SEP]')
   return tokenizer.convert_tokens_to_ids(tokens)

def bert_encode(string_list, tokenizer, max_seq_length):
    num_examples = len(string_list)
  


    letter_count = []
    avarange_letter_per_word = []
    num_words = []
    num_letters_array = []
    longest_word_length = []
    shortest_word_length = []
    genitiv = []
    akkusativ = []
    dativ = []
    dass = []

    for sen in string_list:
        current_sen_split = sen.split()
        num_words.append(len(current_sen_split))
        num_letters = []
            
        if "des" in sen:
            genitiv.append(1)
        else:
            genitiv.append(0)

        if "dem" in sen:
            akkusativ.append(1)
        else:
            akkusativ.append(0)

        if "den" in sen:
            dativ.append(1)
        else:
            dativ.append(0)

        if "dass" in sen:
            dass.append(1)
        else:
                dass.append(0)

        for y in range(len(current_sen_split)):
            current_word = current_sen_split[y]
    
            num_letters.append(len(current_word))
    
        current_lettercount = sum(num_letters)
        letter_count.append(current_lettercount) 
        avarange_letter_per_word.append(current_lettercount/len(current_sen_split))
        longest_word_length.append(max(num_letters)) 
        shortest_word_length.append(min(num_letters)) 
    



    feature_dict = {
        'num_words':num_words,
        'avarange_letter_per_word':avarange_letter_per_word,
        'longest_word_length':longest_word_length,
        }

    feature_dataframe = pd.DataFrame(data=feature_dict)
    scaler = StandardScaler()

    feature_dataframe[['num_words', 'longest_word_length',  'avarange_letter_per_word']] = scaler.fit_transform(feature_dataframe[['num_words', 'longest_word_length', 'avarange_letter_per_word']])

    feature_dataframe[['num_words', 'longest_word_length', 'avarange_letter_per_word']] = scaler.transform(feature_dataframe[['num_words', 'longest_word_length', 'avarange_letter_per_word']])


    X_train_mF = tf.constant(feature_dataframe.values)

  



    string_tokens = tf.ragged.constant([
      encode_names(n, tokenizer) for n in np.array(string_list)])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*string_tokens.shape[0]
    input_word_ids = tf.concat([cls, string_tokens], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=(None, max_seq_length))

    type_cls = tf.zeros_like(cls)
    type_tokens = tf.ones_like(string_tokens)
    input_type_ids = tf.concat(
      [type_cls, type_tokens], axis=-1).to_tensor(shape=(None, max_seq_length))
    scaler_input_word_ids = scaler.fit_transform(input_type_ids)  

    inputs = {
      #'sc': scaler_input_word_ids,
      #'input_word_ids': input_word_ids,
      'input_word_ids': input_word_ids.to_tensor(shape=(None, max_seq_length)),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids,
      'X_train_mF': X_train_mF
      }

    return inputs

In [7]:
from sklearn.model_selection import train_test_split

x = loaded_set['Sentence']
y = loaded_set['MOS']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=32)
y_train = round(y_train, 2)
y_test = round(y_test, 2)



In [8]:
import tensorflow as tf
X_train = bert_encode(x_train, tokenizer, max_seq_length)
X_test = bert_encode(x_test, tokenizer, max_seq_length)



INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [9]:
import tensorflow_hub as hub
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2",
                            trainable=False)

INFO:absl:Using C:\Users\phili\AppData\Local\Temp\tfhub_modules to cache modules.


In [71]:
embedding_size = 768
max_seq_length = max_seq_length  #length of the tokenised tensor

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")

X_train_mF = tf.keras.layers.Input(shape=(3,), dtype=tf.int32,
                                    name="X_train_mF")

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])   
dropout = tf.keras.layers.Dropout(0.2)(pooled_output)
reshaped_bert = tf.keras.layers.Reshape((6,128))(dropout)

dense_mf_1 = tf.keras.layers.Dense(20)(X_train_mF)
dense_mf_2 = tf.keras.layers.Dense(128)(dense_mf_1)
#dense_mf_3 = tf.keras.layers.Dense(24)(dropout_mf)
dropout_mf = tf.keras.layers.Dropout(0.3)(dense_mf_2)
reshaped_mf = tf.keras.layers.Reshape((1,128))(dense_mf_2)

#concatinated_3 = tf.concat([concatinated_1, concatinated_2 ], 1)
#reshaped_mf = tf.keras.layers.Reshape((1,24))(dense_mf_3)

concatinated = tf.concat([reshaped_bert, reshaped_mf], 1)

gru_1_out = tf.keras.layers.GRU(200, return_sequences=True, activation='relu')(concatinated)
gru_2_out = tf.keras.layers.GRU(100, return_sequences=True, activation='relu')(gru_1_out)

flat = tf.keras.layers.Flatten()(gru_2_out)
dropout_2 = tf.keras.layers.Dropout(0.3)(flat)
dense_2 = tf.keras.layers.Dense(300)(dropout_2)
dense_3 = tf.keras.layers.Dense(100)(dense_2)
dense_4 = tf.keras.layers.Dense(50)(dense_3)

pred = tf.keras.layers.Dense(1)(dense_2)
     



model = tf.keras.Model(
    inputs={
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids,
        'X_train_mF':X_train_mF
        },
        outputs=pred)

In [72]:
model.compile(optimizer= tf.keras.optimizers.Adam(0.001),
              loss= "mean_absolute_error",
              metrics= ["mean_squared_error"])
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 144)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 144)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 144)]        0                                            
__________________________________________________________________________________________________
X_train_mF (InputLayer)         [(None, 3)]          0                                            
___________________________________________________________________________________________

In [73]:
epochs =  50
batch_size = 15

model.fit(X_train, y_train.values, epochs=epochs, batch_size=batch_size)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50

In [65]:
import numpy as np
pred = model.predict(X_test)
rounded_pred = np.around(pred, decimals=2)
rounded_pred

array([[2.78],
       [4.23],
       [2.64],
       [3.45],
       [2.32],
       [3.12],
       [4.66],
       [4.44],
       [3.72],
       [4.27],
       [2.76],
       [4.57],
       [3.96],
       [2.84],
       [3.12],
       [4.35],
       [2.57],
       [3.31],
       [3.79],
       [2.49],
       [4.07],
       [2.41],
       [3.98],
       [2.86],
       [2.62],
       [3.5 ],
       [3.88],
       [3.42],
       [3.  ],
       [2.57],
       [2.61],
       [3.6 ],
       [4.22],
       [4.52],
       [3.42],
       [3.47],
       [3.74],
       [4.83],
       [4.47],
       [4.97],
       [3.31],
       [3.11],
       [3.17],
       [2.59],
       [3.91],
       [4.13],
       [3.47],
       [3.31],
       [3.62],
       [3.91],
       [5.07],
       [4.02],
       [2.89],
       [3.52],
       [3.03],
       [1.72],
       [2.87],
       [3.55],
       [2.36],
       [4.07],
       [2.99],
       [2.75],
       [3.38],
       [4.67],
       [4.56],
       [3.36],
       [3.

In [67]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

rmse(rounded_pred.transpose(), y_test.values)


1.067631593325197