In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from collections import deque
from itertools import cycle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, namedtuple

In [2]:
# Load the annotated articles (one row per article, label columns plus `text`).
df = pd.read_csv("data_raw.csv")
# Repair a misspelled topic label. A single .loc assignment is used instead of
# chained indexing (df.tema[mask] = ...), which may write to a temporary copy
# and leave the frame unchanged.
df.loc[df["tema"] == "sociálni politika", "tema"] = "sociální politika"
df.head()

Unnamed: 0,argumentace,demonizace,emoce,fabulace,lokace,nalepkovani,nazor,odbornik,relativizace,rusko,strach,tema,vina,vyzneni_celku,zamereni,zanr,zdroj,text
0,ne,ne,missing,ne,Rusko,ne,ne,ne,ne,neutrální,ne,jiné,ne,neutrální,zahraniční,zpravodajství,ne,Záhad je tady více než v Bermudském trojúhelní...
1,ne,ne,missing,ne,Rusko,ano,ne,ne,ne,neutrální,ano,zbrojní politika,ano,pozitivní,zahraniční,zpravodajství,ano,Putin potvrdil novou zbraň: nepřemožitelná jad...
2,ano,ne,missing,ano,EU,ano,ano,ano,ne,missing,ano,ekonomika / finance,ano,negativní,obojí,komentář,ne,Markéta Šichtařová: Slušný člověk nekrade (A n...
3,ano,ne,missing,ne,Rusko + USA,ano,ne,ne,ne,neutrální,ne,konflikt v Sýrii,ne,neutrální,zahraniční,zpravodajství,ano,Rusko varovalo USA před využitím obvinění prot...
4,ne,ne,missing,ne,jiná země,ne,ano,ne,ne,missing,ne,konflikt v Sýrii,ne,neutrální,zahraniční,zpravodajství,ano,Izrael v jižní Sýrii přímo vyzbrojuje nejméně ...


In [3]:
# Lightweight (inputs, targets) pair; `data` starts out with both slots empty.
InOut = namedtuple('InOut', 'x y')
data = InOut(x=None, y=None)

# Targets

In [4]:
# Label vocabulary of every annotated column. The position of a value inside
# its list is the integer class id that value is encoded to.
_YES_NO = ('ne', 'ano')
_TONE = ('neutrální', 'negativní', 'missing', 'pozitivní', 'velebící', 'nenávistné')

feature_key = {
    'zanr': ['zpravodajství', 'rozhovor', 'komentář'],
    'tema': [
        'migrační krize',
        'domácí politika',
        'zahraniční politika / diplomacie',
        'společnost / společenská situace',
        'jiné',
        'energetika',
        'sociální politika',
        'konflikt na Ukrajině',
        'kultura',
        'konflikt v Sýrii',
        'zbrojní politika',
        'ekonomika / finance',
        'konspirace',
    ],
    'zamereni': ['zahraniční', 'domácí', 'obojí', 'nelze určit'],
    'lokace': [
        'EU',
        'Česká republika',
        'USA',
        'jiná země',
        'jiné / nelze určit',
        'Rusko',
        'NATO',
        'Rusko + USA',
    ],
    'argumentace': list(_YES_NO),
    'emoce': ['missing', 'rozhořčení', 'soucit', 'strach', 'nenávist', 'jiná'],
    'vyzneni_celku': ['neutrální', 'negativní', 'pozitivní'],
    'rusko': [
        'missing',
        'pozitivní příklad',
        'neutrální',
        'oběť',
        'negativní příklad',
        'hrdina',
    ],
    # Each entry gets its own list object, so mutating one never affects another.
    'vyzneni1': list(_TONE),
    'vyzneni2': list(_TONE),
    'vyzneni3': list(_TONE),
    'obrazek': list(_YES_NO),
    'video': list(_YES_NO),
    'nazor': list(_YES_NO),
    'odbornik': list(_YES_NO),
    'zdroj': list(_YES_NO),
    'strach': list(_YES_NO),
    'vina': list(_YES_NO),
    'nalepkovani': list(_YES_NO),
    'demonizace': list(_YES_NO),
    'relativizace': list(_YES_NO),
    'fabulace': list(_YES_NO),
    'year': ['2016', '2017', '2018'],
}

In [5]:
def _encode_labels(values, labels, column_name):
    """Map each label in ``values`` to its position in ``labels``.

    Args:
        values: pandas Series holding the raw labels of one column.
        labels: list of allowed labels; a label's index is its class id.
        column_name: name of the column, used only in the error message.

    Returns:
        An int64 Series with the same index as ``values``.

    Raises:
        ValueError: if ``values`` contains anything that is not listed in
            ``labels`` (missing values included).
    """
    codes = {}
    for position, label in enumerate(labels):
        # First occurrence wins, mirroring list.index().
        codes.setdefault(label, position)
    encoded = values.map(codes)
    unknown = values[encoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column {column_name!r} contains labels missing from feature_key: {list(unknown)}"
        )
    return encoded.astype("int64")


# Integer-encode every label column (everything except the raw text). A dict
# comprehension is used so that no loop variable leaks into the notebook
# namespace, where get_model() later reads a global called `column`.
ydf = pd.DataFrame({
    name: _encode_labels(df[name], feature_key[name], name)
    for name in df.columns
    if name != 'text'
})
display(ydf.head())
# One 1-D target array per label column, in ydf column order.
y = [ydf[col].to_numpy() for col in ydf]
print(len(y), "x", len(y[0]))

Unnamed: 0,argumentace,demonizace,emoce,fabulace,lokace,nalepkovani,nazor,odbornik,relativizace,rusko,strach,tema,vina,vyzneni_celku,zamereni,zanr,zdroj
0,0,0,0,0,5,0,0,0,0,2,0,4,0,0,0,0,0
1,0,0,0,0,5,1,0,0,0,2,1,10,1,2,0,0,1
2,1,0,0,1,0,1,1,1,0,0,1,11,1,1,2,2,0
3,1,0,0,0,7,1,0,0,0,2,0,9,0,0,0,0,1
4,0,0,0,0,3,0,1,0,0,0,0,9,0,0,0,0,1


17 x 8642


# Inputs

In [6]:
# Raw article texts, one string per row of `df`.
x_raw = df["text"]

In [7]:
import sentencepiece as spm

# Fixed number of token ids per text: the encoded articles are padded or
# truncated to this length when the input matrix `x` is built.
max_seq_len = 256

class SPMEmbedder:
    """Thin wrapper around a SentencePiece processor loaded from a model file."""

    def __init__(self, path):
        """Create a processor and load the SentencePiece model found at ``path``."""
        processor = spm.SentencePieceProcessor()
        processor.Load(path)
        self.sp = processor

    def encode(self, text):
        """Return the ids the processor produces for ``text``."""
        token_ids = self.sp.EncodeAsIds(text)
        return token_ids

    def decode(self, indexes):
        """Turn an iterable of ids back into text.

        Every id is converted to a built-in ``int`` before it is handed to the
        processor (presumably so that numpy integers are accepted as well).
        """
        plain_ids = [int(index) for index in indexes]
        return self.sp.DecodeIds(plain_ids)

    def decode_pieces(self, indexes):
        """Return the piece of every id, joined by single spaces."""
        pieces = [self.sp.id_to_piece(int(index)) for index in indexes]
        return " ".join(pieces)

In [8]:
# Shared SentencePiece embedder used by the helpers and cells below.
emb = SPMEmbedder("../albert_ckpt/tenten_smp_5_30K.model")

def test_encoding_fcn():
    """Print an encode -> pad -> decode round trip for one sample sentence.

    Relies on the module-level ``emb`` embedder; nothing is returned, the
    intermediate results are only printed for visual inspection.
    """
    sentence = "pes a kočka hráli neznáméslovo na zahradě"
    print(f"Sentence: {sentence}")

    token_ids = emb.encode(sentence)
    print(f"Encoded:  {np.array(token_ids)}")

    # Pad (or cut) at the end so the single row holds exactly 12 ids.
    padded_row = pad_sequences([token_ids], 12, padding="post", truncating="post")[0]
    print(f"Padded:   {padded_row}")

    round_trip = emb.decode(padded_row)
    print(f"Decoded:  {round_trip}")

    pieces = emb.decode_pieces(padded_row)
    print(f"Decoded_: {pieces}")

    

def embedData(data, embedder, padding=50):
    """Encode ``data.x`` into fixed-length id rows and stack both parts.

    Args:
        data: object with ``x`` (something offering ``.map``, e.g. a pandas
            Series of texts) and ``y`` attributes.
        embedder: object with an ``encode(text)`` method returning ids.
        padding: target length; each encoded text is padded or truncated at
            its end to this many ids.

    Returns:
        ``InOut`` holding the stacked encoded inputs and the stacked ``data.y``.
    """
    def to_fixed_length(sentence):
        token_ids = embedder.encode(sentence)
        batch = pad_sequences([token_ids], padding,
                              padding="post", truncating="post")
        return batch[0]

    inputs = np.stack(data.x.map(to_fixed_length))
    targets = np.stack(data.y)
    return InOut(inputs, targets)

def lemmatize(text):
    """Normalise raw article text before it is tokenised.

    Despite the name no lemmatisation happens here. The text is lower-cased,
    every whitespace character is turned into a plain space, every character
    outside digits, ``a-z``, the listed accented letters, ``. , ! ? %`` and
    space is removed, and each run of digits is replaced by ``" # "``.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Newlines, tabs and non-breaking spaces must become ordinary spaces first:
    # the character filter below would otherwise delete them and glue the
    # neighbouring words together ("konec\nzačátek" -> "koneczačátek").
    text = re.sub(r"\s", " ", text)
    text = re.sub("[^0-9a-zóěščřžýáíďéťňůú\\.,\\!\\?% ]", "", text)
    text = re.sub("[0-9]+", " # ", text)
    return text

# Smoke-test the tokenizer: prints one sentence encoded, padded and decoded.
test_encoding_fcn()

Sentence: pes a kočka hráli neznáméslovo na zahradě
Encoded:  [ 2353     5 10792  3462 12381  1700  1855     6  4968]
Padded:   [ 2353     5 10792  3462 12381  1700  1855     6  4968     0     0     0]
Decoded:  pes a kočka hráli neznáméslovo na zahradě ⁇  ⁇  ⁇ 
Decoded_: ▁pes ▁a ▁kočka ▁hráli ▁neznámé sl ovo ▁na ▁zahradě <unk> <unk> <unk>


In [75]:
def embed(sentence):
    """Encode one text with ``emb`` and pad/truncate its end to ``max_seq_len`` ids."""
    token_ids = emb.encode(sentence)
    return pad_sequences([token_ids], max_seq_len,
                         padding="post", truncating="post")[0]


# Normalise every raw article, encode it and stack the fixed-length rows.
x = np.stack([embed(lemmatize(text)) for text in x_raw])

# Model


In [92]:
import tensorflow as tf
from tensorflow import keras as keras
from tensorflow import keras
import bert
from pprint import pprint

# Label column to train on. get_model() and the fit call read this global: a
# falsy value (e.g. None) keeps every column instead of a single one.
column = "nalepkovani"
if column:
    # Position of the selected column inside `ydf`, used to pick its targets.
    column_index = list(ydf.columns).index(column)
    print(column_index)
    
def get_albert_layer(path, max_seq_len, name, trainable=True):
    """Create a ``BertModelLayer`` configured from the ALBERT parameters at ``path``.

    Args:
        path: passed to ``bert.albert_params`` to obtain the layer parameters.
        max_seq_len: accepted for signature compatibility; not used here.
        name: name given to the created layer.
        trainable: forwarded to the layer constructor.

    Returns:
        The layer built by ``bert.BertModelLayer.from_params``.
    """
    # NOTE(review): only the parameters are read here and no explicit
    # weight-loading call is visible in this notebook — confirm whether the
    # pretrained checkpoint is expected to be restored somewhere.
    return bert.BertModelLayer.from_params(
        bert.albert_params(path),
        name=name,
        shared_layer=True,
        trainable=trainable,
    )
    
def get_model(path, max_seq_len, feature_key, columns, name):
    """Build and compile a classifier with one output head per label column.

    The token ids go through the ALBERT layer, the vector at sequence
    position 0 is kept, passed through two 128-unit tanh layers and then fed
    to one head per column: a single sigmoid unit for two-class columns, a
    softmax over all classes otherwise.

    Reads the notebook-level global ``column``: when it is truthy only the
    head for that column is created, otherwise every entry of ``columns``
    gets a head.

    Args:
        path: forwarded to ``get_albert_layer``.
        max_seq_len: length of the input id sequences.
        feature_key: dict mapping a column name to its list of class labels.
        columns: iterable of column names to create heads for.
        name: name of the resulting Keras model.

    Returns:
        The compiled ``keras.Model``.
    """
    l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    l_albert = get_albert_layer(path, max_seq_len, "l_albert")

    l_middle = l_albert(l_input_ids)
    # Keep only the output vector at the first sequence position.
    l_middle = keras.layers.Lambda(lambda seq: seq[:, 0, :])(l_middle)

    l_middle = keras.layers.Dense(units=128, activation="tanh", name="dense_1")(l_middle)
    l_middle = keras.layers.Dense(units=128, activation="tanh", name="dense_2")(l_middle)

    outputs = []
    losses = {}
    metrics = {}

    for col in columns:
        if column and col != column:
            continue
        num_classes = len(feature_key[col])
        if num_classes == 2:
            output = keras.layers.Dense(1, activation='sigmoid', name=col)(l_middle)
            losses[col] = "binary_crossentropy"
            # A 1-unit sigmoid head needs a thresholding metric: the sparse
            # categorical accuracy takes an argmax over the single unit, which
            # is always 0, and would merely report the share of class 0.
            metrics[col] = [keras.metrics.BinaryAccuracy(name="acc")]
        else:
            output = keras.layers.Dense(num_classes, activation='softmax', name=col)(l_middle)
            losses[col] = "sparse_categorical_crossentropy"
            metrics[col] = [keras.metrics.SparseCategoricalAccuracy(name="acc")]

        outputs.append(output)

    model = keras.Model(inputs=l_input_ids, outputs=outputs, name=name)

    opt = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.999, amsgrad=True)
    # Losses and metrics are keyed by output-layer name so that every head
    # gets the pair matching its activation.
    model.compile(optimizer=opt,
                  loss=losses,
                  metrics=metrics
    )

    return model
    
# Build the model with the same sequence length that was used to pad `x`
# (max_seq_len) instead of repeating the literal, so the two cannot drift apart.
model = get_model("../csbase1", max_seq_len, feature_key, ydf.columns, "propaganda_csbase1")
model.summary()

5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'defaults'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'defaults'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid value for "node": expected "ast.AST", got "<class 'NoneType'>"; to visit lists of nodes, use "visit_block" instead
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid value for "node": expected "ast.AST", got "<class 'NoneType'>"; to visit lists of nodes, use "visit_block

In [93]:
import datetime

# One TensorBoard run directory per launch, named month-day_hour-minute-second.
# "%S" (seconds) is used; the lower-case "%s" is a platform-specific code
# (epoch seconds on glibc, unsupported elsewhere).
log_dir = "log/" + datetime.datetime.now().strftime("%m-%d_%H-%M-%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, update_freq=5, profile_batch=0)

# Train on the single selected column when `column` is set, otherwise on every
# label column; 20 % of the samples are held out for validation.
hist = model.fit(
    x=x,
    y=y[column_index] if column else y,
    batch_size=16,
    epochs=20,
    validation_split=0.2,
    callbacks=[tensorboard_callback],
)

Train on 6913 samples, validate on 1729 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

KeyboardInterrupt: 

In [None]:
from IPython.display import HTML, SVG, Image
# import base64

def show_model(model, width="100%", path="model.png"):
    """Render the layer graph of ``model`` to a PNG file and display it inline.

    Args:
        model: Keras model to draw.
        width: kept for backward compatibility; currently unused.
        path: file the PNG is written to (and then displayed from).
    """
    graph = keras.utils.model_to_dot(model, show_shapes=True, show_layer_names=True)
    graph.set_splines("ortho")
    graph.set_size("10")
    graph.set_rankdir("LR")  # lay the graph out left-to-right
    graph.write_png(path)
    display(Image(path))

# Draw the architecture of the model built above.
show_model(model)

