In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras.models import Sequential, Model
from tensorflow.keras.constraints import max_norm



In [2]:
# Load the three pre-processed corpora (short, medium, dank) in one pass;
# the tuple order must match the order of the names on the left-hand side.
df_short, df_medium, df_dank = map(
    pd.read_csv,
    (
        "data/processed/processed_short.csv",
        "data/processed/processed_medium.csv",
        "data/processed/processed_dank.csv",
    ),
)


In [3]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split



# Shared, module-level text vectorizer: glove_100d() adapts it to a corpus and
# train_model() calls it to turn raw strings into integer token sequences.
# Configured with max_tokens=20000 (vocabulary cap) and
# output_sequence_length=200 (fixed output length per sample).
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)



In [82]:
def glove_100d(df, glove_path='data/raw/glove.6B.100d.txt', embedding_dim=100):
    """Adapt the shared ``vectorizer`` to ``df['text']`` and build a GloVe embedding matrix.

    Side effect: the module-level ``vectorizer`` is (re-)adapted to the texts
    in ``df``, so its vocabulary changes for every later caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column with the raw documents.
    glove_path : str, optional
        Path to a GloVe text file with one ``word v1 v2 ...`` entry per line.
        Defaults to the 100-dimensional 6B vectors used by this notebook.
    embedding_dim : int, optional
        Dimensionality of the vectors stored in ``glove_path``; it must match
        the file, otherwise the row assignment below raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocabulary) + 2, embedding_dim)``. Row ``i``
        holds the GloVe vector of the vocabulary token with index ``i``; rows
        for tokens missing from GloVe stay all-zero.
    """
    text = np.asarray(df['text'])
    vectorizer.adapt(text)

    # Map each vocabulary token to the integer index the vectorizer emits for it.
    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))

    # Parse the GloVe file into {word: vector}.
    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    print("Found %s word vectors." % len(embeddings_index))

    # Two rows more than the vocabulary size are allocated; build_model()
    # relies on this shape when it sizes its Embedding layers.
    num_tokens = len(voc) + 2
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Words not found in the embedding index keep their all-zero row.
            # This includes the representation for "padding" and "OOV".
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    return embedding_matrix

In [83]:
import pickle

# Load the previously saved "universal" embedding layer object.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project / a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("universal_embed_layer.p", "rb") as fh:
    universal = pickle.load(fh)


In [113]:
def train_model(df, epochs=25, batch_size=64):
    """Train and evaluate the multi-kernel CNN on one labelled corpus.

    The data is split 67/33 into train+validation/test, and the first part is
    split 67/33 again into train/validation (both with ``random_state=42``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column (raw documents) and a ``'label'``
        column (binary targets for the sigmoid output).
    epochs : int, optional
        Number of training epochs (default 25).
    batch_size : int, optional
        Mini-batch size passed to ``model.fit`` (default 64).

    Returns
    -------
    The fitted Keras model returned by ``build_model()``. Test loss and
    accuracy are printed as a side effect.
    """
    # Split raw text/labels into train, validation and test partitions.
    text = np.asarray(df['text'])
    y = df["label"]
    text_train, text_test, y_train, y_test = train_test_split(
        text, y, test_size=0.33, random_state=42)

    text_train, text_val, y_train, y_val = train_test_split(
        text_train, y_train, test_size=0.33, random_state=42)

    # Build the model BEFORE vectorizing: build_model() calls glove_100d(),
    # which adapts the shared vectorizer. Vectorizing first would use an
    # un-adapted (fresh kernel) or stale vocabulary whose indices need not
    # match the rows of the embedding matrix.
    model = build_model()

    X_train = vectorizer(text_train)
    X_val = vectorizer(text_val)
    X_test = vectorizer(text_test)

    # The model has three input branches (one per kernel size); each one
    # receives the same token sequences.
    model.fit([X_train, X_train, X_train],
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([X_val, X_val, X_val], y_val))

    loss, accuracy = model.evaluate([X_test, X_test, X_test], y_test)

    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    return model



In [85]:
# Stack the three corpora into a single frame with a fresh 0..n-1 row index.
df_joint = pd.concat([df_short, df_medium, df_dank], ignore_index=True)

In [123]:
def build_model():
    """Build and compile a three-branch 1D-CNN text classifier on frozen GloVe embeddings.

    Each branch is ``Embedding -> Conv1D(100 filters, kernel size k) ->
    GlobalMaxPooling1D`` for ``k`` in ``(3, 4, 5)``. The branch outputs are
    summed element-wise and passed through a dense head ending in a single
    sigmoid unit. The model is compiled with binary cross-entropy, Adam and
    an accuracy metric, and its summary is printed.

    Side effect: calls ``glove_100d(df_joint)``, which re-adapts the shared
    ``vectorizer`` to the joint corpus and re-reads the GloVe file.

    Returns
    -------
    A compiled Keras ``Model`` taking a list of three token-id inputs (one per
    branch).
    """
    init_matrix = glove_100d(df_joint)
    # Size the Embedding layers from the matrix itself rather than hard-coding
    # 20002 x 100, so a vocabulary smaller than the vectorizer's cap (or a
    # different GloVe dimension) cannot mismatch the initializer's shape.
    num_tokens, embedding_dim = init_matrix.shape

    submodels = []
    for kw in (3, 4, 5):    # kernel sizes
        submodel = Sequential()
        submodel.add(Embedding(
            num_tokens,
            embedding_dim,
            embeddings_initializer=keras.initializers.Constant(init_matrix),
            trainable=False,
        ))
        submodel.add(Conv1D(100,    # should be 100 feature maps
                            kw,
                            padding='valid',
                            activation='relu',
                            strides=1,
                            # in principle an extra max-norm constraint does no harm here
                            kernel_constraint=max_norm(3.65)))
        submodel.add(GlobalMaxPooling1D())
        submodel._name = str(kw)    # name each branch after its kernel size
        submodels.append(submodel)

    # NOTE(review): the branch outputs are summed element-wise, not
    # concatenated -- confirm this is the intended merge.
    x = add([submodel.output for submodel in submodels])

    # Classification head applied to the merged branch features.
    big_model = Sequential()
    big_model.add(Dropout(0.3))
    big_model.add(Dense(55))
    big_model.add(Dropout(0.6))
    big_model.add(Activation('relu'))
    big_model.add(Dense(1))
    big_model.add(Activation('sigmoid'))

    # Suffix the head's layer names with "_1".
    for layer in big_model.layers:
        layer._name = layer.name + str("_1")
    big_model_output = big_model(x)

    model = Model([submodel.input for submodel in submodels], big_model_output)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints by itself and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()

    return model

In [124]:
# Train and evaluate on the "medium" corpus. The returned model is not bound
# to a name, so it is only shown as the cell's result below.
train_model(df_medium)

Found 400000 word vectors.
Converted 18809 words (1191 misses)
Model: "model_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_70_input (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_71_input (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_72_input (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_70 (Embedding)        (None, None, 100)    2000200     embedding_70_input[0][0]         
____________________________

<tensorflow.python.keras.engine.functional.Functional at 0x2a397913b20>