In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras.models import Sequential, Model



In [2]:
df_short = pd.read_csv("data/processed/processed_short.csv")
df_medium = pd.read_csv("data/processed/processed_medium.csv")
df_dank = pd.read_csv("data/processed/processed_dank.csv")


In [3]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split



vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)



In [14]:
def create_embedding_layer(df):
    name =[x for x in globals() if globals()[x] is df][0]
    #text = df['text'].to_numpy()
    text = np.asarray(df['text'])
    y = df['label']
    vectorizer.adapt(text)

    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))
    print("data/" + str(name) + ".wordvectors")
    wv = KeyedVectors.load("data/" + str(name) + ".wordvectors", mmap='r')
    embeddings_index= {}
    for word in voc:
        if wv.has_index_for(word):
            embeddings_index[word] = wv[word]
            
        else:
            embeddings_index[word] = np.zeros(100)
    
    num_tokens = len(voc) + 2
    embedding_dim = 100
    hits = 0
    misses = 0
    
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,)

    return embedding_layer


In [5]:
def build_model(df):
    submodels = []
    for kw in (3, 4, 5):    # kernel sizes
        submodel = Sequential()
        submodel.add(create_embedding_layer(df))
        submodel.add(Conv1D(100,    #should be 100 maps
                            kw,
                            padding='valid',
                            activation='relu',
                            strides=1)) #elvileg egy maxnorm nem árt még bele
        submodel.add(GlobalMaxPooling1D())
        submodels.append(submodel)

    submodel1 = submodels[0]
    submodel2 = submodels[1]
    submodel3 = submodels[2]

    x = add([submodel1.output, submodel2.output, submodel3.output])
    
    big_model = Sequential()
    big_model.add(Dense(100))
    big_model.add(Dropout(0.5))
    big_model.add(Activation('relu'))
    big_model.add(Dense(1))
    big_model.add(Activation('sigmoid'))

    big_model_output = big_model(x)

    model = Model([submodel1.input, submodel2.input, submodel3.input], big_model_output)

    model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])  

    print(model.summary())  

    return model

In [22]:
def train_model(df):
    name =[x for x in globals() if globals()[x] is df][0]
    #creating the desired vectors
    text = np.asarray(df['text'])
    y = df["label"]
    text_train, text_test, y_train, y_test = train_test_split(
    text, y, test_size=0.33, random_state=42)

    text_train, text_val, y_train, y_val = train_test_split(
    text_train, y_train, test_size=0.33, random_state=42)

    X_train = vectorizer(text_train)
    X_val = vectorizer(text_val)
    X_test = vectorizer(text_test)

    model = build_model(df)

    model.fit([X_train, X_train, X_train],
                     y_train,
                     batch_size=64,
                     epochs=10,
                     validation_data=([X_val, X_val, X_val],
                     y_val))
    
    evaluation =  model.evaluate([X_test, X_test, X_test], y_test)

    print("Loss: ", loss)
    print("Accuracy: ", accuracy)
        
    return model



In [18]:
train_model(df_dank)

data/df_dank.wordvectors
Converted 12428 words (0 misses)
data/df_dank.wordvectors
Converted 12428 words (0 misses)
data/df_dank.wordvectors
Converted 12428 words (0 misses)
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_4_input (InputLayer)  [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_5_input (InputLayer)  [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_6_input (InputLayer)  [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 

<tensorflow.python.keras.engine.functional.Functional at 0x1e5b2e3d610>