In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping

import requests
import io

In [2]:
# Root of the raw files in the thesis repository; every input below lives under it.
REPO_RAW_URL = "https://raw.githubusercontent.com/bocsardigergely/bachelor-thesis/main"

# The three processed datasets.
df_short = pd.read_csv(f"{REPO_RAW_URL}/data/processed/processed_short.csv")
df_medium = pd.read_csv(f"{REPO_RAW_URL}/data/processed/processed_medium.csv")
df_dank = pd.read_csv(f"{REPO_RAW_URL}/data/processed/processed_dank.csv")

# Matrix later used to initialise the Embedding layer. A timeout keeps a stalled
# connection from hanging the kernel, and raise_for_status() turns an HTTP error
# into a clear exception instead of letting np.load choke on an error page.
response = requests.get(f"{REPO_RAW_URL}/universal.npy", timeout=60)
response.raise_for_status()
init_matrix = np.load(io.BytesIO(response.content))

In [3]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split

# Shared text vectorization layer: vocabulary capped at 20000 tokens and
# every output sequence fixed to a length of 200.
vectorizer = TextVectorization(
    output_sequence_length=200,
    max_tokens=20000,
)

In [4]:
# Pool the three datasets into one frame with a fresh 0..n-1 index.
df_joint = pd.concat([df_short, df_medium, df_dank], ignore_index=True)

# Shuffle with a fixed seed so the notebook gives the same df_train on every
# "Restart & Run All" (the train/test splits below are already seeded with 42).
df_train = df_joint.sample(frac=1, random_state=42).reset_index(drop=True)

# Build the vectorizer's vocabulary from the text of the pooled corpus.
vectorizer.adapt(np.asarray(df_train["text"]))

In [15]:
def train_model(df, epochs=25, batch_size=32, patience=5):
    """Train a fresh model from ``build_model`` on ``df`` and report test metrics.

    33% of the rows are held out as a test set, and 33% of the remainder is
    used as a validation set (both splits use ``random_state=42``). The text
    is converted to integer sequences with the notebook-level ``vectorizer``.
    Training stops early once ``val_loss`` has not improved for ``patience``
    epochs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` column and a ``"label"`` column. Labels are
        presumably binary (0/1), since the model is compiled with
        ``binary_crossentropy`` -- verify against the data.
    epochs : int, default 25
        Maximum number of training epochs.
    batch_size : int, default 32
        Mini-batch size used by ``fit``.
    patience : int, default 5
        Early-stopping patience, in epochs.

    Returns
    -------
    The fitted model created by ``build_model``. The test loss and accuracy
    are printed as a side effect.
    """
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)

    # Raw inputs and targets.
    text = np.asarray(df['text'])
    y = df["label"]

    # Hold out the test set first, then carve a validation set from the rest.
    text_train, text_test, y_train, y_test = train_test_split(
        text, y, test_size=0.33, random_state=42)
    text_train, text_val, y_train, y_val = train_test_split(
        text_train, y_train, test_size=0.33, random_state=42)

    X_train = vectorizer(text_train)
    X_val = vectorizer(text_val)
    X_test = vectorizer(text_test)

    model = build_model()

    # build_model returns a single-input Sequential model, so it receives one
    # input tensor (not a list of copies); callbacks are passed as a list.
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_val, y_val),
              callbacks=[es])
    loss, accuracy = model.evaluate(X_test, y_test)

    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    return model

In [28]:
def build_model(embedding_matrix=None):
    """Build and compile the bidirectional-LSTM binary classifier.

    Architecture: frozen Embedding (20002 x 100) -> BiLSTM(100, sequences)
    -> Dropout -> BiLSTM(1) -> Dropout -> Dense(50, relu) -> Dropout
    -> Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and the
    accuracy metric. The model summary is printed as a side effect.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray, optional
        Initial (non-trainable) embedding weights. Defaults to the
        notebook-level ``init_matrix`` loaded from ``universal.npy``, so the
        file is not downloaded again on every call. Presumably of shape
        (20002, 100) to match the Embedding layer -- verify.

    Returns
    -------
    The compiled Sequential model.
    """
    if embedding_matrix is None:
        # Same data the original fetched from the repository on each call.
        embedding_matrix = init_matrix

    model = Sequential()
    model.add(Embedding(
        20002,
        100,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False
    ))
    model.add(Bidirectional(LSTM(100, return_sequences=True)))
    model.add(Dropout(0.4143619965361732))
    model.add(Bidirectional(LSTM(1, return_sequences=False)))
    model.add(Dropout(0.09225974322037533))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.20942239619394942))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line to the output.
    model.summary()
    return model


In [29]:
# Keep a handle on the trained model instead of leaving it only in Out[...],
# so later cells can reuse it after a fresh "Restart & Run All".
model_medium = train_model(df_medium)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, None, 200)         160800    
_________________________________________________________________
dropout_18 (Dropout)         (None, None, 200)         0         
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 10)                8240      
_________________________________________________________________
dropout_19 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 10)                110       
_________________________________________________________________
dropout_20 (Dropout)         (None, 10)               

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f6428a27b50>