In [1]:
import re
import statistics

import gensim
import keras
import numpy as np
import pandas as pd
from keras import layers
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import model_selection

import load_vectors_300

In [2]:
# NOTE(review): presumably the number of rows in the training CSV — confirm.
total_tweets = 4887

# Humor-mechanism label -> integer class id; ids start at 1 and follow the
# order in which the labels are listed here.
mechanism_id = {
    label: number
    for number, label in enumerate(
        (
            'absurd',
            'analogy',
            'embarrassment',
            'exaggeration',
            'insults',
            'irony',
            'misunderstanding',
            'parody',
            'reference',
            'stereotype',
            'unmasking',
            'wordplay',
        ),
        start=1,
    )
}

# Activation names iterated over by the model-building cells.
model_functions = ['elu', 'relu', 'selu', 'sigmoid', 'tanh']
# Candidate layer widths and epoch budget (not referenced by any cell visible here).
layer_units = [15, 30, 50, 75, 100]
epoch = 150

# Substrings that sanitize_tweet replaces with a single space.
unwanted_chars = ['!', ',', '"', '-', '...', '–', 'XD', 'xD', '¿', '?', '—', '\n', "#", '¡', ':', "“", '.', '(', ')']
# Raw string: "\(" is not a valid escape sequence, so spell the backslash explicitly.
unwanted_chars.extend(["¬¬", r"\('.')/", "*", '»', '\x97', '\x85'])
# Replacement happens in list order, so multi-character patterns must come
# before the single characters they contain; otherwise the emoticon above is
# broken apart by '.', '(' and ')' before it can ever match. dict.fromkeys
# drops duplicates while keeping order, and sorted() is stable within a length.
unwanted_chars = sorted(dict.fromkeys(unwanted_chars), key=len, reverse=True)

In [3]:
# Empty containers for the corpus: `tweets` and `mechanisms` are filled by the
# cell that iterates over the training frame; `words_per_tweets` is not used
# by any cell visible here.
tweets, mechanisms, words_per_tweets = [], [], []

In [4]:
# Read the training set and discard the "id" and "target" columns in one
# expression; later cells only read the "text" and "mechanism" columns.
train_data = (
    pd.read_csv("sources/haha_mechanism_target_train.csv")
    .drop(columns=["id", "target"])
)

In [5]:
def sanitize_tweet(tweet, chars=None):
    """Turn a raw tweet into a list of cleaned tokens.

    Args:
        tweet: Raw tweet text.
        chars: Iterable of substrings to replace with a space. Defaults to the
            notebook-level ``unwanted_chars`` list.

    Returns:
        List of non-empty tokens, split on any whitespace.
    """
    if chars is None:
        chars = unwanted_chars
    # Strip @mentions before anything else: entries such as 'xD' would
    # otherwise cut a handle in two and leave part of the user name behind.
    tweet = re.sub(r'@\w*', '', tweet)
    for char in chars:
        tweet = tweet.replace(char, ' ')
    # Detach the currency sign from the amount ("$5" -> "$ 5").
    tweet = re.sub(r'\$', '$ ', tweet)
    # split() with no argument also handles tabs / carriage returns and never
    # yields empty tokens.
    return tweet.split()

In [6]:
# Rebuild both lists from the frame instead of appending to lists created in
# another cell, so re-running this cell cannot duplicate the corpus.
# Labels missing from mechanism_id map to None (same as the previous .get()).
mechanisms = [mechanism_id.get(label) for label in train_data["mechanism"]]
tweets = [sanitize_tweet(text) for text in train_data["text"]]

In [7]:
def get_vector_word(word_embedding, word):
    """Look up ``word`` in ``word_embedding``, trying several casings.

    The spellings are tried in this order: as given, capitalized, lower-case,
    upper-case. The value stored under the first spelling found is returned;
    ``None`` is returned when none of them is present.
    """
    # None means "use the word unchanged"; the others are str method names,
    # applied lazily so later variants are only built when earlier ones miss.
    for casing in (None, "capitalize", "lower", "upper"):
        candidate = word if casing is None else getattr(word, casing)()
        if candidate in word_embedding:
            return word_embedding[candidate]
    return None

def generate_embedding_matrix(word_embedding):
    """Build an index-aligned embedding matrix and the tokenizer that indexes it.

    Args:
        word_embedding: Keyed word vectors exposing the vocabulary as
            ``key_to_index`` (gensim >= 4) or ``vocab`` (older gensim), and
            supporting ``word in word_embedding`` / ``word_embedding[word]``.

    Returns:
        ``(embedding_matrix, tokenizer)``. Row ``i`` of the matrix holds the
        vector of the word the tokenizer maps to index ``i``. Row 0 (never
        assigned by the tokenizer) and row 1 (the ``<UNK>`` token) stay zero.
    """
    # gensim 4 replaced `.vocab` with `.key_to_index`; accept either.
    vocabulary = getattr(word_embedding, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word_embedding.vocab
    word_embedding_words = list(vocabulary.keys())
    word_embedding_word_counter = len(word_embedding_words)

    # Index 0 is unused and index 1 is the OOV token, so the words occupy
    # indices 2..n+1. num_words must therefore be n + 2: the tokenizer maps
    # every index >= num_words to OOV, which previously hid the last two words.
    matrix_rows = word_embedding_word_counter + 2

    tokenizer = Tokenizer(num_words=matrix_rows, filters='',
                          lower=False, split=' ', char_level=False, oov_token='<UNK>')
    tokenizer.fit_on_texts(word_embedding_words)
    embeddings_word_index = tokenizer.word_index

    # Take the dimensionality from the vectors; 300 is only a fallback.
    vector_size = getattr(word_embedding, "vector_size", 300)
    embedding_matrix = np.zeros((matrix_rows, vector_size))
    for word, index in embeddings_word_index.items():
        # Skip the OOV slot and any index that would fall outside the matrix.
        if index == 1 or index >= matrix_rows:
            continue
        embedding_vector = get_vector_word(word_embedding, word)
        # A failed lookup returns None, which numpy would store as a NaN row.
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix, tokenizer


In [8]:
# Load the pretrained model and keep only its `.wv` attribute, then derive the
# index-aligned matrix and the tokenizer from it.
word_embedding = load_vectors_300.load("emb39-word2vec").wv
word_embedding_matrix, tokenizer = generate_embedding_matrix(word_embedding)

In [9]:
# Token count of every tweet.
num_words_list = [len(tokens) for tokens in tweets]
# median_high always returns one of the data points, hence an int. Plain
# median() averages the two middle values on an even-sized corpus and can
# produce x.5, which is not a valid `maxlen` for pad_sequences.
median_words_tweet = statistics.median_high(num_words_list)
print(median_words_tweet)

18


In [10]:
# Map every token to its vocabulary index, then pad/truncate each tweet to the
# median tweet length so all sequences share one shape.
# (pad_sequences is imported with the other imports in the first cell.)
indexes_list = tokenizer.texts_to_sequences(tweets)
indexes_list = pad_sequences(indexes_list, maxlen=median_words_tweet)

In [11]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    indexes_list,
    mechanisms,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Stop once val_loss has not improved for 10 epochs and roll the model back to
# its best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=0,
    mode="auto",
    restore_best_weights=True,
)
my_callbacks = [early_stopping]

In [None]:
print(word_embedding_matrix)

# The Embedding layer must be told the real (padded) sequence length; the
# tweets are padded to a fixed width, not to a single token.
sequence_length = indexes_list.shape[1]

# Grid over (activation, recurrent activation) pairs for a GRU -> GRU stack.
# NOTE(review): each model is only built here and overwritten on the next
# iteration; compile/fit presumably still has to be added inside the loop.
for activation_function in model_functions:
    for recurrent_function in model_functions:
        gru_gru_model = keras.Sequential(
            [
                # Frozen pretrained vectors; index 0 is treated as padding.
                layers.Embedding(input_dim=word_embedding_matrix.shape[0],
                                 output_dim=word_embedding_matrix.shape[1],
                                 input_length=sequence_length,
                                 weights=[word_embedding_matrix],
                                 trainable=False, mask_zero=True),
                # recurrent_function was iterated but never used; it now
                # drives the recurrent (gate) activation.
                layers.GRU(units=1, dropout=0.1, recurrent_dropout=0.2,
                           activation=activation_function,
                           recurrent_activation=recurrent_function,
                           return_sequences=True),
                layers.GRU(units=1, dropout=0.1, recurrent_dropout=0.3,
                           activation=activation_function,
                           recurrent_activation=recurrent_function),
                # NOTE(review): the output layer reuses the hidden activation;
                # a 12-class classifier would normally use softmax — confirm.
                layers.Dense(12, activation=activation_function),
            ]
        )

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.04379441 -0.04954839 -0.06157952 ... -0.0709765   0.06606881
   0.02376981]
 ...
 [ 0.05253253 -0.01243497 -0.0963236  ... -0.05829482  0.04032185
   0.00046087]
 [ 0.03058515 -0.07465189  0.00142497 ... -0.06569502  0.01332951
  -0.04353482]
 [ 0.02943384  0.03231565  0.04918109 ... -0.13639031 -0.00750405
   0.03460548]]


In [None]:
# Grid over (activation, recurrent activation, architecture) for a GRU -> LSTM stack.
# NOTE(review): DROPOUT, RECURRENT_DROPOUT, KERNEL_INITIALIZER1, TARGETS and
# recursive_layers_architectures are not defined in any cell visible here, and
# recursive_layers_architecture is not used in the loop body — confirm they are
# defined before running this cell.

# Real (padded) sequence length fed to the Embedding layer.
sequence_length = indexes_list.shape[1]

for activation_function in model_functions:
    for recurrent_function in model_functions:
        for recursive_layers_architecture in recursive_layers_architectures:
            gru_lstm_model = keras.Sequential(
                [
                    # Fixed: `embedding_matix` / `embedding_matrix` do not exist at
                    # notebook level (the matrix is word_embedding_matrix), and
                    # `weights=[]` loaded no pretrained vectors into a frozen layer.
                    layers.Embedding(input_dim=word_embedding_matrix.shape[0],
                                     output_dim=word_embedding_matrix.shape[1],
                                     input_length=sequence_length,
                                     weights=[word_embedding_matrix],
                                     trainable=False, mask_zero=True),
                    # recurrent_function was iterated but never used; it now
                    # drives the recurrent (gate) activation.
                    layers.GRU(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                               kernel_initializer=KERNEL_INITIALIZER1,
                               activation=activation_function,
                               recurrent_activation=recurrent_function,
                               return_sequences=True),
                    layers.LSTM(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                                kernel_initializer=KERNEL_INITIALIZER1,
                                activation=activation_function,
                                recurrent_activation=recurrent_function),
                    layers.Dense(TARGETS, activation=activation_function),
                ]
            )

In [None]:
# Grid over (activation, recurrent activation, architecture) for an LSTM -> GRU stack.
# NOTE(review): DROPOUT, RECURRENT_DROPOUT, KERNEL_INITIALIZER1, TARGETS and
# recursive_layers_architectures are not defined in any cell visible here, and
# recursive_layers_architecture is not used in the loop body — confirm they are
# defined before running this cell.

# Real (padded) sequence length fed to the Embedding layer.
sequence_length = indexes_list.shape[1]

for activation_function in model_functions:
    for recurrent_function in model_functions:
        for recursive_layers_architecture in recursive_layers_architectures:
            lstm_gru_model = keras.Sequential(
                [
                    # Fixed: `embedding_matix` / `embedding_matrix` do not exist at
                    # notebook level (the matrix is word_embedding_matrix), and
                    # `weights=[]` loaded no pretrained vectors into a frozen layer.
                    layers.Embedding(input_dim=word_embedding_matrix.shape[0],
                                     output_dim=word_embedding_matrix.shape[1],
                                     input_length=sequence_length,
                                     weights=[word_embedding_matrix],
                                     trainable=False, mask_zero=True),
                    # recurrent_function was iterated but never used; it now
                    # drives the recurrent (gate) activation.
                    layers.LSTM(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                                kernel_initializer=KERNEL_INITIALIZER1,
                                activation=activation_function,
                                recurrent_activation=recurrent_function,
                                return_sequences=True),
                    layers.GRU(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                               kernel_initializer=KERNEL_INITIALIZER1,
                               activation=activation_function,
                               recurrent_activation=recurrent_function),
                    layers.Dense(TARGETS, activation=activation_function),
                ]
            )

In [None]:
# Grid over (activation, recurrent activation, architecture) for an LSTM -> LSTM stack.
# NOTE(review): DROPOUT, RECURRENT_DROPOUT, KERNEL_INITIALIZER1, TARGETS and
# recursive_layers_architectures are not defined in any cell visible here, and
# recursive_layers_architecture is not used in the loop body — confirm they are
# defined before running this cell.

# Real (padded) sequence length fed to the Embedding layer.
sequence_length = indexes_list.shape[1]

for activation_function in model_functions:
    for recurrent_function in model_functions:
        for recursive_layers_architecture in recursive_layers_architectures:
            lstm_lstm_model = keras.Sequential(
                [
                    # Fixed: `embedding_matix` / `embedding_matrix` do not exist at
                    # notebook level (the matrix is word_embedding_matrix), and
                    # `weights=[]` loaded no pretrained vectors into a frozen layer.
                    layers.Embedding(input_dim=word_embedding_matrix.shape[0],
                                     output_dim=word_embedding_matrix.shape[1],
                                     input_length=sequence_length,
                                     weights=[word_embedding_matrix],
                                     trainable=False, mask_zero=True),
                    # recurrent_function was iterated but never used; it now
                    # drives the recurrent (gate) activation.
                    layers.LSTM(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                                kernel_initializer=KERNEL_INITIALIZER1,
                                activation=activation_function,
                                recurrent_activation=recurrent_function,
                                return_sequences=True),
                    layers.LSTM(units=1, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
                                kernel_initializer=KERNEL_INITIALIZER1,
                                activation=activation_function,
                                recurrent_activation=recurrent_function),
                    layers.Dense(TARGETS, activation=activation_function),
                ]
            )