# Red Neuronal RNN

In [1]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import scipy 
import numpy as np
import random as rd
import nltk
import pickle
import sklearn
from sklearn.neighbors import NearestNeighbors
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from datetime import datetime
from tqdm import tqdm

%matplotlib inline
nltk.download('stopwords')

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\igna-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Leo y spliteo el set de datos

In [2]:
tweets = pd.read_csv('train_EN.csv')

In [3]:
train = tweets.iloc[:,1:3]
train.dropna(inplace =True)

In [4]:
train

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [5]:
train['tweet'] = train['tweet'].str.lower()
train['tweet'] = train['tweet'].astype(str)

Tokenizo los textos

In [6]:
tokenized_texts = []
for text in tqdm(train['tweet']):
    tokenized_texts.append(nltk.word_tokenize(text))

100%|████████████████████████████████████████████████████████████████████████████| 3467/3467 [00:00<00:00, 4197.35it/s]


Creo la matriz de embbedings

In [7]:
import gensim
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [8]:
 w2v_model.vectors.shape

(3000000, 300)

In [9]:
emb_matrix = np.concatenate([np.zeros((1,300)), 
                             np.random.normal(size=(1,300)),
                             w2v_model.vectors[:500000]], axis=0)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train['tweet'], train['sarcastic'], test_size=0.2, train_size=0.8, random_state=1999)

In [11]:
X_test = X_test.dropna()
y_test = y_test.dropna()

In [12]:
MAX_LEN = int(np.percentile([len(t) for t in tokenized_texts], 95))
MAX_LEN

50

In [13]:
def preprocess_toks(tokenized_text):
    tokenized_text =tokenized_text.copy()
    if len(tokenized_text) > MAX_LEN:
        tokenized_text = tokenized_text[:MAX_LEN]
    for i in range(len(tokenized_text)):
        if tokenized_text[i] in w2v_model and w2v_model.key_to_index[tokenized_text[i]] < 500000:
            tokenized_text[i] = w2v_model.key_to_index[tokenized_text[i]]+2
        else:
            tokenized_text[i] = 1
    return tokenized_text

In [14]:
emb_matrix.shape

(500002, 300)

In [15]:
for i in tqdm(range(len(tokenized_texts)), total=len(tokenized_texts)):
    tokenized_texts[i] = preprocess_toks(tokenized_texts[i])

100%|███████████████████████████████████████████████████████████████████████████| 3467/3467 [00:00<00:00, 10829.23it/s]


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenized_texts = pad_sequences(
    tokenized_texts, maxlen=MAX_LEN, dtype='int32', padding='post',
    truncating='post', value=0
)

In [42]:
from keras.layers import Dense, Input, GRU, Embedding, LSTM
from keras.models import Model

In [18]:
word_indexes = Input((50,), dtype='int32')

word_emb = Embedding(500002, 300, weights=[emb_matrix], 
                     trainable=False, mask_zero=True)

emb_sequence = word_emb(word_indexes) # (22, 300)
rnn1 = GRU(50, activation='tanh')(emb_sequence) # (1,50)
dense1 = Dense(50, activation='tanh')(rnn1)
dense2 = Dense(50, activation='tanh')(dense1)
dense3 = Dense(50, activation='tanh')(dense2)

out = Dense(1, activation=None)(dense3)
model = Model(inputs=word_indexes, outputs=out)

In [19]:
from tensorflow import keras
opt = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=opt, loss="mse", metrics=["mae"])

In [20]:
model.fit(tokenized_texts, train['sarcastic'].values, epochs=20, batch_size=256)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x24892703a60>

In [21]:
X_test

298     damn, imagine being vaxxed and then getting a ...
1691    that second vaccine dose is no joke is it 😵 be...
1407    my essay was so bad that my prof is giving me ...
1888    @iamgryphoneer i don't like to rag on people's...
1047    my stray boy playing… he is looking well but s...
                              ...                        
2275    damn instagram engagement is slow 😭 2 likes in...
1006    i gotta get out of this city. i made $18 doord...
1563    my grandpa has covid so i made a lil care pack...
2630    the last football game i went too. 1 year toda...
1615         words cannot express how much i hate glitter
Name: tweet, Length: 694, dtype: object

In [22]:
tokenized_texts_test = []
for texto in tqdm(X_test):
    tokenized_texts_test.append(nltk.word_tokenize(texto))

100%|██████████████████████████████████████████████████████████████████████████████| 694/694 [00:00<00:00, 3652.78it/s]


In [23]:
for i in tqdm(range(len(tokenized_texts_test)), total=len(tokenized_texts_test)):
    tokenized_texts_test[i] = preprocess_toks(tokenized_texts_test[i])

100%|██████████████████████████████████████████████████████████████████████████████| 694/694 [00:00<00:00, 3691.68it/s]


In [26]:
tokenized_texts_test = pad_sequences(
    tokenized_texts_test, maxlen=MAX_LEN, dtype='int32', padding='post',
    truncating='post', value=0
)

In [39]:
preds = model.predict(tokenized_texts_test)

In [40]:
from sklearn.metrics import classification_report, roc_auc_score
roc_auc_score(y_test, preds)

0.8208948296667595

In [43]:

word_indexes = Input((50,), dtype='int32')

word_emb = Embedding(500002, 300, weights=[emb_matrix], 
                     trainable=False, mask_zero=True)

emb_sequence = word_emb(word_indexes) # (22, 300)
rnn1 = LSTM(50, activation='tanh')(emb_sequence) # (1,50)
dense1 = Dense(50, activation='tanh')(rnn1)
dense2 = Dense(50, activation='tanh')(dense1)
dense3 = Dense(50, activation='tanh')(dense2)

out = Dense(1, activation=None)(dense3)
model = Model(inputs=word_indexes, outputs=out)

In [62]:
from tensorflow import keras
opt = keras.optimizers.Adam(learning_rate=0.001)
auc = keras.metrics.AUC(
    num_thresholds=200, curve='ROC',
    summation_method='interpolation', name=None, dtype=None,
    thresholds=None, multi_label=False, num_labels=None, label_weights=None,
    from_logits=False
)


model.compile(optimizer=opt, loss="mse", metrics=[auc])
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [63]:
history = model.fit(tokenized_texts, train['sarcastic'].values, validation_split=0.1, callbacks=[callback], epochs=20, batch_size=256)

Epoch 1/20


InvalidArgumentError:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (model_1/dense_7/BiasAdd:0) = ] [[0.346226][-0.0469980575][0.429081261]...] [y (Cast_3/x:0) = ] [0]
	 [[node assert_greater_equal/Assert/AssertGuard/Assert
 (defined at C:\Users\igna-\anaconda3\lib\site-packages\keras\utils\metrics_utils.py:608)
]] [Op:__inference_train_function_44512]

Errors may have originated from an input operation.
Input Source operations connected to node assert_greater_equal/Assert/AssertGuard/Assert:
In[0] assert_greater_equal/Assert/AssertGuard/Assert/assert_greater_equal/All:	
In[1] assert_greater_equal/Assert/AssertGuard/Assert/data_0:	
In[2] assert_greater_equal/Assert/AssertGuard/Assert/data_1:	
In[3] assert_greater_equal/Assert/AssertGuard/Assert/data_2:	
In[4] assert_greater_equal/Assert/AssertGuard/Assert/model_1/dense_7/BiasAdd:	
In[5] assert_greater_equal/Assert/AssertGuard/Assert/data_4:	
In[6] assert_greater_equal/Assert/AssertGuard/Assert/Cast_3/x:

Operation defined at: (most recent call last)
>>>   File "C:\Users\igna-\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
>>>     app.start()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
>>>     self.io_loop.start()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 149, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
>>>     self._run_once()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
>>>     handle._run()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\asyncio\events.py", line 81, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
>>>     lambda f: self._run_callback(functools.partial(callback, future))
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
>>>     ret = callback()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\gen.py", line 787, in inner
>>>     self.run()
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\gen.py", line 748, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
>>>     yield gen.maybe_future(dispatch(*args))
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
>>>     yield gen.maybe_future(handler(stream, idents, msg))
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
>>>     self.do_execute(
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2877, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2923, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3146, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "<ipython-input-63-ad48a4671b9b>", line 1, in <module>
>>>     model.fit(tokenized_texts, train['sarcastic'].values, validation_split=0.1, epochs=20, batch_size=256)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\training.py", line 817, in train_step
>>>     self.compiled_metrics.update_state(y, y_pred, sample_weight)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 460, in update_state
>>>     metric_obj.update_state(y_t, y_p, sample_weight=mask)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\utils\metrics_utils.py", line 73, in decorated
>>>     update_op = update_state_fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\metrics.py", line 177, in update_state_fn
>>>     return ag_update_state(*args, **kwargs)
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\metrics.py", line 2343, in update_state
>>>     return metrics_utils.update_confusion_matrix_variables(
>>> 
>>>   File "C:\Users\igna-\anaconda3\lib\site-packages\keras\utils\metrics_utils.py", line 608, in update_confusion_matrix_variables
>>>     tf.compat.v1.assert_greater_equal(
>>> 

Function call stack:
train_function -> assert_greater_equal_Assert_AssertGuard_false_44379


In [None]:
preds = model.predict(tokenized_texts_test)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
roc_auc_score(y_test, preds)