<a href="https://colab.research.google.com/github/alheir/22-67-neural-networks/blob/main/tp_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
class_num = 20

In [3]:
# descargamos los embeddings de palabras de Fasttext para inglés y descomprimimos el archivo.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip

--2024-06-29 01:36:40--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.14, 3.163.189.108, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2024-06-29 01:36:46 (98.3 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [4]:
# cargamos los embeddings de palabras
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'found {len(embeddings_index)} word vectors')

loading word embeddings...
found 999995 word vectors


In [5]:
# instanciamos el tokenizador
token = Tokenizer(num_words=30000,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
                char_level=False,
                oov_token="UNK",
                document_count=0)

In [6]:
# fiteamos el tokenizador
token.fit_on_texts(newsgroups_train.data)

In [7]:
# obtenemos los diccionarios idx2word y word2idx
reverse_dictionary = token.index_word
dictionary = dict([(value, key) for (key, value) in reverse_dictionary.items()])
# CHECK QUE EMPIEZA POR 0

In [8]:
# cargamos en una matriz los embeddings de las palabras
# presentes en el vocabulario
embed_dim=300
num_words=len(dictionary)+1
embedding_matrix=np.zeros([num_words,embed_dim])
for word, idx in dictionary.items():
  if idx <= num_words and word in embeddings_index:
    embedding_matrix[idx,:]=embeddings_index[word]

In [9]:
embedding_matrix.shape

(105374, 300)

In [10]:
# se tokenizan los textos
train_sequences=token.texts_to_sequences(newsgroups_train.data)
test_sequences=token.texts_to_sequences(newsgroups_test.data)

In [11]:
# En este punto seleccionamos el tamaño de contexto a procesar en la variable `max_len`
max_len=500
train_sequences=pad_sequences(train_sequences,maxlen=max_len)
test_sequences=pad_sequences(test_sequences,maxlen=max_len)

In [12]:
from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

In [13]:

model = Sequential()

# la primera capa es de embedding entrenable. Recordar que se puede variar el tamaño
# del embedding a entrenar
model.add(Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embedding_matrix], input_shape=(None,), trainable = False))

model.add(LSTM(100, return_sequences=True))
# model.add(Dropout(0.2))

model.add(LSTM(100))
# model.add(Dense(32, activation='relu'))

# Predicción de clasificación con softmax
# La salida es del tamaño del vocabulario
model.add(Dense(class_num, activation='softmax'))


# Clasificación multiple categórica --> loss = categorical_crossentropy
# notar que usamos la versión Sparse para utilizar sólo índices en lugar de OHE
model.compile(loss=SparseCategoricalCrossentropy(), optimizer='rmsprop', metrics=['accuracy'])


model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         31612200  
                                                                 
 lstm (LSTM)                 (None, None, 100)         160400    
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 20)                2020      
                                                                 
Total params: 31855020 (121.52 MB)
Trainable params: 242820 (948.52 KB)
Non-trainable params: 31612200 (120.59 MB)
_________________________________________________________________


In [14]:
early_stopping = EarlyStopping(monitor="val_accuracy",
    min_delta=0,
    patience=5,
    verbose=1,
    mode="max",
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0,
)

history = model.fit(train_sequences, newsgroups_train.target,
                    batch_size=128,
                    epochs=100,
                    validation_split=0.2,
                    callbacks=[early_stopping]
                    )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


In [18]:
y_pred = np.argmax(model.predict(test_sequences), axis=-1)



In [27]:
print(y_pred.shape)
print(newsgroups_test.target.shape)

print(y_pred)
print(newsgroups_test.target)

(7532,)
(7532,)
[ 9  5  9 ...  9  6 15]
[ 7  5  0 ...  9  6 15]


In [35]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [40]:
print(f1_score(newsgroups_test.target, y_pred, average='micro'))
print(f1_score(newsgroups_test.target, y_pred, average='macro'))
print(f1_score(newsgroups_test.target, y_pred, average='weighted'))

0.5650557620817844
0.5409683424487132
0.5551654432896305


In [42]:
f1_score_test = f1_score(newsgroups_test.target, y_pred, average='weighted')
recall_score_test = recall_score(newsgroups_test.target, y_pred, average='weighted')
precision_score_test = precision_score(newsgroups_test.target, y_pred, average='weighted')

In [43]:
print(f'F1-score en test: {f1_score_test}')
print(f'Recall en test: {recall_score_test}')
print(f'Precision en test: {precision_score_test}')

F1-score en test: 0.5551654432896305
Recall en test: 0.5650557620817844
Precision en test: 0.5729787432277426


In [None]:
'''
Tokenizacion: opciones
Elman, LSTM, GRU
Bidireccional
Tamaño de capas y cantidad
Dropout
RMSProp, ADAM
BATCH_SIZE
Unloop
TPU?
Embedding entrenable
Forma de colapsar las secuencias
Reduccion de dimensionalidad embedding
'''