<a href="https://colab.research.google.com/github/alheir/22-67-neural-networks/blob/main/RNN_TP4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, accuracy_score, confusion_matrix

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

import seaborn as sns
import pickle

## Carga de datos

In [None]:
# NLTK resources: the stopword corpus plus the Punkt tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt'; without it word_tokenize raises LookupError on a fresh
# environment. Downloading both keeps the notebook working on old and new versions.
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with English stopwords stripped out.

    The text is split with NLTK's ``word_tokenize``; every token whose
    lowercase form is in the module-level ``stop_words`` set is dropped, and
    the remaining tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token_text in word_tokenize(text):
        if token_text.lower() in stop_words:
            continue
        kept_tokens.append(token_text)
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def _load_newsgroups(subset, cache_path, cache_available):
    """Load one 20 Newsgroups split, using a local pickle cache when present.

    subset: 'train' or 'test', forwarded to fetch_20newsgroups.
    cache_path: pickle file holding the already-preprocessed split.
    cache_available: whether ``cache_path`` existed when the cell started.

    On a cache miss the split is fetched without headers, footers and quotes,
    stopwords are removed from every document, and the result is pickled to
    ``cache_path`` before being returned.
    """
    if cache_available:
        with open(cache_path, 'rb') as fp:
            return pickle.load(fp)

    bunch = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))
    bunch.data = [remove_stopwords(text) for text in bunch.data]
    with open(cache_path, 'wb') as fp:
        pickle.dump(bunch, fp)
    return bunch


# Cache presence is checked for both splits up front, before anything is loaded.
newsgroups_train_exists = os.path.exists('newsgroups_train.txt')
newsgroups_test_exists = os.path.exists('newsgroups_test.txt')

newsgroups_train = _load_newsgroups('train', 'newsgroups_train.txt', newsgroups_train_exists)
newsgroups_test = _load_newsgroups('test', 'newsgroups_test.txt', newsgroups_test_exists)

In [6]:
# Number of target classes; used as the width of the final softmax layer of the model.
class_num = 20

*Dataset ya analizado en TP1...*

## Descarga y carga de embeddings de Fasttext

In [7]:
wiki_news_exists = os.path.exists('wiki-news-300d-1M.vec')
embeddings_index_exists = os.path.exists('embeddings_index.pkl')

embeddings_index = {}

if not embeddings_index_exists:
    if not wiki_news_exists:
        !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
        !unzip wiki-news-300d-1M.vec.zip
        os.remove('wiki-news-300d-1M.vec.zip')

    # Carga de embeddings de palabras
    with codecs.open('wiki-news-300d-1M.vec', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    with open('embeddings_index.pkl', 'wb') as fp:
        pickle.dump(embeddings_index, fp)
else:
    with open('embeddings_index.pkl', 'rb') as fp:
        embeddings_index = pickle.load(fp)

print(f'Found {len(embeddings_index)} word vectors')

--2024-06-29 23:13:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.7, 13.226.34.122, 13.226.34.53, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2024-06-29 23:13:34 (75.2 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   
Found 999995 word vectors


## Tokenización y preparación de secuencias

In [8]:
# Word-level tokenizer capped at a 30000-entry vocabulary; "UNK" stands in for
# out-of-vocabulary words. It is fitted on the training documents only.
token = Tokenizer(
    num_words=30000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token="UNK",
)
token.fit_on_texts(newsgroups_train.data)

# word -> index and index -> word lookups
dictionary = token.word_index
reverse_dictionary = token.index_word

# Embedding matrix: row i holds the pretrained vector of the word with index i.
# Rows for words without a pretrained vector stay at zero.
embed_dim = 300
num_words = min(30000, len(dictionary) + 1)
embedding_matrix = np.zeros((num_words, embed_dim))
for vocab_word, row in dictionary.items():
    if row >= num_words:
        continue
    pretrained = embeddings_index.get(vocab_word)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

# Convert documents to integer sequences and bring them all to a length of max_len.
max_len = 500
train_sequences = pad_sequences(token.texts_to_sequences(newsgroups_train.data), maxlen=max_len)
test_sequences = pad_sequences(token.texts_to_sequences(newsgroups_test.data), maxlen=max_len)

print(f"Train sequences shape: {train_sequences.shape}")
print(f"Test sequences shape: {test_sequences.shape}")

Train sequences shape: (11314, 500)
Test sequences shape: (7532, 500)


## Definición y compilación del modelo

In [9]:
from keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [11]:
# Stop training after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Persist the model to best_model.keras whenever val_loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor="val_loss",
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate (never below 1e-5) when val_accuracy plateaus for 5 epochs.
# NOTE(review): this callback is not in the list passed to model.fit further down,
# so it currently has no effect — confirm whether that is intentional.
reduce_lr = ReduceLROnPlateau(
    monitor="val_accuracy",
    factor=0.5,
    patience=5,
    min_delta=0.001,
    min_lr=1e-5,
    verbose=1,
)

In [10]:
# Units of the LSTM wrapped by the Bidirectional layer
lstm_units = 100
# Units of the GRU layer
gru_units = 100
# Only referenced by a commented-out Dropout layer in the model cell;
# the active Dropout layers use literal rates instead.
dropout_rate = 0.5
# Mini-batch size passed to model.fit
batch_size = 128

In [12]:
# Architecture: trainable pretrained embedding -> BiLSTM -> GRU -> dense head -> softmax.
model = Sequential([
    # Embedding initialised from embedding_matrix and fine-tuned during training
    Embedding(input_dim=num_words,
              output_dim=embed_dim,
              weights=[embedding_matrix],
              input_shape=(None,),
              trainable=True),

    # Bidirectional LSTM that returns the full sequence, followed by dropout
    Bidirectional(LSTM(lstm_units, return_sequences=True)),
    Dropout(0.3),

    # GRU that reduces the sequence to a single vector, followed by dropout
    GRU(gru_units),
    Dropout(0.4),

    # Disabled alternatives kept for experimentation:
    #   GRU(gru_units), Dropout(0.4)
    #   LSTM(lstm_units), Dropout(dropout_rate)

    Dense(64, activation='swish'),

    # Classification output: softmax over class_num classes
    Dense(class_num, activation='softmax'),
])

# Integer class labels are used directly, hence the sparse categorical cross-entropy loss.
model.compile(loss=SparseCategoricalCrossentropy(), optimizer=Adam(1e-4), metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         9000000   
                                                                 
 bidirectional (Bidirection  (None, None, 200)         320800    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, None, 200)         0         
                                                                 
 gru (GRU)                   (None, 100)               90600     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 64)                6464      
                                                        

## Entrenamiento del modelo

In [13]:
# NOTE(review): `reduce_lr` is defined in the callbacks cell but is not included
# here, so it never runs — confirm whether that is intentional.
training_callbacks = [early_stopping, model_checkpoint]

# Train for up to 50 epochs, holding out 20% of the training data for validation.
history = model.fit(
    x=train_sequences,
    y=newsgroups_train.target,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.2,
    callbacks=training_callbacks,
)

Epoch 1/50
 6/71 [=>............................] - ETA: 8:50 - loss: 2.9985 - accuracy: 0.0482

KeyboardInterrupt: 

## Evaluación del modelo y visualización de resultados

In [None]:
# Score the trained model on the test split (loss and accuracy, as compiled).
test_loss, test_accuracy = model.evaluate(x=test_sequences, y=newsgroups_test.target)
print(f'Test accuracy: {test_accuracy}')

In [None]:
# One figure per tracked quantity, each comparing the training and validation curves.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

In [None]:
# Softmax outputs for every test document, reduced to the index of the most likely class.
test_probabilities = model.predict(test_sequences)
test_predictions = test_probabilities.argmax(axis=1)

In [None]:
# Test-set metrics.
# In single-label multiclass classification, micro-averaged precision, recall and F1
# are all mathematically equal to accuracy, so the previous average='micro' calls just
# printed the accuracy four times. Macro averaging (unweighted mean over classes) gives
# figures that actually complement the accuracy.
y_true = newsgroups_test.target
f1 = f1_score(y_true, test_predictions, average='macro')
precision = precision_score(y_true, test_predictions, average='macro')
accuracy = accuracy_score(y_true, test_predictions)
recall = recall_score(y_true, test_predictions, average='macro')
# Kept for the heatmap cell that follows.
conf_matrix = confusion_matrix(y_true, test_predictions)
class_report = classification_report(y_true, test_predictions, target_names=newsgroups_test.target_names)

print(f'F1-score (macro): {f1}')
print(f'Precision (macro): {precision}')
print(f'Accuracy: {accuracy}')
print(f'Recall (macro): {recall}')
print(f'Classification Report:\n{class_report}')

In [None]:
# Annotated confusion-matrix heatmap with class names on both axes
# (rows: true label, columns: predicted label).
class_labels = newsgroups_test.target_names

plt.figure(figsize=(12, 12))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Matriz de Confusión')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## Para aventurarse


In [None]:
# Free-form checklist of ideas to experiment with (tokenization options, recurrent cell
# types, layer sizes, optimizers, ...). It is a bare string expression, not executable logic.
'''
Tokenizacion: opciones
Elman, LSTM, GRU
Bidireccional
Tamaño de capas y cantidad
Dropout
RMSProp, ADAM
BATCH_SIZE
Unloop
TPU?
Embedding entrenable
Forma de colapsar las secuencias
Reduccion de dimensionalidad embedding
'''