# 20 Newsgroups

## Preparación de los datos

In [None]:
import os, sys

TEXT_DATA_DIR = "20_newsgroup"

# Keyword arguments for open(): on Python 3 the posts are decoded as latin-1,
# while on Python 2 the built-in open() is called without an encoding.
# Computed once here instead of re-checking the interpreter version per file.
_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Every sub-directory of TEXT_DATA_DIR is one class. Iterating in sorted
# order makes the name -> id assignment independent of os.listdir() ordering.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue  # only all-digit file names are read as samples
        fpath = os.path.join(path, fname)
        # The context manager closes the handle even if read() raises.
        with open(fpath, **_open_kwargs) as f:
            t = f.read()
        i = t.find('\n\n')  # skip header: keep text from the first blank line on
        if 0 < i:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
RANDOM_SEED = 42  # fixes the train/validation shuffle so reruns are comparable

# `num_words` is the Keras 2 name of this argument; `nb_words` is the
# deprecated Keras 1 spelling.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE: `labels` is rebound from the list of ids to a one-hot matrix, so this
# cell must not be re-run without first re-running the cell that builds `labels`.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
# A private, seeded generator keeps the split reproducible without touching
# NumPy's global random state.
np.random.RandomState(RANDOM_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice with an explicit boundary: negative indexing (data[:-n]) would yield an
# empty training set and an all-samples validation set when n == 0.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

### GloVe

In [None]:
GLOVE_DIR = "GloVe"

# Maps each word in the GloVe file to its float32 coefficient vector.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
EMBEDDING_DIM = 100

# One row per tokenizer index plus an extra row 0; rows start as zeros.
embedding_matrix = np.zeros((1 + len(word_index), EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
from keras.layers import Embedding

# Non-trainable embedding layer whose weights are taken from embedding_matrix.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,  # Change to True later
)

# Red

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate
from keras.layers import Input, Conv1D, MaxPooling1D
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD, RMSprop


# MODEL 1 ------------------------------------------------------------------
# Three Conv1D -> Dropout -> pooling stages over the embedded sequence,
# followed by a dense softmax classifier with one output per label.

# Imported here because this cell's import list does not include it.
from keras.layers import GlobalMaxPooling1D

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = Dropout(0.5)(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.5)(x)
# Global max pooling over however many time steps remain. A fixed
# MaxPooling1D(35) followed by Flatten() is only "global" when
# MAX_SEQUENCE_LENGTH == 1000 (1000 -> 996 -> 199 -> 195 -> 39 -> 35);
# GlobalMaxPooling1D yields the same (batch, 128) tensor for any length.
x = GlobalMaxPooling1D()(x)

x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

# MODEL 2 is kept below inside a bare triple-quoted string, so none of it
# runs; the string is only an expression statement with no effect.
# NOTE(review): before re-enabling, confirm the Conv1D keyword names
# `nb_filter` / `filter_length` against the installed Keras version (Model 1
# passes these values positionally), and note that the output layer
# hard-codes 20 classes where Model 1 uses len(labels_index).
'''
# MODEL 2 ------------------------------------------------------------------

convs = []
filter_sizes = [3,4,5]
 
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')



embedded_sequences = Embedding(len(word_index) + 1, 
                               EMBEDDING_DIM,
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=True)(sequence_input)
# embedded_sequences = embedding_layer(sequence_input) # Uncomment this later



for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)
    
x = Concatenate(axis=1)(convs)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(20)(x)

#x = LSTM(units=128)(x)

x = Flatten()(x)
x = Dense(128, activation='relu')(x) 
preds = Dense(20, activation='softmax')(x)
'''

#------------------------------------------------------------------

# Alternative optimizer configurations (currently disabled); to use one,
# uncomment it and pass `optimizer=optimizer` to compile() below.
#optimizer = SGD(lr=0.1, momentum=0.8, decay=0.001, nesterov=True)
#optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# Learning!
# Keep the returned History object so per-epoch metrics can be inspected later
# (this also stops its repr from being echoed as the cell output).
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    epochs=10, batch_size=128)