<a href="https://colab.research.google.com/github/aletscn/NLP-REAL-OR-NOT-/blob/master/subwords_idpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

BPE 

In [1]:
import re
import os
from keras.callbacks import TensorBoard
import tensorflow as tf
from math import log


class BPE(object):

    def __init__(self, vocab_file):
        with open(vocab_file, encoding="utf8") as f:
            self.words = [l.split()[0] for l in f]
            log_len = log(len(self.words))
            self.wordcost = {
                k: log((i+1) * log_len)
                for i, k in enumerate(self.words)}
            self.maxword = max(len(x) for x in self.words)

    def encode(self, s):
        """Uses dynamic programming to infer the location of spaces in a string
        without spaces."""

        s = s.replace(" ", "▁")

        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).
        def best_match(i):
            candidates = enumerate(reversed(cost[max(0, i - self.maxword):i]))
            return min(
                (c + self.wordcost.get(s[i-k-1:i], 9e999), k+1)
                for k, c in candidates)

        # Build the cost array.
        cost = [0]
        for i in range(1, len(s) + 1):
            c, k = best_match(i)
            cost.append(c)

        # Backtrack to recover the minimal-cost string.
        out = []
        i = len(s)
        while i > 0:
            c, k = best_match(i)
            assert c == cost[i]
            out.append(s[i-k:i])

            i -= k

        return " ".join(reversed(out))


#========plot train and validation scalars in a same figure=======
class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./logs', **kwargs):
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value.item()
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()
import numpy as np
import re
import itertools
from collections import Counter
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negtive_data_file):
    """
    Load data from files, split data into words and generate labels
    Input: the positive data file path and negative data file path
    Output:
        x_text: list of words for sentences. e.g [['i', 'am', is'], ['word', 'is', 'too', 'long'], ...,]
        y: For each sentence, using `[neg, pos]` to represent the lables.
           - If we have a positive label, we represent it as `[0, 1]`
           - If we have a negative label, we represent it as `[1, 0]`
    """

    # Load data from files
    positive_examples = list(open(positive_data_file, 'r', encoding='utf-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negtive_data_file, 'r', encoding='utf-8').readlines())
    negative_examples = [s.strip() for s in negative_examples]

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sen) for sen in x_text]
    # x_text = [sen.split(" ") for sen in x_text]

    # Generate labels
    positive_lables = [[0, 1] for _ in positive_examples]
    negative_lables = [[1, 0] for _ in negative_examples]
    y = np.concatenate((positive_lables, negative_lables), 0)
    return x_text, y


def pad_sentences(sentences, padding_word='<PAD/>'):
    """
    :param sentences: sentences as list of words,  [['i', 'am', is'], ['word', 'is', 'too', 'long'], ...,]
    :return: pad sentence to longest length, [['i', 'am', is', '<PAD>', '<PAD>'], ['word', 'is', 'too', 'long', '<PAD>'], ...,]
    """
    sequence_length = max(len(sen) for sen in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    :param sentences:  sentences after padding
    :return:
        vocabulary: a dict object, key is word and value is index. e.g. {'i': 0, 'am':1}
        vocabulary_inv: a dict object, the inverse of vocabulary. e.g. {0: 'i', 1:'am'}
    """
    # Count words
    word_counts = Counter(itertools.chain(*sentences))
    # Sort the word as frequency order
    vocabulay_inv = [x[0] for x in word_counts.most_common()]
    # Build vocabulary, word: index
    vocabulay = {word: i for i, word in enumerate(vocabulay_inv)}
    # Build inverse vocabulary, index: word
    vocabulay_inv = {value: key for key, value in vocabulay.items()}

    return [vocabulay, vocabulay_inv]

def build_index_sentence(sentences, vocabulary):
    # x = []
    # for sen in sentences:
    #     one_sen = []
    #     for word in sen:
    #         one_sen.append(vocabulary[word])
    #     x.append(one_sen)
    # return np.array(x)

    # write above code as one line
    x = np.array([[vocabulary[word] for word in sen] for sen in sentences])
    return x

# New Section

Libraries

In [15]:
import os
import numpy as np
import pandas as pd
import pickle
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Activation, Flatten, Dense, Concatenate, Bidirectional, LeakyReLU,SpatialDropout1D
from keras import regularizers
from keras.optimizers import Adam
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model, Sequential
from keras.callbacks import CSVLogger
from sklearn.model_selection import GridSearchCV, cross_validate
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import f1_score




In [3]:
#Para poder abrir cosas en el Drive
import sys
#Donde se encuentran los documentos (train, text)
DRIVE_DIR='/content/drive'
BASE_DIR=''
DATA_DIR = 'drive/My Drive/Colab Notebooks'
TWEETS_DIR='data_tp2'

from google.colab import drive
drive.mount(DRIVE_DIR)

os.chdir(DATA_DIR)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
train=pd.read_csv(os.path.join(TWEETS_DIR, 'train_clean.csv'))
train=train.fillna(' ')

test=pd.read_csv(os.path.join(TWEETS_DIR, 'test_clean.csv'))
test=test.fillna(' ')

real_test=pd.read_csv(os.path.join(TWEETS_DIR, 'submission.csv'))

x_text = train['text_clean_nosw']
y = train['target'].values
y = to_categorical(y)

x_test = test['text_clean_nosw']
id_test=test['id']


In [5]:
# Convert subword to index, function version
def subword2index(texts, vocab):
    sentences = []
    for s in texts:
        s = s.split()
        one_line = []
        for word in s:
            if word not in vocab.keys():
                one_line.append(vocab['unk'])
            else:
                one_line.append(vocab[word])
        sentences.append(one_line)
    return sentences


# replace all digits with 0
import re


#bpe = BPE("./pre-trained-model/en.wiki.bpe.op25000.vocab.txt")
bpe = BPE("./pre-trained-model/en.wiki.bpe.vs100000.vocab.txt")
train_texts = [bpe.encode(s) for s in x_text]
test_texts = [bpe.encode(s) for s in x_test]

# Build vocab, {token: index}
vocab = {}
for i, token in enumerate(bpe.words):
    vocab[token] = i + 1

# Convert train and test
train_sentences = subword2index(train_texts, vocab)
test_sentences = subword2index(test_texts, vocab)


In [6]:

# See subword level length
length = [len(sent) for sent in train_sentences]
print('The max length is: ', max(length))
print('The min length is: ', min(length))
print('The average length is: ', sum(length)/len(length))

The max length is:  91
The min length is:  1
The average length is:  10.144420131291028


In [7]:

# Padding
from keras.preprocessing.sequence import pad_sequences

train_data = pad_sequences(train_sentences, maxlen=max(length), padding='post')
test_data = pad_sequences(test_sentences, maxlen=max(length), padding='post')

x_train=train_data
y_train=y
x_test=test_data

#Adding numerical features

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#define and scale our added features
meta_train = StandardScaler().fit_transform(train.iloc[:, 9:])
meta_test = StandardScaler().fit_transform(test.iloc[:, 8:])


#meta_train = MinMaxScaler().fit_transform(train.iloc[:, 2:])
#meta_test = MinMaxScaler().fit_transform(test.iloc[:, 2:])

#meta_train = RobustScaler().fit_transform(train.iloc[:, 9:])
#meta_test = RobustScaler().fit_transform(test.iloc[:, 8:])


# Embedding Initialization
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("./pre-trained-model/en.wiki.bpe.vs100000.d200.w2v.bin", binary=True)
from keras.layers import Embedding

input_size = max(length)
embedding_dim = 200
embedding_weights = np.zeros((len(vocab) + 1, embedding_dim)) # (25001, 50)


for subword, i in vocab.items():
    if subword in model.vocab:
        embedding_vector = model[subword]
        if embedding_vector is not None:
            embedding_weights[i] = embedding_vector
    else:
        continue

embedding_layer = Embedding(len(vocab)+1,
                            embedding_dim,
                            weights=[embedding_weights],
                            input_length=input_size,
                            trainable=False)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


LSTM

In [12]:
#function to create lstm model
def create_lstm(spatial_dropout, dropout, recurrent_dropout, learning_rate, bidirectional):
    #define activation
    activation = LeakyReLU()
    
    #define inputs
    nlp_input = Input(shape = (max(length),), name = 'nlp_input')
    meta_input_train = Input(shape = (12, ), name = 'meta_train')
    emb = embedding_layer(nlp_input)
    #emb = SpatialDropout1D(dropout)(emb)


    #add LSTM layer
    if bidirectional:
        nlp_out = (Bidirectional(LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                                kernel_initializer = 'orthogonal')))(emb)
    else:
        nlp_out = (LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                      kernel_initializer = 'orthogonal'))(emb)
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    

    #add second hidden layer
    x = Dropout(dropout)(x)
    x = (Dense(100, activation = activation, kernel_regularizer = regularizers.l2(1e-4),
              kernel_initializer = 'he_normal'))(x)
    
    #add output layer
    x = Dropout(dropout)(x)
    preds = Dense(2, activation='softmax', kernel_regularizer = regularizers.l2(1e-4))(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)
    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
    
    return model

In [13]:
#define new model
lstm = create_lstm(spatial_dropout = .4, dropout = .4, recurrent_dropout = .4,
                       learning_rate = 0.0001, bidirectional = False)

lstm.summary()


Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
nlp_input (InputLayer)          [(None, 91)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 91, 200)      20000200    nlp_input[0][0]                  
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 128)          168448      embedding[1][0]                  
__________________________________________________________________________________________________
meta_train (InputLayer)         [(None, 12)]         0                                            
_______________________________________________________________________________________

In [45]:
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True,save_weights_only = True)
#callback = EarlyStopping(monitor = 'val_loss', patience = 4)
history2 = lstm.fit([x_train, meta_train], y_train, validation_split = .2,
                       epochs = 10, batch_size = 128, verbose = 1, callbacks = [mc])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [46]:
lstm.load_weights('best_model.h5')
y_pred = lstm.predict([x_test, meta_test], batch_size=16, verbose=2)
y_pred = np.argmax(y_pred,axis=1)
f1_score(real_test['target'], y_pred,average='micro')


#df = pd.DataFrame()
#df['id'] = [int(x) for x in id_test.values]
#df['target'] = y_pred
#df.to_csv('lstm.csv', index=False)
#from google.colab import files
#files.download('lstm.csv')

204/204 - 5s


0.8023291449586272

LSTM Bidireccional

In [56]:
#function to create lstm model
def create_lstm_2(spatial_dropout, dropout, recurrent_dropout, learning_rate, bidirectional):
    #define activation
    activation = LeakyReLU(alpha = 0.001)
    
    #define inputs
    nlp_input = Input(shape = (max(length),), name = 'nlp_input')
    meta_input_train = Input(shape = (12, ), name = 'meta_train')
    emb = embedding_layer(nlp_input)

    #add LSTM layer
    if bidirectional:
        nlp_out = (Bidirectional(LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                                kernel_initializer = 'orthogonal', return_sequences = True)))(emb)
        nlp_out = (Bidirectional(LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                                 kernel_initializer = 'orthogonal')))(nlp_out)
    else:
        nlp_out = (LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                                 kernel_initializer = 'orthogonal', return_sequences = True))(emb)
        nlp_out = (LSTM(128, dropout = dropout, recurrent_dropout = recurrent_dropout,
                                 kernel_initializer = 'orthogonal'))(nlp_out)
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    
    #add second hidden layer
    x = Dropout(dropout)(x)
    x = (Dense(100, activation = activation, kernel_regularizer = regularizers.l2(1e-4),
              kernel_initializer = 'he_normal'))(x)
    
    #add output layer
    x = Dropout(dropout)(x)
    preds = Dense(2, activation='softmax', kernel_regularizer = regularizers.l2(1e-4))(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)
    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
    
    return model

In [57]:

#define new model
lstm_2 = create_lstm_2(spatial_dropout = .4, dropout = .4, recurrent_dropout = .4,
                       learning_rate = 0.0001, bidirectional = True)

lstm_2.summary()

Model: "functional_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
nlp_input (InputLayer)          [(None, 91)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 91, 200)      20000200    nlp_input[0][0]                  
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional (None, 91, 256)      336896      embedding[11][0]                 
__________________________________________________________________________________________________
bidirectional_13 (Bidirectional (None, 256)          394240      bidirectional_12[0][0]           
______________________________________________________________________________________

In [60]:
#Me quiero quedar con el mejor epoch
mc = ModelCheckpoint('best_model_lstm2.h5', monitor='val_loss', save_best_only=True,save_weights_only = True)
#callback = EarlyStopping(monitor = 'val_loss', patience = 4)
history2 = lstm_2.fit([x_train, meta_train], y_train, validation_split = .2,
                       epochs = 20, batch_size = 128, verbose = 1, callbacks = [mc])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [61]:
lstm_2.load_weights('best_model_lstm2.h5')
y_pred = lstm_2.predict([x_test, meta_test], batch_size=16, verbose=2)
y_pred = np.argmax(y_pred,axis=1)

f1_score(real_test['target'], y_pred,average='micro')

#df = pd.DataFrame()
#df['id'] = [int(x) for x in id_test.values]
#df['target'] = y_pred
#df.to_csv('lstm.csv', index=False)
#from google.colab import files
#files.download('lstm.csv')

204/204 - 18s


0.8026356114005516

CNN

In [24]:
#function to create cnn model
def create_cnn(dropout, learning_rate):
    #define activation
    activation = LeakyReLU()

    #define inputs
    nlp_input = Input(shape = (max(length),), name = 'nlp_input')
    meta_input_train = Input(shape = (12, ), name = 'meta_train')
    emb = embedding_layer(nlp_input)

    #x = Conv1D(32, 6, activation=activation,padding='valid')(emb)
    #x = MaxPooling1D(2)(x)
    #x = Dropout(dropout)(x)
    x = Conv1D(128, 4, activation=activation,padding='valid')(emb)
    x = MaxPooling1D(2)(x)
    x = Dropout(dropout)(x)
    x = Conv1D(64, 2, activation=activation,padding='valid')(x)
    x = MaxPooling1D()(x) 
    x = Dropout(dropout)(x)
    nlp_out = Flatten()(x)
    x = Concatenate()([nlp_out, meta_input_train])
    x = Dense(35, activation='relu')(x)
    preds = Dense(2, activation='softmax', kernel_regularizer = regularizers.l2(1e-4))(x)


    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)
    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
    
    return model

In [25]:
#define new model
cnn = create_cnn(dropout = .5, learning_rate = 0.0001)

cnn.summary()

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
nlp_input (InputLayer)          [(None, 91)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 91, 200)      20000200    nlp_input[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 88, 128)      102528      embedding[4][0]                  
__________________________________________________________________________________________________
max_pooling1d_2 (MaxPooling1D)  (None, 44, 128)      0           conv1d_2[0][0]                   
_______________________________________________________________________________________

In [26]:
#Me quiero quedar con el mejor epoch
mc_cnn = ModelCheckpoint('best_model_cnn.h5', monitor='val_accuracy', mode='max', save_best_only=True)
#callback = EarlyStopping(monitor = 'val_loss', patience = 4)
history2 = cnn.fit([x_train, meta_train], y_train, validation_split = .2,
                       epochs = 30, batch_size = 32, verbose = 1, shuffle=True, callbacks = [mc_cnn])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [27]:
cnn.load_weights('best_model_cnn.h5')

y_pred = cnn.predict([x_test, meta_test], batch_size=32, verbose=2)
y_pred = np.argmax(y_pred,axis=1)

f1_score(real_test['target'], y_pred,average='micro')

df = pd.DataFrame()
df['id'] = [int(x) for x in id_test.values]
df['target'] = y_pred
df.to_csv('cnn.csv', index=False)
from google.colab import files
files.download('cnn.csv')

102/102 - 0s


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
f1_score(real_test['target'], y_pred,average='micro')


0.8023291449586272