# Fast Text

In [1]:
import numpy as np
import pickle

In [2]:
from keras.preprocessing import sequence

Using TensorFlow backend.


In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [5]:
def create_ngram_set(input_list, ngram_value):
    """Collect the distinct n-grams that occur in a sequence of token ids.

    The sequence is sliced at offsets 0 .. ngram_value - 1 and the slices are
    zipped together, so every resulting tuple holds ``ngram_value``
    consecutive items of the input.

    Args:
        input_list: sliceable sequence of token ids (e.g. a list of ints).
        ngram_value: size n of the n-grams to extract.

    Returns:
        A set of tuples, one per distinct n-gram. The set is empty when the
        sequence is shorter than ``ngram_value``.

    Example (the printed ordering of a set may differ):
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
        {(4, 9), (4, 1), (1, 4), (9, 4)}
    """
    shifted_views = []
    for offset in range(ngram_value):
        # View of the sequence starting at position `offset`.
        shifted_views.append(input_list[offset:])
    # zip stops at the shortest view, i.e. at the last complete n-gram.
    return set(zip(*shifted_views))

In [6]:
def add_ngram(sequences, token_indice, ngram_range):
    """Augment every sequence by appending the ids of the n-grams it contains.

    For each start position, the n-grams of size 2 .. ``ngram_range`` that
    begin there are looked up in ``token_indice``; the id of every n-gram
    found is appended to a copy of the sequence. The inputs are not modified.

    Every n-gram size is checked at every position where it fits, so shorter
    n-grams close to the end of the sequence are included even when
    ``ngram_range`` is greater than 2.

    Args:
        sequences: iterable of sequences (lists) of token ids.
        token_indice: dict mapping an n-gram tuple to its integer id.
        ngram_range: largest n-gram size to add; values below 2 add nothing.

    Returns:
        A new list with one augmented list per input sequence.

    Example: adding bi-grams
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
        >>> add_ngram(sequences, token_indice, ngram_range=2)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    """
    new_sequences = []
    for input_list in sequences:
        tokens = list(input_list)  # original tokens; n-grams are read from here only
        n_tokens = len(tokens)
        augmented = list(tokens)   # copy that receives the appended n-gram ids
        # A bi-gram (the shortest n-gram) can start at any position but the last.
        for start in range(n_tokens - 1):
            for ngram_value in range(2, ngram_range + 1):
                stop = start + ngram_value
                if stop > n_tokens:
                    # Longer n-grams do not fit at this position either.
                    break
                ngram = tuple(tokens[start:stop])
                if ngram in token_indice:
                    augmented.append(token_indice[ngram])
        new_sequences.append(augmented)

    return new_sequences

Load the dataset

In [7]:
def load_ds(path):
    """Load the pickled data set stored at ``path``.

    The file is opened in binary mode, as pickle data requires, and is closed
    as soon as loading finishes, including when unpickling raises.

    Args:
        path: location of the pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only use this on files from a trusted source.
    with open(path, 'rb') as vecfile:
        return pickle.load(vecfile)

Dictionary

In [8]:
def get_lValue(d_items):
    """Return the largest value stored in the dictionary ``d_items``.

    Args:
        d_items: dict whose values are mutually comparable (e.g. integer ids).

    Returns:
        The maximum of ``d_items.values()``.

    Raises:
        IndexError: if ``d_items`` is empty.
    """
    if not d_items:
        # Same exception type the previous sort-and-index implementation raised.
        raise IndexError("cannot take the largest value of an empty dictionary")
    # max() avoids sorting a copy of the values and also works on dict views.
    return max(d_items.values())

def get_value(d_items, key):
    """Return the id stored for ``key``, registering a new id if it is unknown.

    The key is converted to text before it is looked up, and a new entry is
    stored under that same text form, so a later lookup of the same key finds
    it. Membership is tested explicitly, so an id of 0 (or any other falsy
    value) is returned as-is instead of being treated as missing.

    Args:
        d_items: dict mapping text keys to ids; updated in place when
            ``key`` is not present yet.
        key: the key to look up.

    Returns:
        The existing id, or ``get_lValue(d_items) + 1`` for a new key.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str      # Python 3: str is already the text type
    lookup_key = text_type(key)
    if lookup_key in d_items:
        return d_items[lookup_key]
    value = get_lValue(d_items) + 1
    d_items[lookup_key] = value  # register under the key form used for lookups
    return value

In [9]:
# Load the pickled dictionary and record its largest value; the n-gram cell
# further down uses last_item + 1 as the first n-gram id.
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# load files from a trusted source.
with open("./data/ann/dicFile_50_50.p", "rb") as dict_file:
    dict_50_50 = pickle.load(dict_file)
last_item = get_lValue(dict_50_50)
print(last_item)

29505


In [10]:
def build_arch():
    """Create the (uncompiled) classifier network.

    Layer stack, in order: an Embedding layer, global average pooling over
    the sequence axis, a 128-unit sigmoid Dense layer and a single-unit
    sigmoid output layer.

    Reads the notebook-level settings ``max_features``, ``embedding_size``
    and ``maxlen`` at call time, so they must be defined before calling.

    Returns:
        A new, uncompiled ``Sequential`` model.
    """
    print('Build model...')
    layer_stack = [
        Embedding(max_features, embedding_size, input_length=maxlen),
        GlobalAveragePooling1D(),
        Dense(128, activation='sigmoid'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack)

In [11]:
def cross_val(tweet_sec, target, nFolds, random_state=None):
    """Evaluate the architecture with stratified k-fold cross-validation.

    A fresh model is built, compiled and trained for every fold, then
    evaluated on the held-out part of that fold.

    Inputs are converted to NumPy arrays first, so the fold indices always
    select rows by position, even when ``target`` is a pandas Series whose
    index is not the default 0..n-1 range.

    Reads the notebook-level settings ``batch_size`` and ``epochs``.

    Args:
        tweet_sec: padded input sequences, one row per sample.
        target: class label of every sample.
        nFolds: number of folds.
        random_state: optional seed for the fold shuffling. The default
            ``None`` keeps the previous, unseeded behaviour.

    Returns:
        List with the test accuracy of each fold, in percent.
    """
    features = np.asarray(tweet_sec)
    labels = np.asarray(target)
    kFold = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=random_state)
    scores = []
    for train, test in kFold.split(features, labels):
        model = build_arch()
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.fit(features[train], labels[train],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  verbose=1)
        # evaluate returns (loss, accuracy); only the accuracy is kept.
        _, acc = model.evaluate(features[test], labels[test], batch_size=batch_size)
        scores.append(acc * 100)
    return scores

In [12]:
# Train on 100% of the data set (no validation split is requested).
def train_model(tweet_sec,target):
    """Build, compile and fit a fresh model on all the given samples.

    Reads the notebook-level settings ``batch_size`` and ``epochs``.

    Args:
        tweet_sec: padded input sequences, one row per sample.
        target: class label of every sample.

    Returns:
        The value returned by ``model.fit`` (not the model itself); the
        saving cells reach the fitted model through its ``.model`` attribute.
    """
    model = build_arch()
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    fit_options = dict(batch_size=batch_size, epochs=epochs, verbose=1)
    fit_result = model.fit(tweet_sec, target, **fit_options)
    return fit_result

Embedding

In [13]:
max_features = 20000 # Vocabulary size; overwritten by the n-gram cell when ngram_range > 1
maxlen = 20 # Sequence length used when padding the inputs
embedding_size = 128 # Second argument of the Embedding layer in build_arch

Model

In [14]:
# Largest n-gram size added as extra features (2 = bi-grams on top of the single tokens).
ngram_range = 2
# Inactive alternative settings, kept commented out; the live values of
# max_features, maxlen, batch_size and epochs are assigned in other cells.
#max_features = 20000
#maxlen = 400
#batch_size = 32
#embedding_dims = 50
#epochs = 5

Training

In [15]:
batch_size = 32  # batch size passed to model.fit and model.evaluate
epochs = 5  # number of epochs passed to model.fit

In [16]:
# Load the pickled data set and preview its first rows.
# NOTE(review): the file has a .txt extension but load_ds reads it with pickle.
print('Loading data...')
ds = load_ds('./data/ann/vectors_50_50.txt')
ds.head()

Loading data...


Unnamed: 0,category,tweet_sec
0,1,"[268, 459, 146, 2, 470, 4493]"
1,1,"[2264, 7, 9784, 3050]"
2,0,"[80, 172, 35, 1, 14, 39, 2265, 7, 9785, 2266, ..."
3,1,"[107, 41, 21, 471, 9, 659, 5, 216, 300, 5, 7, ..."
4,1,"[562, 106, 29, 563, 29, 422]"


In [17]:
# Mean number of token ids per tweet before n-gram augmentation; dtype=int makes np.mean report a whole number.
print('Average sequence length: {}'.format(np.mean(list(map(len, ds['tweet_sec'])), dtype=int)))

Average sequence length: 12


In [18]:
# Build the model inputs: the raw token-id sequences, augmented with n-gram
# ids when ngram_range > 1.

# N-gram ids start right after the largest id already present in the
# vocabulary dictionary, so they cannot collide with existing token ids.
start_index = last_item + 1
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Collect every distinct n-gram (n = 2 .. ngram_range) found in the corpus.
    ngram_set = set()
    for input_list in ds['tweet_sec']:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)
    # Map each n-gram to a unique integer id and keep the reverse mapping.
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features must be one more than the highest id in use. The n-gram
    # ids are contiguous from start_index, so this equals max(id) + 1 and
    # also holds when no n-gram was found at all.
    max_features = start_index + len(token_indice)
    print(max_features)
    # Append the n-gram ids to every sequence.
    x_supreme = add_ngram(ds['tweet_sec'], token_indice, ngram_range)
    print('Average sequence length: {}'.format(np.mean(list(map(len, x_supreme)), dtype=int)))
else:
    # No n-grams requested: train on the plain token sequences and size the
    # embedding from the largest id in the vocabulary dictionary.
    x_supreme = list(ds['tweet_sec'])
    max_features = start_index
    

Adding 2-gram features
122565
Average sequence length: 23


In [19]:
# Bring every sequence to exactly maxlen entries so the samples form a 2-D array.
# NOTE(review): the average augmented length printed by the previous cell (23)
# exceeds maxlen (20), so many sequences are cut here; confirm which end
# pad_sequences drops and whether maxlen should be raised.
print('Pad sequences (samples x time)')
x_supreme = sequence.pad_sequences(x_supreme, maxlen=maxlen)
print('x_train shape:', x_supreme.shape)

Pad sequences (samples x time)
('x_train shape:', (15306, 20))


In [21]:
# 10-fold stratified cross-validation; results holds one accuracy (in percent) per fold.
results = cross_val(x_supreme,ds['category'],10)

Build model...
Train on 12396 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1532 [..............................] - ETA: 0sBuild model...
Train on 12396 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1532 [..............................] - ETA: 0sBuild model...
Train on 12396 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1532 [..............................] - ETA: 0sBuild model...
Train on 12398 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1530 [..............................] - ETA: 0sBuild model...
Train on 12398 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1530 [..............................] - ETA: 0sBuild model...
Train on 12398 samples, validate on 1378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  32/1530 [..............................] - ETA: 0sBuil

In [22]:
# Mean and standard deviation of the per-fold accuracies (cross_val already returns them in percent).
print("Acc: %.2f (+/- %.2f)" %(np.mean(results), np.std(results)))

Acc: 83.81 (+/- 0.84)


In [23]:
# Per-fold accuracy (in percent), numbered from 1.
for i, fold_acc in enumerate(results, 1):
    print("Model %d, acc: %.2f " % (i, fold_acc))

Model 1, acc: 82.44 
Model 2, acc: 85.05 
Model 3, acc: 83.88 
Model 4, acc: 84.05 
Model 5, acc: 83.27 
Model 6, acc: 83.14 
Model 7, acc: 83.14 
Model 8, acc: 84.77 
Model 9, acc: 83.40 
Model 10, acc: 84.97 


In [21]:
# Fit a fresh model on the whole data set. train_model returns the value of
# model.fit, so trained_model is that fit result rather than the model itself.
trained_model = train_model(x_supreme,ds['category'])

Build model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Persist the architecture as JSON and the weights in a separate .h5 file.
# trained_model is the object returned by model.fit (see train_model); the
# fitted model is reached through its .model attribute.
model_json = trained_model.model.to_json()
with open("./data/models/fast_text/fast_text.json",'w') as json_file:
    json_file.write(model_json)
trained_model.model.save_weights("./data/models/fast_text/fast_text_weights.h5")

In [23]:
# Additionally store the model through model.save in a single .h5 file.
trained_model.model.save("./data/models/fast_text/fast_text.h5")