In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import load_model
import random
# Instantly make your loops show a smart progress meter - 
# just wrap any iterable with tqdm(iterable), and you’re done!
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import KeyedVectors
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Load the preprocessed tweets and add label to distinguish between positive and negative ones.

In [2]:
data = pd.read_pickle('twitter-datasets/full_process_train')
data['target'] = data['target'].apply(lambda x: 0 if x == -1 else 1)

In [3]:
X = data.tweet.values.tolist()
y = data.target.values.tolist()

In [4]:
X[:3]

['positive thank you jamally mal ! <repeat> congrats to jer bear for her new job',
 'and after that conce you wil <redundant> be boyfriend-les <redundant> ahah . <repeat>',
 'finder series <number> : target in the finder yaoi paperback during a routine journalism assignment , akihito takara ...']

The function **train_test_split** splits arrays or matrices into random train and test subsets
- **test_size** : if float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split
- **random_state** : if int, random_state is the seed used by the random number generator

Here
**test_size=.05**

In [5]:
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = \
        train_test_split(X, y, test_size=.05,random_state=SEED)
x_validation, x_test, y_validation, y_test = \
        train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)
#list(x_validation_and_test)

In [6]:
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,                                                                          
                (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 2116371 entries with 0.00% negative, 0.00% positive
Validation set has total 55694 entries with 0.20% negative, 0.20% positive
Test set has total 55694 entries with 0.06% negative, 0.06% positive


In [7]:
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                    (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                    (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                    (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                    (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                    (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                    (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 2116371 entries with 0.00% negative, 0.00% positive
Validation set has total 55694 entries with 0.20% negative, 0.20% positive
Test set has total 55694 entries with 0.06% negative, 0.06% positive


## Word2Vec

This function "home made" returns a **list** of elements of type **TaggedDocument**

In [9]:
def labelize(tweets,label):
    result = []
    prefix = label
    for i, t in enumerate(tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

We append again **all the data** together and labelize it.

In [10]:
all_x = x_train + x_validation + x_test
all_x_w2v = labelize(all_x, 'all')
#all_x_w2v
print(type(all_x_w2v))
all_x_w2v[0:3]

<class 'list'>


[TaggedDocument(words=['such', 'negative', 'harsh', 'word', 'i', 'positive', 'love', 'fuko', 'with', 'al', '<redundant>', 'my', 'hea', '!', '<repeat>'], tags=['all_0']),
 TaggedDocument(words=['al', '<redundant>', 'the', 'time', ')', '<repeat>'], tags=['all_1']),
 TaggedDocument(words=['vaultz', 'vz01094', '<number>', '-', 'drawer', 'locking', 'cd', 'storage', 'cabinet', 'hold', 'up', 'to', '<number>', 'cd', 'black', 'with', 'chrome', 'accent', 'lock', 'it', 'up', 'w', '...'], tags=['all_2'])]

Creation of abstract model **CBOW** (Continuous Bag Of Words)

Parameters:
- **size=100**
- workers=nb_cores

In [11]:
nb_cores = multiprocessing.cpu_count()
print(nb_cores)
cbow_model = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, 
                         workers=nb_cores, alpha=0.065, min_alpha=0.065)

8


Concretisation of the abstract model CBOW (Continuous Bag Of Words) by associating the correspondant vocabulary.

In [12]:
cbow_model.build_vocab([x.words for x in tqdm(all_x_w2v)])
#cbow_model.build_vocab([x.words for x in all_x_w2v])

100%|██████████| 2227759/2227759 [00:00<00:00, 2873772.39it/s]


We train **CBOW** with the function **train**.

In [13]:
%%time
for epoch in range(30):
    #cbow_model.train(utils.shuffle([x.words for x in all_x_w2v]),
    cbow_model.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), 
                total_examples=len(all_x_w2v), epochs=1)
    cbow_model.alpha -= 0.002
    cbow_model.min_alpha = cbow_model.alpha

100%|██████████| 2227759/2227759 [00:00<00:00, 2924347.49it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2627977.03it/s]
100%|██████████| 2227759/2227759 [00:22<00:00, 97680.29it/s] 
100%|██████████| 2227759/2227759 [00:00<00:00, 2441927.41it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2926956.38it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2918282.06it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2924734.68it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2920541.43it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2848379.60it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2734804.09it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2889290.33it/s]
100%|██████████| 2227759/2227759 [00:01<00:00, 1842779.92it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2932373.13it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2917156.87it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2940750.19it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2932389.7

CPU times: user 46min 34s, sys: 12 s, total: 46min 46s
Wall time: 14min 41s


We create the abstract model **SG** (Skip Gram).


Parameters:
- **size=100**
- workers=nb_cores

In [14]:
sg_model = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=nb_cores, 
                       alpha=0.065, min_alpha=0.065)

Concretisation of the abstract model SG by associating the correspondant vocabulary.

In [15]:
sg_model.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 2227759/2227759 [00:00<00:00, 2853034.31it/s]


We train **SG**.

In [16]:
%%time
for epoch in range(30):
    sg_model.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), 
                      total_examples=len(all_x_w2v), epochs=1)
    sg_model.alpha -= 0.002
    sg_model.min_alpha = sg_model.alpha

100%|██████████| 2227759/2227759 [00:00<00:00, 2929133.72it/s]
100%|██████████| 2227759/2227759 [00:01<00:00, 1840042.28it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2693609.92it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2884437.42it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2367773.82it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2864218.94it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2874364.68it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2568116.99it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2900862.47it/s]
100%|██████████| 2227759/2227759 [00:01<00:00, 2102404.33it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2885491.17it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2736100.61it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2603582.55it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2902390.67it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2322299.76it/s]
100%|██████████| 2227759/2227759 [00:00<00:00, 2906641.

CPU times: user 1h 51min 12s, sys: 14.4 s, total: 1h 51min 27s
Wall time: 18min 54s


Save the models to reuse them later.

In [18]:
cbow_model.save('cbow_model.word2vec')
sg_model.save('sg_model.word2vec')

## CNN (Convolutional Neural Network)

Load the previous models.

In [19]:
cbow_model = KeyedVectors.load('cbow_model.word2vec')
sg_model = KeyedVectors.load('sg_model.word2vec')

In [20]:
len(cbow_model.wv.vocab.keys())

173940

We get more "specific" vectors by concatenating the corresponding vectors obtained with CBOW and SG. The dimension of all the vectors is therefore the sum of the dimensions of BOW and SG.

In [21]:
embeddings_dico = {}
for w in cbow_model.wv.vocab.keys():
    embeddings_dico[w] = np.append(cbow_model.wv[w],sg_model.wv[w])
print('Found', len(embeddings_dico), ' word vectors.')

Found 173940  word vectors.


We set **num_words** which give the size of the vocabs that will be used.

In [22]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences[1]

Using TensorFlow backend.


[41, 1, 6, 61, 12]

In [23]:
len(tokenizer.word_index)

428139

In [24]:
for x in x_train[:5]:
    print(x)

such negative harsh word i positive love fuko with al <redundant> my hea ! <repeat>
al <redundant> the time ) <repeat>
vaultz vz01094 <number> - drawer locking cd storage cabinet hold up to <number> cd black with chrome accent lock it up w ...
earring stand positive clear <number> <number> " w x <number> <number> " d x <number> <number> " h this revolving earring stand can be used veically or hori ...
she is a positive wonderful person & i wudnt mind talkn <number> her but shes gettn positive ready <number> graduate & i always end up sayn sumthin negative stupid wen im wit her


In [25]:
sequences[1]

[41, 1, 6, 61, 12]

In [26]:
length = []
for x in x_train:
    length.append(len(x.split()))

**max(length)** gives us the maximum number of words in a sentence within the training data.

In [27]:
max(length)

123

Here, we chose **maxlen=127** bigger thatn max(len) calculated above.

In [28]:
x_train_seq = pad_sequences(sequences, maxlen=127)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (2116371, 127)


In [29]:
x_train_seq[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, 41,  1,  6, 61, 12], dtype=int32)

**maxlen=127**

In [30]:
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=127)

We define the variable **num_words = 100000** which specifies the number of words appearing most frequently in the training set and which will be taken into account.

Otherwise, all words of the vocabulary will be considered.

In [31]:
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_dico.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## CNN - Implementation

For now, let's create a **"simple" CNN model with bigram filters**.

### CNN model with bigram filters

Below, we use a 1D convolution (better than 2D) with **100** width filters **2** (so we do 2-grams) and with strides of 1.

- **100000** because of **num_words = 100000**
- **200** is the dimension of the vector
- **input_length=127** is **input_length=127** of before

In [35]:
structure_test = Sequential()
e = Embedding(100000, 200, input_length=127)
structure_test.add(e)
structure_test.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
structure_test.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 127, 200)          20000000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 126, 100)          40100     
Total params: 20,040,100
Trainable params: 20,040,100
Non-trainable params: 0
_________________________________________________________________


Now, if we add **Global Max Pooling** layer, then the pooling layer will extract the maximum value from each filter, and the output dimension will be a just 1-dimensional vector with length as same as the number of filters we applied. This can be directly passed on to a dense layer without flattening.

In [36]:
structure_test = Sequential()
e = Embedding(100000, 200, input_length=127)
structure_test.add(e)
structure_test.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
structure_test.add(GlobalMaxPooling1D())
structure_test.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 127, 200)          20000000  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 126, 100)          40100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
Total params: 20,040,100
Trainable params: 20,040,100
Non-trainable params: 0
_________________________________________________________________


### Case 1)
Only **weights** parameter (because **trainable** is False)

In [None]:
model_cnn_01 = Sequential()
#e = Embedding(100000, 200, weights=[embedding_matrix], input_length=127)
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=127, trainable = False)
model_cnn_01.add(e)
model_cnn_01.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_01.add(GlobalMaxPooling1D())
model_cnn_01.add(Dense(256, activation='relu'))
model_cnn_01.add(Dense(1, activation='sigmoid'))
model_cnn_01.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), 
                 epochs=4, batch_size=32, verbose=2)

Train on 180000 samples, validate on 10000 samples
Epoch 1/5
 - 186s - loss: 0.4034 - acc: 0.8087 - val_loss: 0.3747 - val_acc: 0.8187
Epoch 2/5
 - 876s - loss: 0.3582 - acc: 0.8353 - val_loss: 0.3632 - val_acc: 0.8277
Epoch 3/5


  % delta_t_median)
  % delta_t_median)
  % delta_t_median)


### Case 2)
**No** parameter **weights**.

In [None]:
model_cnn_02 = Sequential()
e = Embedding(100000, 200, input_length=127)
model_cnn_02.add(e)
model_cnn_02.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_02.add(GlobalMaxPooling1D())
model_cnn_02.add(Dense(256, activation='relu'))
model_cnn_02.add(Dense(1, activation='sigmoid'))
model_cnn_02.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_02.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), 
                 epochs=5, batch_size=32, verbose=2)

### Case 3)
**weights** and **trainable** parameters.

In [None]:
model_cnn_03 = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=127, trainable=True)
model_cnn_03.add(e)
model_cnn_03.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_03.add(GlobalMaxPooling1D())
model_cnn_03.add(Dense(256, activation='relu'))
model_cnn_03.add(Dense(1, activation='sigmoid'))
model_cnn_03.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_03.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), 
                 epochs=5, batch_size=32, verbose=2)

### CNN more elaborate

In [None]:
seed = 7

Now, we are improving the "simple" CNN model with bigram filters defined above.

Specifically, we combine several 1Convolutions of 2grams, 3grams and 4 grams.

- tweet_input = Input(shape=(**127**,), dtype='int32')
- tweet_encoder = Embedding(**100000**, **200**, weights=[embedding_matrix], input_length=**127**, trainable=True)(tweet_input)
- bigram_branch = Conv1D(filters=**100**, kernel_size=2, padding='valid', activation='relu', strides=**1**)(tweet_encoder)                          

In [37]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

tweet_input = Input(shape=(127,), dtype='int32')

tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], 
                          input_length=127, trainable=True)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', 
                       activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', 
                        activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', 
                         activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)

merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 127)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 127, 200)     20000000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 126, 100)     40100       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 125, 100)     60100       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_5 (

In [None]:
filepath="CNN_best_weights.{epoch:02d}.hdf5"

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                     validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])

Train on 2116371 samples, validate on 55694 samples
Epoch 1/5

We noticed that the second epoch had the better val_acc (validation accuracy, accuracy of data that the model never see) in tweets that  so we use it next.

In [None]:
loaded_CNN_model = load_model('CNN_best_weights.02.hdf5')
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)