In [1]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os
import itertools

In [2]:
from gensim.models.keyedvectors import KeyedVectors

char_embeddings = KeyedVectors.load_word2vec_format("../gensim_char-embeddings.txt", binary=False)
char_vectors = char_embeddings.wv

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM

Using TensorFlow backend.


In [4]:
def load_document(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    return {'tweets': tweets, 'labels': labels}

In [5]:
def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['labels'])
        Xs.append(ht_dict['tweets'])

    X_train = np.array([*itertools.chain.from_iterable(Xs[1:])])
    y_train = np.array([*itertools.chain.from_iterable(ys[1:])])
    X_test = Xs[1]
    y_test = ys[1]
    
    y_train[y_train == 2] = 1
    y_test[y_test == 2] = 1
        
    return X_train, y_train, X_test, y_test, ht_list

In [6]:
X_train, y_train, X_test, y_test, ht_list = create_data('../trial_data')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(X_train)

In [7]:
# alphabet="abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
    
# Use char_dict to replace the tk.word_index
tk.word_index = char_dict.copy() 
# Add 'UNK' to the vocabulary 
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

In [8]:
# Convert string to index 
train_sequences = tk.texts_to_sequences(X_train)
test_texts = tk.texts_to_sequences(X_test)

# Padding
train_data = pad_sequences(train_sequences, maxlen=300, padding='post')
test_data = pad_sequences(test_texts, maxlen=300, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [9]:
vocab_size = len(tk.word_index)

embedding_weights = [] #(97, 96)
embedding_weights.append(np.zeros(vocab_size)) # first row is pad

for char, i in tk.word_index.items(): # from index 1 to 70
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [10]:
from keras.utils import to_categorical

train_classes = to_categorical(y_train)
test_classes = to_categorical(y_test)

In [11]:
# parameter 
input_size = 300
# vocab_size = 69
embedding_size = 96

num_of_classes = 2
dropout_p = 0.5
optimizer = 'adam'
loss = 'binary_crossentropy'

conv_layers = [[256, 7, 3],
               [256, 7, 3],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]

# Embedding layer Initialization
embedding_layer = Embedding(vocab_size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

In [12]:
# Model 

# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, 1014)
# Embedding 
x = embedding_layer(inputs)
# Conv 
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x) 
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)
x = Flatten()(x) # (None, 8704)
# Fully connected layers 
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x) # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(2, activation='softmax')(x)
# Build model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [14]:
# Training
model.fit(train_data, train_classes,
          batch_size=128,
          epochs=5,
          verbose=2)

y_pred = model.predict(test_data)

print(y_test)
print(y_pred)
# print(np.hstack((y_test.reshape(len(y_test), 1), y_pred)))

Epoch 1/5
 - 3s - loss: 0.2845 - acc: 0.9273
Epoch 2/5
 - 3s - loss: 0.2699 - acc: 0.9273
Epoch 3/5
 - 3s - loss: 0.2528 - acc: 0.9273
Epoch 4/5
 - 3s - loss: 0.2373 - acc: 0.9273
Epoch 5/5
 - 3s - loss: 0.2459 - acc: 0.9273
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1]
[[0.89430815 0.10569185]
 [0.8880926  0.11190744]
 [0.87364614 0.12635386]
 [0.9407357  0.05926429]
 [0.9551587  0.04484131]
 [0.89123946 0.10876048]
 [0.8781354  0.12186453]
 [0.8967027  0.10329729]
 [0.8766388  0.12336116]
 [0.87044525 0.12955478]
 [0.90878403 0.09121599]
 [0.91124123 0.08875876]
 [0.94026583 0.05973413]
 [0.89431006 0.10568997]
 [0.8845927  0.11540724]
 [0.90086675 0.09913322]
 [0.8