In [1]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os
import itertools

In [2]:
from gensim.models.keyedvectors import KeyedVectors

# Load the vectors stored in word2vec text format (binary=False).
char_embeddings = KeyedVectors.load_word2vec_format("../gensim_char-embeddings.txt", binary=False)
# load_word2vec_format already returns a KeyedVectors object. The old `.wv`
# attribute on KeyedVectors was only a deprecated alias for the object itself
# and no longer exists in gensim 4, so bind the object directly.
char_vectors = char_embeddings

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [4]:
def preprocess(X_train, X_test, y_train, y_test, maxlen=300):
    """Encode texts as padded character-index arrays and labels as one-hot rows.

    Args:
        X_train: iterable of training texts.
        X_test: iterable of test texts.
        y_train: integer class labels for the training texts.
        y_test: integer class labels for the test texts.
        maxlen: length every index sequence is padded/truncated to (default 300).

    Returns:
        A list ``[train_data, train_classes, test_data, test_classes,
        embedding_weights, vocab_size]`` where

        * ``train_data`` / ``test_data`` are float32 arrays of character indices
          with ``maxlen`` columns (0 is the padding index),
        * ``train_classes`` / ``test_classes`` are one-hot label matrices that
          share the same number of columns,
        * ``embedding_weights`` has shape ``(vocab_size + 1, vocab_size)``: an
          all-zero row for padding followed by one one-hot row per index,
        * ``vocab_size`` is the size of the fixed alphabet plus the 'UNK' entry.
    """
    # The vocabulary is fixed by the alphabet below, so the tokenizer is not
    # fitted on the texts; only its text -> index conversion is used.
    # NOTE(review): Keras' Tokenizer lower-cases its input by default, so the
    # upper-case alphabet entries are presumably never matched -- confirm
    # whether lower=False was intended.
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    # Index 0 is reserved for padding, so characters start at 1.
    char_dict = {char: i + 1 for i, char in enumerate(alphabet)}

    # Use char_dict as the tokenizer vocabulary and put 'UNK' right after it.
    tk.word_index = char_dict.copy()
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    # Convert strings to index sequences.
    train_sequences = tk.texts_to_sequences(X_train)
    test_sequences = tk.texts_to_sequences(X_test)

    # Pad (at the end) to a common length and convert to float32 arrays.
    train_data = np.array(pad_sequences(train_sequences, maxlen=maxlen, padding='post'),
                          dtype='float32')
    test_data = np.array(pad_sequences(test_sequences, maxlen=maxlen, padding='post'),
                         dtype='float32')

    vocab_size = len(tk.word_index)

    # Row 0 (padding) is all zeros; row i (1..vocab_size) is the one-hot vector
    # with a 1 at position i - 1. Built explicitly so the layout does not
    # depend on dictionary iteration order.
    embedding_weights = np.vstack([np.zeros((1, vocab_size)), np.eye(vocab_size)])

    # Derive the class count from BOTH label sets so the two one-hot matrices
    # always have the same width, even when one split lacks the highest class.
    all_labels = np.concatenate([np.ravel(y_train), np.ravel(y_test)])
    num_classes = int(all_labels.max()) + 1

    train_classes = to_categorical(y_train, num_classes=num_classes)
    test_classes = to_categorical(y_test, num_classes=num_classes)

    return [train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size]

In [5]:
def separate_data(X, y):
    """Group the rows of X by the first position (0, 1 or 2) set to 1 in y.

    Args:
        X: indexable collection of samples, aligned with ``y``.
        y: iterable of one-hot style label rows.

    Returns:
        ``(X_win, X_top10, X_rest)``: samples whose label has its first 1 at
        position 2, position 1 and position 0 respectively. Rows with no 1 in
        the first three positions are left out.
    """
    # buckets[k] collects the samples whose label row has its first 1 at position k.
    buckets = ([], [], [])

    for row_no, onehot in enumerate(y):
        for position in range(3):
            if onehot[position] == 1:
                buckets[position].append(X[row_no][:])
                break

    rest_rows, top10_rows, win_rows = buckets
    return win_rows, top10_rows, rest_rows

In [17]:
class LSTM:
    """Character-level LSTM classifier (embedding -> LSTM -> dropout -> dense).

    NOTE: this class is named ``LSTM`` and therefore rebinds the module-level
    name of the Keras ``LSTM`` layer; ``createModel`` imports the layer under
    a local alias for that reason.
    """

    def __init__(self):
        self.model = None
        self.num_of_classes = 3
        self.dropout = 0.5
        self.loss = 'categorical_crossentropy'
        self.optimizer = 'adam'
        # Softmax yields a distribution over the classes, which is what the
        # categorical cross-entropy loss above is defined on.
        self.activation = 'softmax'

    def createModel(self, embedding_layer):
        """Build and compile the network on top of ``embedding_layer``.

        The compiled model is stored in ``self.model``.
        """
        # Local alias: at module level the name ``LSTM`` refers to this class,
        # not to the Keras layer.
        from keras.layers import LSTM as LSTMLayer

        model = Sequential()
        model.add(embedding_layer)
        model.add(LSTMLayer(128))
        model.add(Dropout(self.dropout))
        model.add(Dense(self.num_of_classes, activation=self.activation))

        model.compile(loss=self.loss,
                      optimizer=self.optimizer,
                      metrics=['accuracy'])

        self.model = model

    def fit(self, X_train, y_train):
        """Train ``self.model`` on the given inputs and one-hot labels."""
        self.model.fit(X_train, y_train, batch_size=16, epochs=10)

    def evaluate(self, X_test, y_test):
        """Evaluate ``self.model``, print its predictions and return the score.

        Returns:
            Whatever ``self.model.evaluate`` returns for the test data.
        """
        score = self.model.evaluate(X_test, y_test, batch_size=16)
        print(self.model.predict(X_test))
        return score

    def run(self, X_train, X_test, y_train, y_test):
        """Preprocess one train/test split, train a fresh model and evaluate it.

        Args:
            X_train: training texts.
            X_test: test texts.
            y_train: integer labels of the training texts.
            y_test: integer labels of the test texts.

        Returns:
            The score returned by ``evaluate`` for the test split.
        """
        train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size = preprocess(X_train, X_test, y_train, y_test)

        # Take the sizes from the preprocessed data so the layer always
        # matches the weight matrix it is initialised with.
        input_size = train_data.shape[1]
        embedding_size = embedding_weights.shape[1]

        # Embedding layer initialisation (+1 row for the padding index 0)
        embedding_layer = Embedding(vocab_size + 1,
                                    embedding_size,
                                    input_length=input_size,
                                    weights=[embedding_weights])

        self.createModel(embedding_layer)
        self.fit(train_data, train_classes)
        return self.evaluate(test_data, test_classes)
        
        

class CNN:
    """Character-level 1-D convolutional classifier with leave-one-group-out evaluation."""

    def __init__(self):
        self.model = None
        # Set by run() before createModel() is called.
        self.embedding_layer = None
        self.num_of_classes = 3
        self.dropout = 0.5
        self.optimizer = 'adam'
        self.loss = 'categorical_crossentropy'

    def createModel(self, input_size, conv_layers, fully_connected_layers):
        """Build and compile the network; the result is stored in ``self.model``.

        Args:
            input_size: number of character indices per sample.
            conv_layers: iterable of ``[filter_num, filter_size, pooling_size]``;
                a ``pooling_size`` of -1 means "no pooling after this layer".
            fully_connected_layers: sizes of the dense layers before the output.

        Raises:
            RuntimeError: if ``self.embedding_layer`` has not been set.
        """
        if self.embedding_layer is None:
            raise RuntimeError('embedding_layer must be set before createModel() is called')

        # Input: one row of `input_size` character indices per sample
        inputs = Input(shape=(input_size,), name='input', dtype='int64')
        # Embedding
        x = self.embedding_layer(inputs)
        # Convolution blocks (intermediate shapes depend on input_size)
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x)
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)
        # Fully connected layers, each followed by dropout
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout)(x)
        # Output layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)
        # Build model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model

    def fit(self, train_data, train_classes):
        """Train ``self.model``, holding out a shuffled 10% of the data for validation."""
        X_train, X_val, y_train, y_val = train_test_split(train_data, train_classes, test_size=0.1, shuffle=True)

        self.model.fit(X_train, y_train,
                       validation_data=(X_val, y_val),
                       batch_size=128,
                       epochs=12,
                       verbose=2)

    def evaluate(self, X_test, y_test):
        """Print and return the pairwise ranking accuracy on the test data.

        For every (top10 sample, rest sample) pair, the pair counts as correct
        when the model's class-0 output is strictly higher for the rest sample
        than for the top10 sample. A sample is "rest" when its label row has a
        1 at position 0, and "top10" when it has a 1 at position 1 but not at 0.

        Args:
            X_test: model inputs for the test samples.
            y_test: one-hot label rows aligned with ``X_test``.

        Returns:
            The fraction of correctly ordered pairs, or ``None`` when there is
            no (top10, rest) pair to compare.
        """
        y_true = np.asarray(y_test)
        if y_true.ndim != 2 or y_true.shape[0] == 0:
            print('accuracy: undefined (no top10/rest pairs to compare)')
            return None

        # One batched prediction instead of two predict() calls per pair.
        y_pred = np.asarray(self.model.predict(X_test))
        class0_scores = y_pred[:, 0]

        is_rest = y_true[:, 0] == 1
        if y_true.shape[1] > 1:
            is_top10 = ~is_rest & (y_true[:, 1] == 1)
        else:
            # Labels without a second column cannot contain any top10 sample.
            is_top10 = np.zeros(y_true.shape[0], dtype=bool)

        rest_scores = np.sort(class0_scores[is_rest])
        top10_scores = class0_scores[is_top10]

        cnt = int(rest_scores.size) * int(top10_scores.size)
        if cnt == 0:
            print('accuracy: undefined (no top10/rest pairs to compare)')
            return None

        # For each top10 score, count the rest scores strictly above it.
        not_above = np.searchsorted(rest_scores, top10_scores, side='right')
        bingo_cnt = int((rest_scores.size - not_above).sum())

        accuracy = bingo_cnt / cnt
        print('accuracy:', accuracy)
        return accuracy

    def run(self, Xs, ys, ht_list):
        """Leave-one-group-out loop: train on all groups but one, evaluate on that one.

        Args:
            Xs: list of per-group lists of texts.
            ys: list of per-group lists of integer labels, aligned with ``Xs``.
            ht_list: per-group identifiers; not used by the computation.

        Returns:
            A list with the value returned by ``evaluate`` for each held-out
            group, in group order.
        """
        conv_layers = [[256, 7, 3],
                       [256, 7, 3],
                       [256, 3, -1],
                       [256, 3, -1],
                       [256, 3, -1],
                       [256, 3, 3]]

        fully_connected_layers = [1024, 1024]

        fold_accuracies = []
        num_hts = len(ys)
        for i in range(num_hts):

            # Everything except group i is training data; group i is the test set.
            X_train = [*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])]
            y_train = [*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])]
            X_test = Xs[i]
            y_test = ys[i]

            train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size = preprocess(X_train, X_test, y_train, y_test)

            # Take the sizes from the preprocessed data so the layer always
            # matches the weight matrix it is initialised with.
            input_size = train_data.shape[1]
            embedding_size = embedding_weights.shape[1]

            # Embedding layer initialisation (+1 row for the padding index 0)
            self.embedding_layer = Embedding(vocab_size + 1,
                                             embedding_size,
                                             input_length=input_size,
                                             weights=[embedding_weights])

            self.createModel(input_size, conv_layers, fully_connected_layers)
            self.fit(train_data, train_classes)
            fold_accuracies.append(self.evaluate(test_data, test_classes))

        return fold_accuracies
            

In [7]:
def load_document(data_location, htf):
    """Read one tab-separated file into parallel lists of tweets and labels.

    Each non-blank line is stripped and split on tabs; the second field is
    taken as the tweet text and the third as its integer label. Blank lines
    are skipped.

    Args:
        data_location: directory containing the file.
        htf: file name inside ``data_location``.

    Returns:
        ``{'tweets': [...], 'labels': [...]}`` with one entry per non-blank line.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the third field is not an integer.
    """
    tweets = []
    labels = []

    # The context manager closes the file even if a line fails to parse.
    with open(os.path.join(data_location, htf)) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            line_split = stripped.split('\t')
            tweets.append(line_split[1])
            labels.append(int(line_split[2]))

    return {'tweets': tweets, 'labels': labels}

In [8]:
def create_data(data_location):
    """Load every data file in ``data_location``, in sorted file-name order.

    Directory entries that are not regular files, and entries whose name
    starts with a dot, are skipped so that stray folders or hidden files in
    the data directory do not break loading.

    Args:
        data_location: directory holding one data file per group.

    Returns:
        ``(Xs, ys, ht_list)``: per-file lists of tweets, per-file lists of
        labels, and the corresponding file names, all in the same order.
    """
    Xs = []
    ys = []
    ht_list = []

    for htf in sorted(os.listdir(data_location)):
        if htf.startswith('.') or not os.path.isfile(os.path.join(data_location, htf)):
            continue

        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['labels'])
        Xs.append(ht_dict['tweets'])

    return Xs, ys, ht_list

In [16]:
Xs, ys, ht_list = create_data('../semi_data')

# Keep a reference to the instance itself: binding the result of run()
# would discard the object that holds the trained model.
cnn = CNN()
cnn.run(Xs, ys, ht_list)

Train on 1613 samples, validate on 180 samples
Epoch 1/20
 - 11s - loss: 0.5214 - acc: 0.8661 - val_loss: 0.3041 - val_acc: 0.9389
Epoch 2/20
 - 8s - loss: 0.3266 - acc: 0.9206 - val_loss: 0.2706 - val_acc: 0.9389
Epoch 3/20
 - 8s - loss: 0.3124 - acc: 0.9206 - val_loss: 0.2849 - val_acc: 0.9389
Epoch 4/20
 - 8s - loss: 0.3065 - acc: 0.9206 - val_loss: 0.2518 - val_acc: 0.9389
Epoch 5/20
 - 8s - loss: 0.2963 - acc: 0.9206 - val_loss: 0.2412 - val_acc: 0.9389
Epoch 6/20
 - 8s - loss: 0.2884 - acc: 0.9206 - val_loss: 0.2382 - val_acc: 0.9389
Epoch 7/20
 - 8s - loss: 0.2729 - acc: 0.9206 - val_loss: 0.2564 - val_acc: 0.9389
Epoch 8/20
 - 8s - loss: 0.2713 - acc: 0.9206 - val_loss: 0.2508 - val_acc: 0.9389
Epoch 9/20
 - 8s - loss: 0.2404 - acc: 0.9206 - val_loss: 0.2610 - val_acc: 0.9389
Epoch 10/20
 - 8s - loss: 0.2332 - acc: 0.9206 - val_loss: 0.4224 - val_acc: 0.9389
Epoch 11/20
 - 8s - loss: 0.2483 - acc: 0.9206 - val_loss: 0.2680 - val_acc: 0.9389
Epoch 12/20
 - 8s - loss: 0.2145 - ac

KeyboardInterrupt: 