In [2]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os
import itertools

In [3]:
from gensim.models.keyedvectors import KeyedVectors

char_embeddings = KeyedVectors.load_word2vec_format("../gensim_char-embeddings.txt", binary=False)
char_vectors = char_embeddings.wv

In [4]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("../gensim_glove_vectors25.txt", binary=False)
vectors = glove_model.wv

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Activation, Flatten, concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [6]:
def preprocess(X_train, X_test, y_train, y_test):
    # Tokenizer
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
    tk.fit_on_texts(X_train)

    
    # alphabet="abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1

        
    # Use char_dict to replace the tk.word_index
    tk.word_index = char_dict.copy() 
    # Add 'UNK' to the vocabulary 
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    
    # Convert string to index 
    train_sequences = tk.texts_to_sequences(X_train)
    test_texts = tk.texts_to_sequences(X_test)

    # Padding
    train_data = pad_sequences(train_sequences, maxlen=300, padding='post')
    test_data = pad_sequences(test_texts, maxlen=300, padding='post')

    # Convert to numpy array
    train_data = np.array(train_data, dtype='float32')
    test_data = np.array(test_data, dtype='float32')
    
    
    vocab_size = len(tk.word_index)

    embedding_weights = [] #(97, 96)
    embedding_weights.append(np.zeros(vocab_size)) # first row is pad

    for char, i in tk.word_index.items(): # from index 1 to 70
        onehot = np.zeros(vocab_size)
        onehot[i-1] = 1
        embedding_weights.append(onehot)
            
    embedding_weights = np.array(embedding_weights)
    
#     print(np.shape(y_train))
    
    train_classes = to_categorical(y_train)
    test_classes = to_categorical(y_test)
    
    return [train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size]

In [7]:
def separate_data(X, y):
    X_win = []
    X_top10 = []
    X_rest = []
    
    for idx, line in enumerate(y):
        if line[0] == 1:
            X_rest.append(X[idx][:])
        elif line[1] == 1:
            X_top10.append(X[idx][:])
        elif line[2] == 1:
            X_win.append(X[idx][:])
            
    return X_win, X_top10, X_rest

In [8]:
def averageSimilarity(tokens):
    counter = 0; suma = 0
    
    for w1, w2 in itertools.combinations(tokens, 2):
        w1 = w1.lower(); w2 = w2.lower()
        if w1 not in vectors.vocab or w2 not in vectors.vocab:
            continue
        suma += vectors.similarity(w1, w2)
        counter += 1
        
    if counter != 0:
        return suma /  counter
    else:
        return 0

In [9]:
def extractFeatures(X):
    
    features = np.zeros((1, 5))
    
    for tweet in X:
        tokens = tweet.split()
        
        words_num = len(tweet.split())
        chars_num = np.sum(np.array([len(token) for token in tokens]))
        unk_num = np.sum(np.array([1 for token in tokens if token.lower() not in vectors.vocab]))
        caps_num = np.sum(np.array([1 for token in tokens if token.isupper()]))
        average_similarity = averageSimilarity(tokens)
        
        arr = np.array([words_num, chars_num, unk_num, caps_num, average_similarity]).reshape(1, 5)
        
        features = np.vstack((features, arr))
    
    return features[1:,:]

In [10]:
class MY_LSTM:
    def __init__(self):
        self.model = None
        self.num_of_classes = 3
        self.dropout = 0.5
        self.loss = 'categorical_crossentropy'
        self.optimizer = 'adam'
        self.activation = 'sigmoid'
        self.cnt = 0
        self.bingo_cnt = 0
        
    def createModel(self, embedding_layer):
        model = Sequential()
        model.add(embedding_layer)
        model.add(LSTM(128))
        model.add(Dropout(self.dropout))
        model.add(Dense(3, activation = self.activation))

        model.compile(loss=self.loss,
                      optimizer=self.optimizer,
                      metrics=['accuracy'])
        
        self.model = model
        
        
    def fit(self, train_data, train_classes):
        
        X_train, X_val, y_train, y_val = train_test_split(train_data, train_classes, test_size=0.1, shuffle=True)
        
        self.model.fit(X_train, y_train, 
                       validation_data=(X_val, y_val),
                       batch_size=16, 
                       epochs=7, 
                       verbose=2)
        
    def evaluate(self, X_test, y_test):
        score = self.model.evaluate(X_test, y_test, batch_size=16)
        
        X_win, X_top10, X_rest = separate_data(X_test, y_test)
        
        cnt = 0
        bingo_cnt = 0
        
        for tweet_pair in itertools.product(X_top10, X_rest):
            cnt += 1
            y_top10 = self.model.predict(tweet_pair[0].reshape(1, 300))
            y_rest = self.model.predict(tweet_pair[1].reshape(1, 300))

            if y_rest[0][0] > y_top10[0][0]:
                bingo_cnt += 1
        
        print('accuracy:', bingo_cnt / cnt)
        self.cnt += cnt
        self.bingo_cnt += bingo_cnt
        
        
    def run(self, Xs, ys):
        
        num_hts = len(ys)
        for i in range(num_hts):
            
            X_train = [*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])]
            y_train = [*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])]
            X_test = Xs[i]
            y_test = ys[i]
            
            train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size = preprocess(X_train, X_test, y_train, y_test)
            
            input_size = 300
            embedding_size = 96
            
            # Embedding layer Initialization
            embedding_layer = Embedding(vocab_size + 1,
                                         embedding_size,
                                         input_length=input_size,
                                         weights=[embedding_weights])

            self.createModel(embedding_layer)
            self.fit(train_data, train_classes)
            self.evaluate(test_data, test_classes)
            
            print('Current total accuracy:', self.bingo_cnt / self.cnt)
            
        print('Total accuracy:', self.bingo_cnt / self.cnt)
        
        

class MY_CNN:
    
    def __init__(self):
        self.model = None
        self.num_of_classes = 3
        self.dropout = 0.2
        self.optimizer = 'adam'
        self.loss = 'categorical_crossentropy'
        self.cnt = 0
        self.bingo_cnt = 0
        
    def createModel(self, input_size, conv_layers, fully_connected_layers):
        # Input
        inputs = Input(shape=(input_size,), name='input', dtype='int64')
        
        # Embedding 
        x = self.embedding_layer(inputs)
        
        # Conv 
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x) 
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)
        
        # Fully connected layers 
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout)(x)
            
        # Output Layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)
        
        # Build model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model
#         self.model.summary()
        
        
    def fit(self, train_data, train_classes):
        X_train, X_val, y_train, y_val = train_test_split(train_data, train_classes, test_size=0.1, shuffle=True)
        
#         class_weight = {0: 10,
#                 1: 1,
#                 2: 1}
        
        self.model.fit(X_train, y_train, 
                       validation_data=(X_val, y_val),
                       batch_size=128, 
                       epochs=2, 
                       verbose=2)
        
    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        
        X_win, X_top10, X_rest = separate_data(X_test, y_test)
        
        cnt = 0
        bingo_cnt = 0
        
        for tweet_pair in itertools.product(X_top10, X_rest):
            cnt += 1
            y_top10 = self.model.predict(tweet_pair[0].reshape(1, 300))
            y_rest = self.model.predict(tweet_pair[1].reshape(1, 300))

            if y_rest[0][0] > y_top10[0][0]:
                bingo_cnt += 1
                
        for tweet_pair in itertools.product(X_top10, X_win):
            cnt += 1
            y_top10 = self.model.predict(tweet_pair[0].reshape(1, 300))
            y_win = self.model.predict(tweet_pair[1].reshape(1, 300))

            if y_top10[0][0] > y_win[0][0]:
                bingo_cnt += 1
                
        for tweet_pair in itertools.product(X_rest, X_win):
            cnt += 1
            y_rest = self.model.predict(tweet_pair[0].reshape(1, 300))
            y_win = self.model.predict(tweet_pair[1].reshape(1, 300))

            if y_rest[0][0] > y_win[0][0]:
                bingo_cnt += 1
        
        print('accuracy:', bingo_cnt / cnt)
        self.cnt += cnt
        self.bingo_cnt += bingo_cnt
        
    def run(self, Xs, ys):
        
        conv_layers = [[100, 5, 3],
               [100, 5, 3]]

        fully_connected_layers = [256, 256]
        
        num_hts = len(ys)
        for i in range(num_hts):
            
            X_train = [*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])]
            y_train = [*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])]
            X_test = Xs[i]
            y_test = ys[i]
            
            train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size = preprocess(X_train, X_test, y_train, y_test)
            
            input_size = 300
            embedding_size = 96
            
            # Embedding layer Initialization
            self.embedding_layer = Embedding(vocab_size + 1,
                                             embedding_size,
                                             input_length=input_size,
                                             weights=[embedding_weights])

            self.createModel(input_size, conv_layers, fully_connected_layers)
            self.fit(train_data, train_classes)
            self.evaluate(test_data, test_classes)
            
            print('Current total accuracy:', self.bingo_cnt / self.cnt)
            
        print('Total accuracy:', self.bingo_cnt / self.cnt)
        
        
        
        
    def run2(self, Xs, ys, Xt, yt, h):
        
        conv_layers = [[100, 5, 3],
               [100, 5, 3]]

        fully_connected_layers = [256, 256]
        
        first = True
        
        num_hts = len(yt)
        for i in range(num_hts):
            
            print(h[i])
            
            X_train = [*itertools.chain.from_iterable(Xs[:])]
            y_train = [*itertools.chain.from_iterable(ys[:])]
            X_test = Xt[i]
            y_test = yt[i]
            
            train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size = preprocess(X_train, X_test, y_train, y_test)
            
            input_size = 300
            embedding_size = 96
            
            # Embedding layer Initialization
            self.embedding_layer = Embedding(vocab_size + 1,
                                             embedding_size,
                                             input_length=input_size,
                                             weights=[embedding_weights])

            if first == True:
                self.createModel(input_size, conv_layers, fully_connected_layers)
                self.fit(train_data, train_classes)
                first = False
            self.evaluate(test_data, test_classes)
            
            print('Current total accuracy:', self.bingo_cnt / self.cnt)
            
        print('Total accuracy:', self.bingo_cnt / self.cnt)
            

In [11]:
def load_document(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    return {'tweets': tweets, 'labels': labels}

In [12]:
def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['labels'])
        Xs.append(ht_dict['tweets'])
        
    return Xs, ys, ht_list

In [26]:
Xs, ys, ht_list = create_data('../train_data')
Xt, yt, h = create_data('../gold_data')

cnn = MY_CNN().run2(Xs, ys, Xt, yt, h)

Bad_Job_In_5_Words.tsv
Train on 10192 samples, validate on 1133 samples
Epoch 1/2
 - 14s - loss: 0.3557 - acc: 0.9037 - val_loss: 0.3487 - val_acc: 0.9082
Epoch 2/2
 - 12s - loss: 0.3095 - acc: 0.9122 - val_loss: 0.3558 - val_acc: 0.9082
accuracy: 0.7367829021372329
Current total accuracy: 0.7367829021372329
Break_Up_In_5_Words.tsv
accuracy: 0.7072838665759019
Current total accuracy: 0.7184054283290925
Broadway_A_Celeb.tsv
accuracy: 0.6274509803921569
Current total accuracy: 0.6833463643471462
Cereal_Songs.tsv
accuracy: 0.54421768707483
Current total accuracy: 0.6539251952322236
Modern_Shakespeare.tsv
accuracy: 0.8187038556193601
Current total accuracy: 0.6869350862777321
Ruin_A_Christmas_Movie.tsv
accuracy: 0.6554132712456344
Current total accuracy: 0.6830357142857143
Total accuracy: 0.6830357142857143
