In [70]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os
import itertools
import operator
import collections

In [54]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Activation, Flatten, concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split

In [55]:
def preprocess(X_train, X_test, y_train, y_test):
    # Tokenizer
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
    tk.fit_on_texts(X_train)

    
    # alphabet="abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1

        
    # Use char_dict to replace the tk.word_index
    tk.word_index = char_dict.copy() 
    # Add 'UNK' to the vocabulary 
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    
    # Convert string to index 
    train_sequences = tk.texts_to_sequences(X_train)
    test_texts = tk.texts_to_sequences(X_test)

    # Padding
    train_data = pad_sequences(train_sequences, maxlen=300, padding='post')
    test_data = pad_sequences(test_texts, maxlen=300, padding='post')

    # Convert to numpy array
    train_data = np.array(train_data, dtype='float32')
    test_data = np.array(test_data, dtype='float32')
    
    
    vocab_size = len(tk.word_index)

    embedding_weights = [] #(97, 96)
    embedding_weights.append(np.zeros(vocab_size)) # first row is pad

    for char, i in tk.word_index.items(): # from index 1 to 70
        onehot = np.zeros(vocab_size)
        onehot[i-1] = 1
        embedding_weights.append(onehot)
            
    embedding_weights = np.array(embedding_weights)
    
#     print(np.shape(y_train))
    
    train_classes = to_categorical(y_train)
    test_classes = to_categorical(y_test)
    
    return [train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size, tk]

In [56]:
def preprocessTweet(x, tk):
    
    # Convert string to index 
    test_texts = tk.texts_to_sequences([x])

    # Padding
    test_data = pad_sequences(test_texts, maxlen=300, padding='post')
    
    return test_data[0]

In [57]:
def separate_data(X, y):
    X_win = []
    X_top10 = []
    X_rest = []
    
    for idx, line in enumerate(y):
        if line == 0:
            X_rest.append(X[idx][:])
        elif line == 1:
            X_top10.append(X[idx][:])
        elif line == 2:
            X_win.append(X[idx][:])
            
    return X_win, X_top10, X_rest

In [58]:
def addOneToDict(dictionary, key):
    if key in dictionary.keys():
        dictionary[key] += 1
    else:
        dictionary[key] = 1

In [105]:
class MY_CNN:
    
    def __init__(self):
        self.model = None
        self.num_of_classes = 3
        self.dropout = 0.2
        self.optimizer = 'adam'
        self.loss = 'categorical_crossentropy'
        self.cnt = 0
        self.bingo_cnt = 0
        
    def createModel(self, input_size, conv_layers, fully_connected_layers):
        # Input
        inputs = Input(shape=(input_size,), name='input', dtype='int64')
        
        # Embedding 
        x = self.embedding_layer(inputs)
        
        # Conv 
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x) 
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)
        
        # Fully connected layers 
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout)(x)
            
        # Output Layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)
        
        # Build model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model
#         self.model.summary()
        
        
    def fit(self, train_data, train_classes):
#         X_train, X_val, y_train, y_val = train_test_split(train_data, train_classes, test_size=0.1, shuffle=True)
        
#         class_weight = {0: 10,
#                 1: 1,
#                 2: 1}
        
        self.model.fit(train_data, train_classes,
                       batch_size=128, 
                       epochs=1, 
                       verbose=1)
        
    def evaluate(self, X_test_sent, y_test_sent, tk):
        
        X_win, X_top10, X_rest = separate_data(X_test_sent, y_test_sent)
        
        results = {}
        
        for tweet_pair in itertools.product(X_top10, X_rest):
            
            y_top10 = self.model.predict(preprocessTweet(tweet_pair[0], tk).reshape(1, 300))
            y_rest = self.model.predict(preprocessTweet(tweet_pair[1], tk).reshape(1, 300))

            if y_top10[0][0] > y_rest[0][0]:
                addOneToDict(results, tweet_pair[1])
            else:
                addOneToDict(results, tweet_pair[0])
                
        for tweet_pair in itertools.product(X_top10, X_win):
            
            y_top10 = self.model.predict(preprocessTweet(tweet_pair[0], tk).reshape(1, 300))
            y_win = self.model.predict(preprocessTweet(tweet_pair[1], tk).reshape(1, 300))

            if y_top10[0][0] > y_win[0][0]:
                addOneToDict(results, tweet_pair[1])
            else:
                addOneToDict(results, tweet_pair[0])
                
        for tweet_pair in itertools.product(X_rest, X_win):

            y_rest = self.model.predict(preprocessTweet(tweet_pair[0], tk).reshape(1, 300))
            y_win = self.model.predict(preprocessTweet(tweet_pair[1], tk).reshape(1, 300))

            if y_rest[0][0] > y_win[0][0]:
                addOneToDict(results, tweet_pair[1])
            else:
                addOneToDict(results, tweet_pair[0])

        sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
        print()
        for idx, tweet in enumerate(np.array(sorted_results)[:5, 0]):
            print("{}: {}".format(idx + 1, tweet))
#         print(np.array(sorted_results)[:3, 0])
        
        
    def run(self, Xs, ys, Xt, yt, h):
        
        conv_layers = [[100, 5, 3],
               [100, 5, 3]]

        fully_connected_layers = [256, 256]
        
        first = True
        
        num_hts = len(yt)
        for i in range(num_hts):
            
            X_train = [*itertools.chain.from_iterable(Xs[:])]
            y_train = [*itertools.chain.from_iterable(ys[:])]
            X_test = Xt[i]
            y_test = yt[i]
            
            train_data, train_classes, test_data, test_classes, embedding_weights, vocab_size, tk = preprocess(X_train, X_test, y_train, y_test)
            
            input_size = 300
            embedding_size = 96
            
            # Embedding layer Initialization
            self.embedding_layer = Embedding(vocab_size + 1,
                                             embedding_size,
                                             input_length=input_size,
                                             weights=[embedding_weights])

            if first == True:
                self.createModel(input_size, conv_layers, fully_connected_layers)
                self.fit(train_data, train_classes)
                first = False
            self.evaluate(X_test, y_test, tk)
            

In [60]:
def load_document(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    return {'tweets': tweets, 'labels': labels}

In [109]:
import preprocessor as p

import re

def ukloni_topic(tweet, topic):
    pattern = re.compile("#" + topic.replace("_", ""), re.IGNORECASE)
    tweet = pattern.sub("", tweet).strip()
    pattern = re.compile("@" + topic.replace("_", ""), re.IGNORECASE)
    tweet = pattern.sub("", tweet).strip()
    return tweet

def tin_preprocessing(tweets,topic):
    result = []
    for tweet in tweets: 
        tweet = ukloni_topic(tweet, topic)
        result.append(tweet)    
    return result

def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['labels'])
        Xs.append(tin_preprocessing(ht_dict['tweets'],htf.replace("_", "")))
        
    return Xs, ys, ht_list

In [108]:
Xs, ys, ht_list = create_data('../train_data')
Xt, yt, h = create_data('../tar_data')

cnn = MY_CNN().run(Xs, ys, Xt, yt, h)

Epoch 1/1

1: Geting a resource error after 10 hours of training @midnight #TARgedy
2: Writes a paper without mentioning #SotA 48 times. Is it really a paper? #TARgedy @midnight
3: What is your orientation? Bi-directional LSTM #BiLSTM #TARgedy @midnight
4: Scoring this tweet as the funniest #TARgedy @midnight
5: Well, at least when it comes to the paper, the size doesn't matter @midnight #TARgedy
