In [1]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os
import itertools
import operator

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Activation, Flatten, concatenate
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [3]:
def preprocess(X_train, X_test, y_train, maxlen=300, num_classes=None):
    """Turn raw texts into padded character-index matrices plus one-hot embedding weights.

    Parameters
    ----------
    X_train : list of str
        Training texts.
    X_test : list of str
        Texts to encode with the same character vocabulary.
    y_train : sequence of int
        Integer class labels for X_train.
    maxlen : int, optional
        Length every sequence is padded/truncated to (default 300, the value
        that was previously hard-coded).
    num_classes : int or None, optional
        Forwarded to ``to_categorical``. ``None`` (default) keeps the previous
        behaviour of inferring the class count from the labels.

    Returns
    -------
    list
        ``[train_data, train_classes, test_data, embedding_weights, vocab_size, tk]``
        where ``embedding_weights`` has shape ``(vocab_size + 1, vocab_size)``:
        row 0 is the all-zero padding row and row ``i`` is the one-hot vector
        of character index ``i``.
    """
    # lower=False: the alphabet below lists upper-case letters explicitly.
    # With Keras' default lower=True every text is lower-cased before lookup,
    # so the upper-case entries could never be matched.
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK', lower=False)
    tk.fit_on_texts(X_train)

    # Fixed character vocabulary; indices start at 1 because 0 is the padding value.
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {char: idx for idx, char in enumerate(alphabet, start=1)}

    # Replace the fitted vocabulary with the fixed one and give the
    # out-of-vocabulary token the next free index.
    tk.word_index = char_dict.copy()
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    # Convert strings to index sequences.
    train_sequences = tk.texts_to_sequences(X_train)
    test_sequences = tk.texts_to_sequences(X_test)

    # Pad (or truncate) every sequence to the same length.
    train_data = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
    test_data = pad_sequences(test_sequences, maxlen=maxlen, padding='post')

    # float32 cast retained from the original implementation.
    train_data = np.array(train_data, dtype='float32')
    test_data = np.array(test_data, dtype='float32')

    vocab_size = len(tk.word_index)

    # Indices are the contiguous range 1..vocab_size, so the weight matrix is
    # a zero padding row stacked on an identity matrix. Building it this way
    # does not depend on the iteration order of tk.word_index.
    embedding_weights = np.vstack((np.zeros((1, vocab_size)), np.eye(vocab_size)))

    train_classes = to_categorical(y_train, num_classes=num_classes)

    return [train_data, train_classes, test_data, embedding_weights, vocab_size, tk]

In [4]:
def preprocessTweets(X_test, tk, maxlen=300):
    """Encode texts with an already configured tokenizer and pad them.

    Parameters
    ----------
    X_test : list of str
        Texts to encode.
    tk : Tokenizer
        Tokenizer whose ``word_index`` maps characters to indices
        (e.g. the one returned by ``preprocess``).
    maxlen : int, optional
        Length every sequence is padded/truncated to. Defaults to 300, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The array produced by ``pad_sequences`` (post-padded with zeros).
    """
    # Convert strings to index sequences.
    sequences = tk.texts_to_sequences(X_test)

    # Pad at the end so every row has exactly `maxlen` entries.
    return pad_sequences(sequences, maxlen=maxlen, padding='post')

In [None]:
def separate_data(X, y):
    """Partition the items of X into three lists according to their label in y.

    Label 2 goes to the first returned list, label 1 to the second and
    label 0 to the third. Items with any other label are dropped. Each item
    is stored as a slice copy (``item[:]``).

    Returns
    -------
    tuple
        ``(X_win, X_top10, X_rest)``
    """
    winners, top_ten, remainder = [], [], []

    # Checked in the same order as the label codes 0, 1, 2.
    label_buckets = ((0, remainder), (1, top_ten), (2, winners))

    for position, label in enumerate(y):
        for code, bucket in label_buckets:
            if label == code:
                bucket.append(X[position][:])
                break

    return winners, top_ten, remainder

In [None]:
def addOneToDict(dictionary, key):
    """Increment the counter stored under `key`, starting it at 1 if absent.

    The dictionary is modified in place; nothing is returned.
    """
    if key not in dictionary:
        # First occurrence of this key.
        dictionary[key] = 1
        return
    dictionary[key] += 1

In [None]:
class MY_CNN:
    """Character-level 1-D convolutional classifier built with Keras.

    `run` is the entry point: it flattens the training data, calls
    `preprocess`, builds the embedding layer and the network, trains it and
    prints the best-ranked test sentences.
    """

    def __init__(self):
        self.model = None
        # Created in run(); createModel() refuses to work without it.
        self.embedding_layer = None
        self.num_of_classes = 3
        self.dropout = 0.2
        self.optimizer = 'adam'
        self.loss = 'categorical_crossentropy'
        self.cnt = 0
        self.bingo_cnt = 0

    def createModel(self, input_size, conv_layers, fully_connected_layers):
        """Build and compile the network and store it in ``self.model``.

        Parameters
        ----------
        input_size : int
            Number of character indices per input sequence.
        conv_layers : iterable of (filter_num, filter_size, pooling_size)
            One Conv1D + ReLU per entry; a pooling_size of -1 skips the
            max-pooling step for that layer.
        fully_connected_layers : iterable of int
            Sizes of the dense layers; each is followed by dropout.

        Raises
        ------
        ValueError
            If ``self.embedding_layer`` has not been set yet.
        """
        if getattr(self, 'embedding_layer', None) is None:
            raise ValueError("embedding_layer must be set before createModel() is called")

        # Input
        inputs = Input(shape=(input_size,), name='input', dtype='int64')

        # Embedding
        x = self.embedding_layer(inputs)

        # Convolutional stack
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x)
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)

        # Fully connected layers
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout)(x)

        # Output layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)

        # Build model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model

    def fit(self, train_data, train_classes, batch_size=128, epochs=1, verbose=1):
        """Train ``self.model``; the defaults reproduce the former hard-coded settings."""
        self.model.fit(train_data, train_classes,
                       batch_size=batch_size,
                       epochs=epochs,
                       verbose=verbose)

    @staticmethod
    def _rank_by_first_class(y_pred, sentences, top_k=5):
        """Return up to `top_k` sentences ordered by ascending class-0 score.

        `y_pred` holds one row of class scores per sentence; only column 0 is
        used for the ordering.
        """
        if len(sentences) == 0:
            return []
        order = np.argsort(np.asarray(y_pred)[:, 0])
        # order[k] is the position of the sentence with the k-th smallest
        # score, so it is the value to index `sentences` with.
        return [sentences[int(position)] for position in order[:top_k]]

    def evaluate(self, X_test_sent, tk):
        """Predict on `X_test_sent` and print the five best-ranked sentences.

        Sentences are ranked by ascending predicted score of class 0. Fewer
        than five sentences are printed when fewer are supplied.
        """
        X_test = preprocessTweets(X_test_sent, tk)
        y_pred = self.model.predict(X_test)

        print()
        ranked = self._rank_by_first_class(y_pred, X_test_sent, top_k=5)
        for rank, sentence in enumerate(ranked, start=1):
            print("{}: {}".format(rank, sentence))

    def run(self, Xs, ys, X_test):
        """Train on the grouped data `Xs`/`ys` and evaluate on `X_test`.

        Parameters
        ----------
        Xs : iterable of list of str
            Training texts, one list per group; the groups are concatenated.
        ys : iterable of list of int
            Labels with the same grouping as `Xs`.
        X_test : list of str
            Texts to rank after training.

        Returns
        -------
        MY_CNN
            ``self``, so that ``cnn = MY_CNN().run(...)`` keeps the trained
            object instead of ``None``.
        """
        conv_layers = [[100, 5, 3],
                       [100, 5, 3]]

        fully_connected_layers = [256, 256]

        # Must equal the padding length used by preprocess/preprocessTweets.
        input_size = 300

        X_train = list(itertools.chain.from_iterable(Xs))
        y_train = list(itertools.chain.from_iterable(ys))

        train_data, train_classes, _test_data, embedding_weights, vocab_size, tk = preprocess(X_train, X_test, y_train)

        # Take the embedding dimensions from the weight matrix itself so the
        # layer always matches the weights it is initialised with.
        num_rows, embedding_size = embedding_weights.shape

        # Embedding layer initialization
        self.embedding_layer = Embedding(num_rows,
                                         embedding_size,
                                         input_length=input_size,
                                         weights=[embedding_weights])

        self.createModel(input_size, conv_layers, fully_connected_layers)
        self.fit(train_data, train_classes)
        self.evaluate(X_test, tk)
        return self

In [None]:
def load_document(data_location, htf):
    """Read one tab-separated file and return its tweets and labels.

    Each non-blank line is split on tabs; field 1 is taken as the tweet text
    and field 2 is converted to an int label.

    Parameters
    ----------
    data_location : str
        Directory containing the file.
    htf : str
        File name inside `data_location`.

    Returns
    -------
    dict
        ``{'tweets': [...], 'labels': [...]}`` in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three tab-separated fields.
    ValueError
        If the label field is not an integer.
    """
    tweets = []
    labels = []

    # The context manager closes the file even if parsing fails.
    with open(os.path.join(data_location, htf)) as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = stripped.split('\t')
            tweets.append(fields[1])
            labels.append(int(fields[2]))

    return {'tweets': tweets, 'labels': labels}

In [None]:
def load_targedy(location):
    """Read a text file and return its lines without newline characters.

    Parameters
    ----------
    location : str
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per line, in file order (empty lines are kept as '').
    """
    # The context manager guarantees the handle is closed after reading.
    with open(location, "r") as handle:
        return [line.replace('\n', '') for line in handle]

In [None]:
def create_data(data_location):
    """Load every file in `data_location` (sorted by name) via `load_document`.

    Returns
    -------
    tuple
        ``(Xs, ys, ht_list)`` where ``Xs[i]`` and ``ys[i]`` are the tweets and
        labels of the file named ``ht_list[i]``.
    """
    ht_list = sorted(os.listdir(data_location))

    # One parsed document per file, in the same order as ht_list.
    documents = [load_document(data_location, file_name) for file_name in ht_list]

    Xs = [document['tweets'] for document in documents]
    ys = [document['labels'] for document in documents]

    return Xs, ys, ht_list

In [None]:
# Load the per-file training data and the tweets to rank.
Xs, ys, ht_list = create_data('../train_data')
Xt2 = load_targedy('../tar_data/#TARgedy.txt')

# Keep a reference to the model object itself rather than to whatever
# run() returns, so the trained network stays accessible as `cnn`.
cnn = MY_CNN()
cnn.run(Xs, ys, Xt2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/1