In [162]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import os
import itertools

from keras.models import Sequential
from keras.layers import Dense, Dropout
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.utils import to_categorical

In [10]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec(glove_input_file="char-embeddings.txt",
#                word2vec_output_file="gensim_char-embeddings.txt")

In [11]:
char_embeddings = KeyedVectors.load_word2vec_format("gensim_char-embeddings.txt", binary=False)
char_vectors = char_embeddings.wv

In [146]:
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors25.txt", binary=False)
vectors = glove_model.wv

In [53]:
def glove_char_embeddings(tokens):
    arr = np.zeros(300)
    
    for word in tokens:
        for char in word:
            try:
                arr = np.add(arr, char_vectors[char])
            except:
                pass
        
    return np.array(arr)

In [148]:
def glove_vects(tokens):
    
    arr = np.zeros(25)
    
    for tok in tokens:
        try:
            arr = np.add(arr, vectors[tok])
        except:
            pass
        
    return np.array(arr)

In [151]:
def preprocessTweet(tweet):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    
    return glove_vects(tokens)

In [153]:
def load_document(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(np.array([int(line_split[2])]))

    Y = np.array(labels)
    
    X_vects = np.zeros(25)
    for tweet in tweets:
        X_vects = np.vstack((X_vects, preprocessTweet(tweet)))
        
#     X_vects = [preprocessTweet(tweet, htf) for tweet in tweets]

    return {'X_vects': X_vects[1:], 'Y': Y}

In [73]:
def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['Y'])
        Xs.append(ht_dict['X_vects'])

    return Xs, ys, ht_list

In [177]:
Xs, ys, ht_list = create_data('train_data')

x_train = np.array([*itertools.chain.from_iterable(Xs[1:])])
y_train = np.array([*itertools.chain.from_iterable(ys[1:])])
x_test = Xs[0]
y_test = ys[0]

y_train[y_train == 2] = 1
y_test[y_test == 2] = 1

model = Sequential()
model.add(Dense(64, input_dim=25, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

class_weight = {0: 1., 1: 8.5}

model.fit(x_train, y_train,
          epochs=10000,
          batch_size=128,
         class_weight=class_weight)

score = model.evaluate(x_test, y_test, batch_size=128)
# print(score)
y_pred = model.predict_classes(x_test, verbose=1)

print(np.hstack((y_test, y_pred)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]


In [169]:
# Generate dummy data
x_train = np.random.random((1000, 20))
y_train = np.random.randint(2, size=(1000, 1))
x_test = np.random.random((100, 20))
y_test = np.random.randint(2, size=(100, 1))

max_features = 1024

model = Sequential()
model.add(Embedding(max_features, output_dim=256))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=16, epochs=10)
score = model.evaluate(x_test, y_test, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [178]:
Xs, ys, ht_list = create_data('train_data')

x_train = np.array([*itertools.chain.from_iterable(Xs[1:])])
y_train = np.array([*itertools.chain.from_iterable(ys[1:])])
x_test = Xs[0]
y_test = ys[0]

y_train[y_train == 2] = 1
y_test[y_test == 2] = 1

regressor = Sequential()

regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (x_train.shape[1], 1)))
regressor.add(Dropout(0.2))

regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))

regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))

regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))

regressor.add(Dense(units = 1))

regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

regressor.fit(x_train, y_train, epochs = 100, batch_size = 32)
y_pred = regressor.predict(x_test)
print(np.hstack((y_test, y_pred)))

ValueError: Error when checking input: expected lstm_13_input to have 3 dimensions, but got array with shape (11186, 25)