In [1]:
import numpy as np
import os
import itertools
np.random.seed(113) #set seed before any keras import
import argparse
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM, Dropout
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# logging
# Configure the root logger at INFO level so the logging.info(...) calls used
# further down in this notebook are actually emitted.
import logging
logging.basicConfig(level=logging.INFO)

In [3]:
# load dataset
# The two files are line-aligned: line i of us.text is the text whose label is
# line i of us.labels.
# An explicit encoding keeps the read independent of the platform's default
# locale (assumes the files are UTF-8 -- confirm against the data source).
# The line terminator is stripped so it neither becomes part of the last token
# of each sentence nor part of every label string.
with open('data/us.text', encoding='utf-8') as f:
    X_es = [line.rstrip('\n') for line in f]
with open('data/us.labels', encoding='utf-8') as f:
    Y_es = [line.rstrip('\n') for line in f]

# Every text line needs exactly one label.
assert len(X_es) == len(Y_es), "data/us.text and data/us.labels differ in line count"

In [4]:
# Split texts and labels together so they stay aligned; random_state pins the
# shuffle for reproducibility.
# NOTE(review): test_size=0.9 appears to hold out 90% of the data and train on
# the remaining 10% -- confirm this is intentional and not a swapped ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_es,
    Y_es,
    test_size=0.9,
    random_state=0,
)

In [5]:
# Sanity check: log the first raw text line followed by its label.
for first_record in (X_es[0], Y_es[0]):
    logging.info(first_record)

INFO:root:LoL @ West Covina, California 

INFO:root:2



In [6]:
# label encoding
# label2idx = {label: i for i, label in enumerate(set(y_train))}
# num_labels = len(label2idx.keys())
# y_train = np_utils.to_categorical([label2idx[label] for label in y_train], num_classes=num_labels)
# y_test = np_utils.to_categorical([label2idx[label] for label in y_test], num_classes=num_labels)

In [7]:
# Label encoding: every distinct label string gets the next free integer id
# the first time it is looked up (the default factory returns the current
# size of the mapping).
y2i = defaultdict(lambda: len(y2i))
y_train_num = [y2i[emoji] for emoji in y_train]
y_test_num = [y2i[emoji] for emoji in y_test]

# Count classes from the complete mapping rather than from the training ids
# only: a label that occurs solely in the test split still receives an id
# above, and to_categorical needs num_classes to be larger than every id it is
# given. When all labels occur in the training split the value is unchanged.
num_classes = len(y2i)
print(num_classes)

# One-hot targets; num_classes is passed explicitly so both splits get the
# same width regardless of which ids happen to occur in each.
y_train_one_hot = np_utils.to_categorical(y_train_num, num_classes)
y_test_one_hot = np_utils.to_categorical(y_test_num, num_classes)

20


In [8]:
# Display the one-hot target vector of the first training example.
y_train_one_hot[0]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [9]:

# Vocabulary: each new word gets the next free integer id on first lookup.
w2i = defaultdict(lambda: len(w2i))
PAD = w2i["<pad>"] # index 0 is padding
UNK = w2i["<unk>"] # index 1 is for UNK

# Convert words to indices. Only the training split is allowed to add words.
X_train_num = [[w2i[word] for word in sentence.split(" ")] for sentence in X_train]

# Freeze the vocabulary: from here on w2i[...] yields UNK for unseen words.
w2i = defaultdict(lambda: UNK, w2i)

# Look test words up with .get() instead of w2i[word]: subscripting a
# defaultdict stores every missing key, which would silently add each
# out-of-vocabulary test word to w2i and inflate len(w2i) (used later as the
# embedding's vocabulary size).
X_test_num = [[w2i.get(word, UNK) for word in sentence.split(" ")] for sentence in X_test]

# Longest sentence (in tokens) over both splits; used as the padding length.
max_sentence_length = max(len(token_ids) for token_ids in X_train_num + X_test_num)
print(max_sentence_length)

43


In [10]:
# Display the word-id sequence of the first training sentence.
X_train_num[0]

[2, 3, 4, 3, 5, 6]

In [11]:
from keras.preprocessing import sequence

# Bring every id sequence to the same length (max_sentence_length), filling
# with the PAD id. Padding/truncation side is left at pad_sequences' defaults.
X_train_pad, X_test_pad = (
    sequence.pad_sequences(split_ids, maxlen=max_sentence_length, value=PAD)
    for split_ids in (X_train_num, X_test_num)
)
print(X_train_pad.shape)

(49026, 43)


In [12]:
# The embedding needs one row per token id, i.e. largest id + 1.
# This is derived from the assigned ids rather than len(w2i): subscripting a
# defaultdict with an unseen word stores that word as a new key, so the number
# of keys can exceed the number of distinct ids and would oversize the
# embedding matrix.
vocab_size = max(w2i.values()) + 1
# Dimensionality of each word embedding vector.
embeds_size=32

In [15]:
np.random.seed(113) # re-seed NumPy's global RNG right before the model is built

# Sequence classifier: token ids -> embeddings -> LSTM -> class scores.
model = Sequential()
# One embeds_size-dimensional vector per token id; inputs are padded id
# sequences of length max_sentence_length.
model.add(Embedding(vocab_size, embeds_size, input_length=max_sentence_length))
# Summarise the whole sequence in a single 32-dimensional state.
model.add(LSTM(32))
# One output unit per label.
model.add(Dense(num_classes))
# Each example has exactly one label (one-hot targets), so the outputs must
# form a probability distribution over the classes: softmax, not per-unit
# sigmoid, is the activation that matches categorical_crossentropy.
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Train for three passes over the padded training sequences.
model.fit(X_train_pad, y_train_one_hot, epochs=3)

# Score on the held-out split; evaluate returns the loss followed by the
# metrics requested at compile time (accuracy).
loss, accuracy = model.evaluate(X_test_pad, y_test_one_hot)
# Report the result -- otherwise the notebook computes it and shows nothing.
print("test loss: {:.4f} - test accuracy: {:.4f}".format(loss, accuracy))

Epoch 1/3
Epoch 2/3
Epoch 3/3
