In [1]:
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import tensorflow as tf
import tensorflow_datasets as tfds
# from tensorflow.keras.layers import Dropout, Dense, Bidirectional, LSTM, \
#     Embedding, GaussianNoise, Activation, Flatten, \
#     RepeatVector, GlobalMaxPooling1D, \
#     Convolution1D, MaxPooling1D, concatenate, Conv1D

# from tensorflow.keras.layers import Attention
# from tensorflow.keras.regularizers import l2

In [2]:
df = pd.read_table("SemEval2017-task4-dev.subtask-A.english.INPUT.txt",sep="\t", header=None)
df = df.drop(columns=[3])
df = df.drop(columns=[0])
df.columns = ["polarity", "tweet"]
# df.set_index("id")
df

Unnamed: 0,polarity,tweet
0,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,neutral,Order Go Set a Watchman in store or through ou...
2,negative,If these runway renovations at the airport pre...
3,neutral,If you could ask an onstage interview question...
4,positive,A portion of book sales from our Harper Lee/Go...
...,...,...
20627,neutral,@ShaquilleHoNeal from what I think you're aski...
20628,positive,"Iran ranks 1st in liver surgeries, Allah bless..."
20629,neutral,Hours before he arrived in Saudi Arabia on Tue...
20630,negative,@VanityFair Alex Kim Kardashian worth how to ...


In [3]:
df["polarity"].replace({"positive": 1, "negative": 2 , "neutral": 0}, inplace=True)
df

Unnamed: 0,polarity,tweet
0,0,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,0,Order Go Set a Watchman in store or through ou...
2,2,If these runway renovations at the airport pre...
3,0,If you could ask an onstage interview question...
4,1,A portion of book sales from our Harper Lee/Go...
...,...,...
20627,0,@ShaquilleHoNeal from what I think you're aski...
20628,1,"Iran ranks 1st in liver surgeries, Allah bless..."
20629,0,Hours before he arrived in Saudi Arabia on Tue...
20630,2,@VanityFair Alex Kim Kardashian worth how to ...


In [4]:
target = df.pop('polarity')
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))
# dataset

In [5]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [6]:
vocabulary_set = set()
for text_tensor, _ in dataset:
    text = str(text_tensor.numpy()[0], 'utf-8')
    some_tokens = text_processor.pre_process_doc(text)
#     print(some_tokens)
    vocabulary_set.update(some_tokens)
    
vocab_size = len(vocabulary_set)
vocab_size

22056

In [7]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [8]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy()[0])
  return encoded_text, label

In [9]:
def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))

  # `tf.data.Datasets` work best if all components have a shape set
  #  so set the shapes manually: 
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


all_encoded_data = dataset.map(encode_map_fn)

In [10]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
DATASET_SIZE = 20632
TEST_SIZE = int(0.15 * DATASET_SIZE)
VAL_SIZE = int(0.15 * DATASET_SIZE)

In [11]:
train_data = all_encoded_data.skip(TEST_SIZE+VAL_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_val_data = all_encoded_data.take(TEST_SIZE+VAL_SIZE)
# test_val_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data = test_val_data.skip(VAL_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

val_data = test_val_data.take(VAL_SIZE)
val_data = val_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

In [12]:
sample_text, sample_labels = next(iter(test_data))

sample_text[0], sample_labels[0]

(<tf.Tensor: id=50801, shape=(29,), dtype=int64, numpy=
 array([17737, 11504, 15258,  7737, 12862,  7737, 22057,  5027,  2969,
         5027, 12724,  7061,  7157,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0], dtype=int64)>,
 <tf.Tensor: id=50805, shape=(), dtype=int64, numpy=1>)

In [13]:
vocab_size += 2

In [14]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3))

In [18]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [19]:
model.fit(train_data, epochs=3, validation_data=val_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1bd9898aa08>

In [20]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.863, Eval accuracy: 0.616
