In [1]:
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [2]:
df = pd.read_table("SemEval2017-task4-dev.subtask-BD.english.INPUT.txt",sep="\t", header=None)
df = df.drop(columns=[4])
df = df.drop(columns=[0])
df.columns = ["topic", "polarity", "tweet"]
df

Unnamed: 0,topic,polarity,tweet
0,amy schumer,negative,@MargaretsBelly Amy Schumer is the stereotypic...
1,amy schumer,negative,@dani_pitter I mean I get the hype around JLaw...
2,amy schumer,negative,Amy Schumer at the #GQmenoftheyear2015 party i...
3,amy schumer,negative,Amy Schumer is on Sky Atlantic doing one of th...
4,amy schumer,negative,"Amy Schumer may have brought us Trainwreck, bu..."
...,...,...,...
10546,zayn,positive,tomorrow I've to wake up early so Zayn's erfo...
10547,zayn,positive,with Zayn gone I can now definitively say that...
10548,zayn,positive,yo don't ever say that! god forbid! may it not...
10549,zayn,positive,you may call me a bad fan but I sobbed so hard...


In [3]:
df["polarity"].replace({"positive": 1, "negative": 0}, inplace=True)
df

Unnamed: 0,topic,polarity,tweet
0,amy schumer,0,@MargaretsBelly Amy Schumer is the stereotypic...
1,amy schumer,0,@dani_pitter I mean I get the hype around JLaw...
2,amy schumer,0,Amy Schumer at the #GQmenoftheyear2015 party i...
3,amy schumer,0,Amy Schumer is on Sky Atlantic doing one of th...
4,amy schumer,0,"Amy Schumer may have brought us Trainwreck, bu..."
...,...,...,...
10546,zayn,1,tomorrow I've to wake up early so Zayn's erfo...
10547,zayn,1,with Zayn gone I can now definitively say that...
10548,zayn,1,yo don't ever say that! god forbid! may it not...
10549,zayn,1,you may call me a bad fan but I sobbed so hard...


In [4]:
target = df.pop('polarity')
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))
dataset

<TensorSliceDataset shapes: ((2,), ()), types: (tf.string, tf.int64)>

In [5]:
for i in dataset.take(1):
    print(i)

(<tf.Tensor: id=11, shape=(2,), dtype=string, numpy=
array([b'amy schumer',
       b"@MargaretsBelly Amy Schumer is the stereotypical 1st world Laci Green feminazi. Plus she's unfunny"],
      dtype=object)>, <tf.Tensor: id=12, shape=(), dtype=int64, numpy=0>)


In [6]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...


  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [7]:
vocabulary_set = set()
for text_tensor, label in dataset:
    text = str(text_tensor.numpy()[1], 'utf-8')
    some_tokens = text_processor.pre_process_doc(text)
#     print(some_tokens)
    vocabulary_set.update(some_tokens)
    
vocab_size = len(vocabulary_set)
vocab_size

14253

In [8]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [9]:
def encode(text_tensor, label):
#     special_number = [14254]
    encoded_msg = encoder.encode(text_tensor.numpy()[1])
#     encoded_topic = encoder.encode(text_tensor.numpy()[0])
#     encoded_text = encoded_topic+special_number+encoded_msg

    return encoded_msg, label

In [10]:
def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(encode, 
                                        inp=[text, label], 
                                        Tout=(tf.int64, tf.int64))

  # `tf.data.Datasets` work best if all components have a shape set
  #  so set the shapes manually: 
    encoded_text.set_shape([None])
#     encoded_topic.set_shape([None])
    label.set_shape([])

    return encoded_text, label


all_encoded_data = dataset.map(encode_map_fn)

In [11]:
for i in all_encoded_data.take(1):
    print(i)

(<tf.Tensor: id=21139, shape=(15,), dtype=int64, numpy=
array([14254, 14254, 14254,  6928,  1915, 11137, 14254,  4627, 14254,
       14254, 11483, 14254, 11032, 13500,  1238], dtype=int64)>, <tf.Tensor: id=21140, shape=(), dtype=int64, numpy=0>)


In [12]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
DATASET_SIZE = 14253
TEST_SIZE = int(0.15 * DATASET_SIZE)
VAL_SIZE = int(0.15 * DATASET_SIZE)

In [13]:
train_data = all_encoded_data.skip(TEST_SIZE+VAL_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_val_data = all_encoded_data.take(TEST_SIZE+VAL_SIZE)

test_data = test_val_data.skip(VAL_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

val_data = test_val_data.take(VAL_SIZE)
val_data = val_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

In [14]:
for i in train_data.take(1):
    print(i)

(<tf.Tensor: id=52836, shape=(64, 30), dtype=int64, numpy=
array([[14254, 14254, 14254, ...,     0,     0,     0],
       [14254,   757,  6317, ...,     0,     0,     0],
       [ 3544,   757, 11602, ...,     0,     0,     0],
       ...,
       [14254, 14254, 13500, ...,     0,     0,     0],
       [14254,  4222,   474, ...,     0,     0,     0],
       [14254, 14254,  1689, ...,     0,     0,     0]], dtype=int64)>, <tf.Tensor: id=52837, shape=(64,), dtype=int64, numpy=
array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)>)


In [15]:
vocab_size += 2

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 200),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [17]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [18]:
model.fit(train_data, epochs=7, validation_data=val_data)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x23b4169e388>

In [19]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.438, Eval accuracy: 0.789
