In [2]:
pip install trax

Collecting trax
[?25l  Downloading https://files.pythonhosted.org/packages/f8/fd/53162b3ea77e8b017b8a961ab973afed2d07bf2dc809abcf821c9b3b4470/trax-1.3.4-py2.py3-none-any.whl (366kB)
[K     |████████████████████████████████| 368kB 2.7MB/s 
[?25hCollecting t5
[?25l  Downloading https://files.pythonhosted.org/packages/4f/77/c00ce95121f5b8363b880aec38fde71ecaf9b7eeb29ed8bd29fc5f5b8541/t5-0.6.4-py3-none-any.whl (163kB)
[K     |████████████████████████████████| 163kB 8.4MB/s 
Collecting funcsigs
  Downloading https://files.pythonhosted.org/packages/69/cb/f5be453359271714c01b9bd06126eaf2e368f1fddfff30818754b5ac2328/funcsigs-1.0.2-py2.py3-none-any.whl
Collecting tensor2tensor
[?25l  Downloading https://files.pythonhosted.org/packages/d6/7c/9e87d30cefad5cbc390bb7f626efb3ded9b19416b8160f1a1278da81b218/tensor2tensor-1.15.7-py2.py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.5MB 11.9MB/s 
Collecting tensorflow-text
[?25l  Downloading https://files.pythonhosted.org/pack

In [5]:
import os 
import random as rnd
import trax
import trax.fastmath.numpy as np
from trax import layers as tl

In [6]:
import string
import re
import os
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples 

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

In [9]:
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def process_tweet(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and
            word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

def load_tweets():
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')  
    return all_positive_tweets, all_negative_tweets

class Layer(object):
    def __init__(self):
        self.weights = None

    def forward(self, x):
        raise NotImplementedError
  
    def init_weights_and_state(self, input_signature, random_key):
        pass

    def init(self, input_signature, random_key):
        self.init_weights_and_state(input_signature, random_key)
        return self.weights
    
    def __call__(self, x):
        return self.forward(x)

In [12]:
import numpy as np
all_positive_tweets, all_negative_tweets = load_tweets()
val_pos = all_positive_tweets[4000:] 
train_pos = all_positive_tweets[:4000]
val_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
train_x = train_pos + train_neg 
val_x  = val_pos + val_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

In [13]:
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
for tweet in train_x: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)

In [15]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    
    word_l = process_tweet(tweet)
    tensor_l = []
    unk_ID = vocab_dict[unk_token]
    for word in word_l:
        word_ID = vocab_dict[word] if word in vocab_dict else unk_ID
        tensor_l.append(word_ID) 
    
    return tensor_l

In [19]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    assert batch_size % 2 == 0
    n_to_take = batch_size // 2
    pos_index = 0
    neg_index = 0
    
    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)
    
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))
    
    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)    
    stop = False
    while not stop:  
        batch = []
        for i in range(n_to_take):
            if pos_index >= len_data_pos: 
                if not loop:
                    stop = True;
                    break;
                pos_index = 0
                if shuffle:
                    rnd.shuffle(pos_index_lines)
            tweet = data_pos[pos_index_lines[pos_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)
            batch.append(tensor)
            pos_index = pos_index + 1
        for i in range(n_to_take):
            if neg_index >= len_data_neg:
                if not loop:
                    stop = True;
                    break;
                neg_index = 0
                if shuffle:
                    rnd.shuffle(neg_index_lines)
            tweet = data_neg[neg_index_lines[neg_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)
            batch.append(tensor)
            neg_index += 1
        if stop:
            break;
        pos_index += n_to_take
        neg_index += n_to_take
        max_len = max([len(t) for t in batch]) 
        tensor_pad_l = []
        for tensor in batch:
            n_pad = max_len - len(tensor)
            pad_l = [0]*n_pad
            tensor_pad = tensor + pad_l
            tensor_pad_l.append(tensor_pad)
        inputs = np.array(tensor_pad_l)
        target_pos = [1]*n_to_take
        target_neg = [0]*n_to_take
        target_l = target_pos + target_neg
        targets = np.array(target_l)
        example_weights = np.ones_like(targets)
        yield inputs, targets, example_weights

In [20]:
rnd.seed(30) 

In [21]:
class Relu(Layer):
    def forward(self, x):
        activation = np.maximum(x,0)
        return activation

In [None]:
from trax import fastmath
np = fastmath.numpy
random = fastmath.random

In [22]:
class Dense(Layer):
  
    def __init__(self, n_units, init_stdev=0.1):
        self._n_units = n_units
        self._init_stdev = 0.1
    def forward(self, x):
        dense = np.dot(x, self.weights) 
        return dense
    def init_weights_and_state(self, input_signature, random_key):
        input_shape = input_signature.shape
        w = self._init_stdev * random.normal(
            key = random_key, shape = (input_shape[-1], self._n_units))
        self.weights = w
        return self.weights

In [26]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):

    embed_layer = tl.Embedding(
        vocab_size=vocab_size, # Size of the vocabulary
        d_feature=embedding_dim)  # Embedding dimension
    mean_layer = tl.Mean(axis=1)
    dense_output_layer = tl.Dense(n_units = output_dim)
    log_softmax_layer = tl.LogSoftmax()
    model = tl.Serial(
      embed_layer,  # embedding layer
      mean_layer, # mean layer
      dense_output_layer, # dense output layer 
      log_softmax_layer # log softmax layer
    )
    return model

tmp_model = classifier()

In [27]:
display(tmp_model)

Serial[
  Embedding_9092_256
  Mean
  Dense_2
  LogSoftmax
]

In [29]:
def train_generator(batch_size, shuffle = False):
    return data_generator(train_pos, train_neg, batch_size, True, Vocab, shuffle)

def val_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, True, Vocab, shuffle)

def test_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, False, Vocab, shuffle)

inputs, targets, example_weights = next(train_generator(4, shuffle=True))

In [46]:
from trax.supervised import training

batch_size = 16
rnd.seed(271)
train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)
eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)
model = classifier()

In [47]:
output_dir = '~/model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

/root/model/


In [72]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):

    training_loop = training.Loop(
                                classifier,  # The learning model
                                train_task,  # The training task
                                #eval_tasks = eval_task, # The evaluation task
                                output_dir = output_dir) # The output directory

    training_loop.run(n_steps = n_steps)
    return training_loop

In [73]:
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)

In [75]:
tmp_train_generator = train_generator(16)

tmp_batch = next(tmp_train_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

In [76]:
tmp_pred = training_loop.eval_model(tmp_inputs)
tmp_pred

DeviceArray([[-4.57343006e+00, -1.03759766e-02],
             [-5.23079491e+00, -5.36370277e-03],
             [-6.36161518e+00, -1.72805786e-03],
             [-4.39077187e+00, -1.24685764e-02],
             [-1.95393348e+00, -1.52819753e-01],
             [-3.88019562e+00, -2.08629370e-02],
             [-4.91289091e+00, -7.37833977e-03],
             [-3.98273039e+00, -1.88105106e-02],
             [-3.37648392e-03, -5.69259787e+00],
             [-3.50618362e-03, -5.65495825e+00],
             [-2.15053558e-03, -6.14313602e+00],
             [-2.38418579e-06, -1.30092125e+01],
             [-3.99327278e-03, -5.52514553e+00],
             [-1.58262253e-03, -6.44945526e+00],
             [-4.18162346e-03, -5.47912884e+00],
             [-2.65049934e-03, -5.93437338e+00]], dtype=float32)

In [77]:
def compute_accuracy(preds, y, y_weights):
    
    is_pos =  preds[:, 1] > preds[:, 0] 
    is_pos_int = is_pos.astype(np.int32)
    correct = is_pos_int == y 
    sum_weights = np.sum(y_weights)
    correct_float = correct.astype(np.float32)
    weighted_correct_float = correct_float * y_weights
    weighted_num_correct = np.sum(weighted_correct_float)
    accuracy = weighted_num_correct / sum_weights
    return accuracy, weighted_num_correct, sum_weights

In [78]:
tmp_val_generator = val_generator(64)

tmp_batch = next(tmp_val_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch
tmp_pred = training_loop.eval_model(tmp_inputs)

tmp_acc, tmp_num_correct, tmp_num_predictions = compute_accuracy(preds=tmp_pred, y=tmp_targets, y_weights=tmp_example_weights)

print(f"Model's prediction accuracy on a single training batch is: {100 * tmp_acc}%")
print(f"Weighted number of correct predictions {tmp_num_correct}; weighted number of total observations predicted {tmp_num_predictions}")

Model's prediction accuracy on a single training batch is: 98.4375%
Weighted number of correct predictions 63.0; weighted number of total observations predicted 64


In [79]:
def test_model(generator, model):
    
    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0
    
    for batch in generator: 
        
        inputs = batch[0]
        
        targets = batch[1]
        
        example_weight = batch[2]

        pred = model(inputs)
        
        batch_accuracy, batch_num_correct, batch_num_pred = compute_accuracy(pred, targets, example_weight) 
        
        total_num_correct += batch_num_correct
        total_num_pred += batch_num_pred

    accuracy = total_num_correct / total_num_pred
  
    return accuracy

In [80]:
model = training_loop.eval_model
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of your model on the validation set is {accuracy:.4f}', )

The accuracy of your model on the validation set is 0.9931


In [81]:
def predict(sentence):
    inputs = np.array(tweet_to_tensor(sentence, vocab_dict=Vocab))
    inputs = inputs[None, :]  
    
    preds_probs = model(inputs)
    preds = int(preds_probs[0, 1] > preds_probs[0, 0])
    
    sentiment = "negative"
    if preds == 1:
        sentiment = 'positive'

    return preds, sentiment

In [83]:
sentence = "It's such a nice day"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

print()

sentence = "I hate you"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"It's such a nice day"
***
is positive.

The sentiment of the sentence 
***
"I hate you"
***
is negative.
