<a href="https://colab.research.google.com/github/akiabe/coding-practice/blob/master/nltk_tweet_trax.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [None]:
!pip install -q -U trax
import trax

In [None]:
from nltk.corpus import twitter_samples 
from trax import fastmath
np = fastmath.numpy
random = fastmath.random

# Load the raw tweet strings of each sentiment class from the NLTK corpus.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# First 4000 tweets of each class go to training, the remainder to validation.
train_pos, val_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, val_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Features: positive tweets first, then negative tweets.
train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Labels in the same order: 1 for each positive tweet, 0 for each negative one.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

In [None]:
import string
import re
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

def process_tweet(tweet):
  """Clean, tokenize and stem a single tweet.

  Args:
    tweet: raw tweet text (str).

  Returns:
    List of stemmed tokens. Stock tickers, a leading "RT" marker, URLs,
    '#' characters, @handles, English stopwords and punctuation tokens
    are removed before stemming.
  """
  tweet = re.sub(r'\$\w*', '', tweet)  # drop stock tickers such as "$GE"
  tweet = re.sub(r'^RT[\s]+', '', tweet)  # drop a leading retweet marker "RT"
  # Remove only the URL itself. A greedy `.*` after the scheme would also
  # delete every word that follows the link on the same line.
  tweet = re.sub(r'https?://\S+', '', tweet)
  tweet = re.sub(r'#', '', tweet)  # keep the hashtag word, drop the "#" sign

  tokenizer = TweetTokenizer(
      preserve_case=False,
      strip_handles=True,
      reduce_len=True
  )
  tweet_tokens = tokenizer.tokenize(tweet)

  # Keep tokens that are neither stopwords nor punctuation, then stem them.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]

# Sanity check: print the first positive training tweet, then its processed token list.
print(train_pos[0])
print(process_tweet(train_pos[0]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [None]:
# Token -> integer id mapping. Ids 0, 1 and 2 are reserved for the special
# tokens below; every other id is assigned in order of first appearance.
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

# Scan the training tweets once and give each unseen token the next free id.
for tweet in train_x:
  for word in process_tweet(tweet):
    # len(Vocab) is evaluated before insertion, so ids stay contiguous.
    Vocab.setdefault(word, len(Vocab))
print(len(Vocab))

9092


In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
  """Convert a raw tweet into a list of vocabulary ids.

  Args:
    tweet: raw tweet text; it is passed through process_tweet first.
    vocab_dict: dict mapping token -> integer id; must contain unk_token.
    unk_token: key whose id replaces tokens missing from vocab_dict.
    verbose: when True, print the processed tokens and the unknown-token id.

  Returns:
    List of integer ids, one per processed token, in token order.
  """
  tokens = process_tweet(tweet)
  if verbose:
    print("list of word from the processed tweet:")
    print(tokens)

  # Looked up unconditionally: a vocabulary without unk_token raises KeyError.
  fallback_id = vocab_dict[unk_token]
  if verbose:
    print(f"the unique integer ID for the unk_token is {fallback_id}")

  return [vocab_dict.get(token, fallback_id) for token in tokens]

# Sanity check: raw tweet, its processed tokens, and the matching vocabulary ids.
print(train_pos[0])
print(process_tweet(train_pos[0]))  
print(tweet_to_tensor(train_pos[0], Vocab))

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
[3, 4, 5, 6, 7, 8, 9]


In [None]:
import random as rnd

def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
  """Yield balanced, zero-padded batches of tweet id lists.

  Every batch holds batch_size // 2 positive examples followed by
  batch_size // 2 negative examples.

  Args:
    data_pos: list of positive tweets (raw strings).
    data_neg: list of negative tweets (raw strings).
    batch_size: number of examples per batch; must be even.
    loop: if True, wrap around (reshuffling when shuffle is set) whenever a
      list is exhausted; if False, stop as soon as either list cannot fill
      its half of a batch (the partial batch is discarded).
    vocab_dict: dict mapping token -> integer id, forwarded to tweet_to_tensor.
    shuffle: if True, visit each list in random order (uses the `rnd` module).

  Yields:
    (inputs, targets, example_weights):
      inputs - 2-D array of ids, rows right-padded with 0 to the longest
        tweet in the batch;
      targets - 1 for each positive row, 0 for each negative row;
      example_weights - array of ones shaped like targets.
  """
  assert batch_size % 2 == 0
  n_to_take = batch_size // 2

  def take_half(data, order, cursor):
    """Return (tensors, new_cursor) for n_to_take examples, or (None, cursor) when exhausted."""
    tensors = []
    for _ in range(n_to_take):
      if cursor >= len(data):
        if not loop:
          return None, cursor
        cursor = 0
        if shuffle:
          rnd.shuffle(order)
      # Index into the data that was passed in (not notebook globals), so a
      # generator built over validation data really serves validation data.
      tensors.append(tweet_to_tensor(data[order[cursor]], vocab_dict))
      cursor += 1
    return tensors, cursor

  pos_index_lines = list(range(len(data_pos)))
  neg_index_lines = list(range(len(data_neg)))
  if shuffle:
    rnd.shuffle(pos_index_lines)
    rnd.shuffle(neg_index_lines)

  pos_index = 0
  neg_index = 0
  while True:
    # The cursors are advanced once per example inside take_half; advancing
    # them again here would silently skip n_to_take examples per batch.
    pos_tensors, pos_index = take_half(data_pos, pos_index_lines, pos_index)
    if pos_tensors is None:
      return
    neg_tensors, neg_index = take_half(data_neg, neg_index_lines, neg_index)
    if neg_tensors is None:
      return

    batch = pos_tensors + neg_tensors
    max_len = max(len(t) for t in batch)

    # Right-pad every id list with 0 so the batch forms a rectangular array.
    inputs = np.array([t + [0] * (max_len - len(t)) for t in batch])
    targets = np.array([1] * n_to_take + [0] * n_to_take)
    example_weights = np.ones_like(targets)

    yield inputs, targets, example_weights

In [None]:
def train_generator(batch_size, shuffle = False):
    """Call data_generator with the training split (train_pos, train_neg), loop=True and the global Vocab."""
    return data_generator(
        data_pos=train_pos,
        data_neg=train_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle,
    )

def val_generator(batch_size, shuffle = False):
    """Call data_generator with the validation split (val_pos, val_neg), loop=True and the global Vocab."""
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle,
    )

# Draw a single batch of 4 examples from a fresh training generator and inspect it.
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

# Print the padded id rows, the per-row labels and the per-row weights.
print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')


Inputs: [[ 238 4507 1584  453    9    0    0    0    0    0    0    0    0    0]
 [  10 4255  100  366  610  345  429  790  610    9  263  343   98 1799]
 [  73  459  460 3761    0    0    0    0    0    0    0    0    0    0]
 [ 363  136 4105 6282  269 6283   63 5749    0    0    0    0    0    0]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [None]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):
  """Build the tweet classifier: Embedding -> Mean -> Dense -> LogSoftmax.

  Args:
    vocab_size: vocabulary size passed to the embedding layer. The default,
      len(Vocab), is evaluated once, when this def statement is executed.
    embedding_dim: feature size of each embedding vector.
    output_dim: number of units in the final dense layer.
    mode: accepted for interface compatibility; not used by this function.

  Returns:
    A trax.layers.Serial combinator chaining the four layers.
  """
  layers = trax.layers
  return layers.Serial(
      layers.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
      # Averages over axis 1 - presumably the token axis of a
      # (batch, tokens, features) tensor; confirm against the Embedding output.
      layers.Mean(axis=1),
      layers.Dense(n_units=output_dim),
      layers.LogSoftmax(),
  )

# Build the model with the default hyperparameters and show its layer structure.
model = classifier()
display(model)

Serial[
  Embedding_9092_256
  Mean
  Dense_2
  LogSoftmax
]

In [None]:
import os
from trax import layers as tl
from trax.supervised import training

# Number of examples per batch, shared by the training and evaluation generators.
batch_size = 16
# Seed Python's `random` module (imported as rnd), which the generators use for shuffling.
rnd.seed(271)

# Training task: shuffled training batches, cross-entropy loss, Adam optimizer.
# NOTE(review): 0.01 is presumably the learning rate - confirm against trax.optimizers.Adam.
train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: batches from val_generator, reporting cross-entropy loss and accuracy.
eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Fresh model instance to be trained (rebinds the `model` name).
model = classifier()

In [None]:
# Output folder for the training run; expanduser replaces '~' with the user's home directory.
output_dir = '~/model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

/root/model/


In [None]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    '''
    Build a trax training loop and run it for n_steps.

    Input: 
        classifier - the model to train
        train_task - Training task
        eval_task - Evaluation task
        n_steps - number of training steps to run
        output_dir - folder to save your files
    Output:
        training_loop - the trax training.Loop that was run (it holds the model)
    '''
    # training.Loop's third positional parameter is eval_model, not the
    # evaluation task. Passing eval_task and output_dir as the 3rd and 4th
    # positional arguments would bind them to eval_model and the eval task
    # respectively, so the eval_model slot is filled explicitly with None
    # (per the trax docs the training model is then used for evaluation)
    # and output_dir is passed by keyword.
    training_loop = training.Loop(
        classifier,   # the learning model
        train_task,   # the training task
        None,         # eval_model
        eval_task,    # the evaluation task
        output_dir=output_dir,
    )

    training_loop.run(n_steps=n_steps)

    # Return the training_loop, since it has the model.
    return training_loop

In [None]:
# Train the model for 100 steps, using the expanded output directory for the run's files.
training_loop = train_model(
    classifier=model,
    train_task=train_task,
    eval_task=eval_task,
    n_steps=100,
    output_dir=output_dir_expand
)

AssertionError: ignored