# Part I: data preprocessing

In [None]:
# get some text data
import requests
target_url = "https://raw.githubusercontent.com/casualcomputer/evaluation-ai/main/data/raw/cra_007_Evaluation%20%E2%80%93%20Canada%20Revenue%20Agency%20(CRA)%20Management%20of%20Small%20Business%20Nudge.txt"
response = requests.get(target_url)
data = response.text #[0:1000] #first three hundred charactes
data #run into bugs when feeding the evalaution report

# if the text data are as follows, then the codes works
# data = '''Machine learning is the study of computer algorithms that \
# improve automatically through experience. It is seen as a \
# subset of artificial intelligence. Machine learning algorithms \
# build a mathematical model based on sample data, known as \
# training data, in order to make predictions or decisions without \
# being explicitly programmed to do so. Machine learning algorithms \
# are used in a wide variety of applications, such as email filtering \
# and computer vision, where it is difficult or infeasible to develop \
# conventional algorithms to perform the needed tasks.'''

"Evaluation – Canada Revenue Agency (CRA) Management of Small Business Nudge\nFinal Report\nAudit, Evaluation, and Risk Branch\nJune\xa02022\nOn this page\nExecutive summary\n1. Introduction\n2. Background\n3. Evaluation methodologies\n4. Findings, recommendations, and management response\n4.1 There is no standard definition of “small business” at the CRA, and none that takes a taxpayer-centric view. The evaluation team has developed the concept of a “closely-held” business to address this gap.\n4.2 A taxpayer-centric segmentation approach could enhance CRA nudge impacts on small businesses.\n4.3 Many of the concerns regarding CRA nudge experimentations can be traced to a culture and practice of vertical rather than horizontal integration.\n5. Conclusion\n6. Acknowledgement\n7. Appendices\nAppendix A: Evaluation issues and methodology\nAppendix B: List of reports for the thematic evaluation\nAppendix C: Glossary\nExecutive summary\nThis evaluation is the second in a series of evaluatio

In [None]:
#filter stopwords with nltk
import nltk
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

# word_tokenize accepts
# a string as an input, not a file.
stop_words = set(stopwords.words('english'))
#file1 = open("text.txt")

# Use this to read file content as a stream:
#line = file1.read()
words = data.split()
for r in words:
    if not r in stop_words:
        appendFile = open('filteredtext.txt','a')
        appendFile.write(" "+r)
        appendFile.close()

with open('filteredtext.txt', 'r') as file:
    data = file.read().replace('\n', '')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
data

" Evaluation – Canada Revenue Agency (CRA) Management Small Business Nudge Final Report Audit, Evaluation, Risk Branch June 2022 On page Executive summary 1. Introduction 2. Background 3. Evaluation methodologies 4. Findings, recommendations, management response 4.1 There standard definition “small business” CRA, none takes taxpayer-centric view. The evaluation team developed concept “closely-held” business address gap. 4.2 A taxpayer-centric segmentation approach could enhance CRA nudge impacts small businesses. 4.3 Many concerns regarding CRA nudge experimentations traced culture practice vertical rather horizontal integration. 5. Conclusion 6. Acknowledgement 7. Appendices Appendix A: Evaluation issues methodology Appendix B: List reports thematic evaluation Appendix C: Glossary Executive summary This evaluation second series evaluations small business adm Evaluation – Canada Revenue Agency (CRA) Management Small Business Nudge Final Report Audit, Evaluation, Risk Branch June 2022 O

# Part II: train embedding



## Word2Vec from scratch
Word2Vec from scratch by Jake Tae: https://jaketae.github.io/study/word2vec/

In [None]:
##find all non-overlapping words defined by the regex patterns
import re

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

tokens = tokenize(data)
tokens[1:10]

['canada',
 'revenue',
 'agency',
 'cra',
 'management',
 'small',
 'business',
 'nudge',
 'final']

In [None]:
#create indices for the tokens from above
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

word_to_id, id_to_word = mapping(tokens)

In [None]:
def concat(*iterables):
    for iterable in iterables:
        yield from iterable

def one_hot_encode(id, vocab_size):
    res = [0] * vocab_size
    res[id] = 1
    return res

#generate training data
import numpy as np

np.random.seed(42)


def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)

    for i in range(n_tokens):
        idx = concat(
            range(max(0, i - window), i),
            range(i, min(n_tokens, i + window + 1))
        )
        for j in idx:
            if i == j:
                continue
            X.append(one_hot_encode(word_to_id[tokens[i]], len(word_to_id)))
            y.append(one_hot_encode(word_to_id[tokens[j]], len(word_to_id)))

    return np.asarray(X), np.asarray(y)

In [None]:
X, y = generate_training_data(tokens, word_to_id, 2)

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
def init_network(vocab_size, n_embedding):
    model = {
        "w1": np.random.randn(vocab_size, n_embedding),
        "w2": np.random.randn(n_embedding, vocab_size)
    }
    return model

model = init_network(len(word_to_id), 10)

#forward prop
def forward(model, X, return_cache=True):
    cache = {}

    cache["a1"] = X @ model["w1"]
    cache["a2"] = cache["a1"] @ model["w2"]
    cache["z"] = softmax(cache["a2"])

    if not return_cache:
        return cache["z"]
    return cache

#softmax activation
def softmax(X):
    res = []
    for x in X:
        exp = np.exp(x)
        res.append(exp / exp.sum())
    return res

#corss entropy
def cross_entropy(z, y):
    return - np.sum(np.log(z) * y)

#backprop
def backward(model, X, y, alpha):
    cache  = forward(model, X)
    da2 = cache["z"] - y
    dw2 = cache["a1"].T @ da2
    da1 = da2 @ model["w2"].T
    dw1 = X.T @ da1
    assert(dw2.shape == model["w2"].shape)
    assert(dw1.shape == model["w1"].shape)
    model["w1"] -= alpha * dw1
    model["w2"] -= alpha * dw2
    return cross_entropy(cache["z"], y)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use("seaborn")

n_iter = 50
learning_rate = 0.05

history = [backward(model, X, y, learning_rate) for _ in range(n_iter)]

plt.plot(range(len(history)), history, color="skyblue")
plt.show()

  plt.style.use("seaborn")
  exp = np.exp(x)
  res.append(exp / exp.sum())
  return - np.sum(np.log(z) * y)
  return - np.sum(np.log(z) * y)


KeyboardInterrupt: ignored

In [None]:
#sanity check to see if the word embedding would return words like
learning = one_hot_encode(word_to_id["learning"], len(word_to_id))
result = forward(model, [learning], return_cache=False)[0]

for word in (id_to_word[id] for id in np.argsort(result)[::-1]):
    print(word)


## Word2Vec tensorflow
Tensorflow tutorial (skip-gram): https://www.tensorflow.org/tutorials/text/word2vec#skip-gram_and_negative_sampling

In [250]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [251]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [252]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [253]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

8


In [254]:
#Create a vocabulary to save mappings from tokens to integer indices:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [255]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}
[1, 2, 3, 4, 5, 1, 6, 7]


In [256]:
#generate positive skip grams
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

26
(2, 1): (wide, the)
(2, 4): (wide, shimmered)
(4, 5): (shimmered, in)
(6, 5): (hot, in)
(4, 2): (shimmered, wide)


In [257]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([0 7 4 6], shape=(4,), dtype=int64)
['<pad>', 'sun', 'shimmered', 'hot']


In [258]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [259]:
#https://stackoverflow.com/questions/60166043/can-tf-keras-utils-get-file-be-used-to-load-local-zip-files
import sys
import os
import keras
# myFile = "filteredtext.txt" # just for example...
# fullPath = os.path.abspath("./" + myFile)  # or similar, depending on your scenario
# path_to_file = keras.utils.get_file(myFile, 'file://'+fullPath)

# path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# with open(path_to_file) as f:
#   lines = f.read().splitlines()
# for line in lines[:20]:
#   print(line)

In [260]:
with open("/content/filteredtext.txt") as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

 Evaluation – Canada Revenue Agency (CRA) Management Small Business Nudge Final Report Audit, Evaluation, Risk Branch June 2022 On page Executive summary 
 1. Introduction 2. Background 3. Evaluation methodologies 4. Findings, recommendations, management response 4.1 There standard definition “small business” 
 CRA, none takes taxpayer-centric view. The evaluation team developed concept “closely-held” business address gap. 4.2 A taxpayer-centric segmentation approach 
 could enhance CRA nudge impacts small businesses. 4.3 Many concerns regarding CRA nudge experimentations traced culture practice vertical rather horizontal integration. 
 5. Conclusion 6. Acknowledgement 7. Appendices Appendix A: Evaluation issues methodology Appendix B: List reports thematic evaluation Appendix C: Glossary 
 Executive summary This evaluation second series evaluations small business adm Evaluation – Canada Revenue Agency (CRA) Management Small Business Nudge Final 
 Report Audit, Evaluation, Risk Branch 

In [261]:
text_ds = tf.data.TextLineDataset("/content/filteredtext.txt").filter(lambda x: tf.cast(tf.strings.length(x), bool))
#text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [262]:
text_ds

<_FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [263]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [264]:
vectorize_layer.adapt(text_ds.batch(1024))

In [265]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'nudge', 'cra', 'evaluation', 'data', 'the', 'business', 'small', 'research', 'taxpayercentric', 'segmentation', 'segments', 'footnote', 'population', 'management', 'compliance', 'team', 'branch', 'businesses']


In [266]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [267]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

101
[  4 193  82  94  24   3  15   8   7   2] => ['evaluation', '–', 'canada', 'revenue', 'agency', 'cra', 'management', 'small', 'business', 'nudge']
[ 31 213  46 231  66   4 168  45 133  51] => ['1', 'introduction', '2', 'background', '3', 'evaluation', 'methodologies', '4', 'findings', 'recommendations']
[  3 128 150  10  40   6   4  17  21 138] => ['cra', 'none', 'takes', 'taxpayercentric', 'view', 'the', 'evaluation', 'team', 'developed', 'concept']
[ 22 179   3   2 101   8  19 234  52  81] => ['could', 'enhance', 'cra', 'nudge', 'impacts', 'small', 'businesses', '43', 'many', 'concerns']
[ 65 323  87 344 233 341  64  44   4 172] => ['5', 'conclusion', '6', 'acknowledgement', '7', 'appendices', 'appendix', 'a', 'evaluation', 'issues']


In [268]:
#generate training examples from sequence
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f"targets: {targets}")
print(f"contexts: {contexts}")
print(f"labels: {labels}")

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 101/101 [00:00<00:00, 898.18it/s]

targets: [ 231  213  231  231  231  168  213  168  213  168  168   52   52   81
   81   52   42   42  131   42  131  131   42  131   47   47  141  141
  141  141  240  914  240  240  914  836  365  365  836  836  365  836
  982  230  872  100  872  100  982  100  230  984  984  984  872  230
  230  872  100  984  375  375  375  375  300  300  300  300   75   75
   75   75  140  140  140  140  313 1055   50   50  313  313 1055  313
   77   77   77   77  437  260  437  260  260  437  260  260 1083  257
 1083  257  257  260 1083  873  873  873  257  260  873  135  135  135
   23  135   23   23   23  953  953  953  953  363  363  363  363  139
  139  139  139  718  718  918  672  918  918  672  440  440  918  672
  672  440  440  141  141  141  141  689  689  359  359  689  689  359
  359   70   70   70   70   53  181   53   53   53  181  181  181  917
  214  917  917  214  917  716  716  269  269  269  214  214  214  214
  484  342  342  484  484  579 1125  579  579  579 1125  637  637  2




In [275]:
#configure dataset for performance
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

for element in dataset:
    print(element)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>
<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [276]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [281]:
# #define loss function
# def custom_loss(x_logit, y_true):
#       return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [282]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])
#BUG: this line was okay for the codes in the example: https://www.tensorflow.org/tutorials/text/word2vec
#for the current input dataset, the training script doeesnt work

Epoch 1/20


ValueError: ignored

In [None]:
#docs_infra: no_execute
%tensorboard --logdir logs