In [1]:
# https://petamind.com/word2vec-with-tensorflow-2-0-a-simple-cbow-implementation/

In [2]:
# Mount Google Drive; later cells open files under drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')
# `files` is used by Word2Vec.save_variables() to download checkpoint files.
from google.colab import files

Mounted at /content/drive


In [3]:
# Terms read from ethnicities.txt, one per line, stripped of surrounding
# whitespace and lower-cased. The training cell tests context words against
# this set.
race_subspace = set()
with open("drive/MyDrive/ethnicities.txt") as f:
  race_subspace.update(entry.strip().lower() for entry in f)

In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import numpy as np
print(tf.__version__)
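# Example output from an earlier run (presumably the source tutorial); the
# version printed depends on the TensorFlow installed in the runtime:
#   TensorFlow 2.x selected.
#   2.0.0-rc2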

2.8.0


In [5]:
class Word2Vec:
  """Small CBOW-style word2vec model built directly on TensorFlow variables.

  The forward pass is ``softmax((x @ W1 + b1) @ W2 + b2)``: a linear
  projection to ``embedding_dim`` units followed by a linear layer over the
  vocabulary.

  Args:
    vocab_size: Number of vocabulary entries (width of the input/target rows).
    embedding_dim: Width of the hidden (embedding) layer.
    optimizer: ``'adam'`` selects ``tf.optimizers.Adam()``; any other value
      selects ``tf.optimizers.SGD(learning_rate=1)``.
    epochs: Number of gradient steps performed by each ``train`` call.
  """

  def __init__(self, vocab_size=0, embedding_dim=16, optimizer='adam', epochs=1):
    self.vocab_size = vocab_size
    # Honour the caller's embedding size. This used to be pinned to 5, which
    # silently ignored the argument. Checkpoints written with 5-dimensional
    # variables will not restore into a model of a different size.
    self.embedding_dim = embedding_dim
    self.epochs = epochs
    if optimizer == 'adam':
      self.optimizer = tf.optimizers.Adam()
    else:
      self.optimizer = tf.optimizers.SGD(learning_rate=1)

    self.W1 = tf.Variable(tf.random.normal([self.vocab_size, self.embedding_dim]))
    self.b1 = tf.Variable(tf.random.normal([self.embedding_dim]))  # hidden-layer bias
    self.W2 = tf.Variable(tf.random.normal([self.embedding_dim, self.vocab_size]))
    self.b2 = tf.Variable(tf.random.normal([self.vocab_size]))  # output-layer bias

    # Groups the four variables so they can be saved and restored together.
    self.checkpoint = tf.train.Checkpoint(w_1=self.W1, b_1=self.b1, w_2=self.W2, b_2=self.b2)

  def train(self, x_train=None, y_train=None, sentiments=None):
    """Run ``self.epochs`` gradient steps on one batch.

    Args:
      x_train: Float array/tensor of shape (batch, vocab_size) with the
        encoded context words.
      y_train: Float array/tensor of shape (batch, vocab_size) with the
        encoded target word.
      sentiments: Scalar weight multiplied into the loss. ``None`` means
        unweighted (equivalent to 1).
    """
    variables = [self.W1, self.b1, self.W2, self.b2]
    loss_weight = 1.0 if sentiments is None else sentiments
    # The loss is printed whenever the epoch is a multiple of (epochs - 2).
    # max() keeps that interval positive so epochs == 2 no longer divides by zero.
    log_every = max(1, self.epochs - 2)
    for epoch in range(1, self.epochs + 1):
      with tf.GradientTape() as tape:
        hidden_layer = tf.add(tf.matmul(x_train, self.W1), self.b1)
        logits = tf.add(tf.matmul(hidden_layer, self.W2), self.b2)
        # log_softmax is the numerically stable form of log(softmax(logits)).
        # The two-step version can underflow to log(0) = -inf, and 0 * -inf
        # turns the loss and every gradient into NaN.
        log_probs = tf.nn.log_softmax(logits)
        cross_entropy_loss = tf.reduce_mean(
            -tf.math.reduce_sum(y_train * log_probs, axis=[1])) * loss_weight
      grads = tape.gradient(cross_entropy_loss, variables)
      self.optimizer.apply_gradients(zip(grads, variables))
      if epoch % log_every == 0:
        print(cross_entropy_loss)

  def vectorized(self, word_idx):
    """Return the embedding for ``word_idx``: that row of ``W1`` plus ``b1``."""
    # Index first so only the requested row(s) are added to the bias, instead
    # of materialising the whole (vocab_size, embedding_dim) sum.
    return self.W1[word_idx] + self.b1

  def save_variables(self):
    """Write a checkpoint in the working directory and download its files.

    Relies on ``files`` (``google.colab.files``) being available as a module
    global.
    """
    # save() returns the numbered prefix it actually wrote (the counter grows
    # with every call), so use it rather than assuming "checkpoint-1".
    save_path = self.checkpoint.save('./checkpoint')
    files.download("checkpoint")
    files.download(save_path + ".data-00000-of-00001")
    files.download(save_path + ".index")

  def restore_variables(self):
    """Restore the variables from the latest checkpoint in the working directory."""
    status = self.checkpoint.restore(tf.train.latest_checkpoint('.'))
    status.assert_consumed()  # Optional check

  

In [6]:
def to_one_hot(data_points, vocab_size):
  """Encode a collection of indices as a 0/1 vector.

  Args:
    data_points: Iterable of integer positions to switch on.
    vocab_size: Length of the returned vector.

  Returns:
    A float64 NumPy array of length ``vocab_size`` holding 1 at every listed
    position and 0 elsewhere. Repeated positions still yield 1.
  """
  encoded = np.zeros(vocab_size)
  # One fancy-index assignment sets every requested position at once.
  encoded[list(data_points)] = 1
  return encoded

# Words dropped from every sentence before vocabulary building and training.
# Entries repeated in the literal (e.g. 'down', 'high') are harmless: this is a set.
stopwords = {'a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'away', 'b', 'back', 'backed', 'backing', 'backs', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'began', 'behind', 'being', 'beings', 'best', 'better', 'between', 'big', 'both', 'but', 'by', 'c', 'came', 'can', 'cannot', 'case', 'cases', 'certain', 'certainly', 'clear', 'clearly', 'come', 'could', 'd', 'did', 'differ', 'different', 'differently', 'do', 'does', 'done', 'down', 'down', 'downed', 'downing', 'downs', 'during', 'e', 'each', 'early', 'either', 'end', 'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'f', 'face', 'faces', 'fact', 'facts', 'far', 'felt', 'few', 'find', 'finds', 'first', 'for', 'four', 'from', 'full', 'fully', 'further', 'furthered', 'furthering', 'furthers', 'g', 'gave', 'general', 'generally', 'get', 'gets', 'give', 'given', 'gives', 'go', 'going', 'good', 'goods', 'got', 'great', 'greater', 'greatest', 'group', 'grouped', 'grouping', 'groups', 'h', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'herself', 'high', 'high', 'high', 'higher', 'highest', 'him', 'himself', 'his', 'how', 'however', 'i', 'if', 'important', 'in', 'interest', 'interested', 'interesting', 'interests', 'into', 'is', 'it', 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps', 'kind', 'knew', 'know', 'known', 'knows', 'l', 'large', 'largely', 'last', 'later', 'latest', 'least', 'less', 'let', 'lets', 'like', 'likely', 'long', 'longer', 'longest', 'm', 'made', 'make', 'making', 'man', 'many', 'may', 'me', 'member', 'members', 'men', 'might', 'more', 'most', 'mostly', 'mr', 'mrs', 'much', 'must', 'my', 'myself', 'n', 'necessary', 'need', 'needed', 'needing', 
'needs', 'never', 'new', 'new', 'newer', 'newest', 'next', 'no', 'nobody', 'non', 'noone', 'not', 'nothing', 'now', 'nowhere', 'number', 'numbers', 'o', 'of', 'off', 'often', 'old', 'older', 'oldest', 'on', 'once', 'one', 'only', 'open', 'opened', 'opening', 'opens', 'or', 'order', 'ordered', 'ordering', 'orders', 'other', 'others', 'our', 'out', 'over', 'p', 'part', 'parted', 'parting', 'parts', 'per', 'perhaps', 'place', 'places', 'point', 'pointed', 'pointing', 'points', 'possible', 'present', 'presented', 'presenting', 'presents', 'problem', 'problems', 'put', 'puts', 'q', 'quite', 'r', 'rather', 'really', 'right', 'right', 'room', 'rooms', 's', 'said', 'same', 'saw', 'say', 'says', 'second', 'seconds', 'see', 'seem', 'seemed', 'seeming', 'seems', 'sees', 'several', 'shall', 'she', 'should', 'show', 'showed', 'showing', 'shows', 'side', 'sides', 'since', 'small', 'smaller', 'smallest', 'so', 'some', 'somebody', 'someone', 'something', 'somewhere', 'state', 'states', 'still', 'still', 'such', 'sure', 't', 'take', 'taken', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'therefore', 'these', 'they', 'thing', 'things', 'think', 'thinks', 'this', 'those', 'though', 'thought', 'thoughts', 'three', 'through', 'thus', 'to', 'today', 'together', 'too', 'took', 'toward', 'turn', 'turned', 'turning', 'turns', 'two', 'u', 'under', 'until', 'up', 'upon', 'us', 'use', 'used', 'uses', 'v', 'very', 'w', 'want', 'wanted', 'wanting', 'wants', 'was', 'way', 'ways', 'we', 'well', 'wells', 'went', 'were', 'what', 'when', 'where', 'whether', 'which', 'while', 'who', 'whole', 'whose', 'why', 'will', 'with', 'within', 'without', 'work', 'worked', 'working', 'works', 'would', 'x', 'y', 'year', 'years', 'yet', 'you', 'young', 'younger', 'youngest', 'your', 'yours', 'z'}

In [7]:
import re
import os
import numpy as np
import time
from math import ceil

# Build the vocabulary (`words`) from the first `max_num_lines` lines of the corpus.
count = 0            # corpus lines consumed so far
max_num_lines = 100  # cap on corpus lines read (the training cell reuses this)
words = set()

with open("drive/MyDrive/wiki_corpus.txt", encoding="utf-8", errors="ignore") as infile:
    for line in infile:
        if count >= max_num_lines:
            break

        # Tokenisation: drop quote characters, lower-case, split sentences on '.',
        # split words on whitespace, then discard stopwords. The training cell
        # tokenises the same way, so any change here must be mirrored there.
        #
        # NOTE(review): a `re.compile('[\W_-–]+')` / `pattern.sub('', corpus_raw)`
        # pair used to sit here. Its result was discarded, so it never altered
        # the text, and the `_-–` range would have matched lowercase letters had
        # it been applied. It is removed rather than "fixed" so the vocabulary
        # stays exactly as before; punctuation is still attached to tokens.
        corpus_raw = line.replace("'", "").replace('"', "").lower()

        for phrase in corpus_raw.split('.'):
            # str.split() never yields empty strings and set.update ignores
            # duplicates, so no extra emptiness or membership checks are needed.
            words.update(word for word in phrase.split() if word not in stopwords)

        count += 1

In [8]:
import nltk
# Download the VADER lexicon used by the sentiment analyzer below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared analyzer; the training cell calls sid.polarity_scores() on each context window.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




In [None]:
# Train the CBOW model on the corpus, one sentence at a time.
#
# For every target word a context window is built, and the loss for that
# example is scaled by a sentiment-derived weight: VADER's neutral score of the
# window when the window contains a term from `race_subspace`, otherwise 1.
lines = []
count = 0
# vocab_size = 30039    # actual vocab size
vocab_size = len(words) + 1
word2int = {}
int2word = {}
WINDOW_SIZE = 5

# Cumulative wall-clock seconds per phase.
t = 0   # reading, tokenising and indexing the vocabulary
t1 = 0  # building training examples and sentiment weights
t2 = 0  # gradient updates
print(vocab_size)
cbow = Word2Vec(vocab_size=vocab_size, optimizer='adam', epochs=50)

RESTORE_FROM_CHECKPOINT = False

if RESTORE_FROM_CHECKPOINT:
  cbow.restore_variables()

# The window covers int(WINDOW_SIZE/2) words before the target and stops
# ceil(WINDOW_SIZE/2) positions after the window start offset (exclusive), i.e.
# two words on each side of the target for WINDOW_SIZE = 5.
words_before = int(WINDOW_SIZE / 2)
stop_offset = ceil(WINDOW_SIZE / 2)

with open("drive/MyDrive/wiki_corpus.txt", encoding="utf-8", errors="ignore") as infile:
    for line in infile:
        s = time.time()
        if count >= max_num_lines:
            break

        # Same tokenisation as the vocabulary cell: drop quotes, lower-case,
        # split sentences on '.', split words on whitespace, drop stopwords.
        # NOTE(review): a regex substitution used to sit here, but its result
        # was discarded (and its `_-–` range was malformed), so it had no
        # effect. It is removed without changing which tokens are produced.
        corpus_raw = line.replace("'", "").replace('"', "").lower()
        processed_sentences = [
            [word for word in phrase.split() if word not in stopwords]
            for phrase in corpus_raw.split('.')
        ]

        # Assign the next free integer id to every unseen word.
        for sentence in processed_sentences:
            for word in sentence:
                if word not in word2int:
                    word2int[word] = len(word2int)
                    int2word[len(int2word)] = word

        if len(word2int) >= vocab_size:
            break

        if not RESTORE_FROM_CHECKPOINT:
            t += time.time() - s
            # NOTE(review): the first 100 sentences of every corpus line are
            # skipped by this slice; confirm that this is intentional.
            for sentence in processed_sentences[100:]:
                s = time.time()

                data = []        # [context word ids, [target word id]] per target
                sentiments = []  # loss weight per target
                for word_index, word in enumerate(sentence):
                    context_start = max(0, word_index - words_before)
                    context_stop = min(len(sentence), word_index + stop_offset)
                    window = sentence[context_start:context_stop]

                    prediction_words = [word2int[w] for w in window if w != word]
                    data.append([prediction_words, [word2int[word]]])

                    # Score the window with race-related terms removed.
                    sent_str = ""
                    race_word = ""
                    race_related = False
                    for w in window:
                        if w in race_subspace:
                            race_word = w
                            print('Detected Race related word!')
                            race_related = True
                        else:
                            sent_str += w + " "
                    sentiment_dict = sid.polarity_scores(sent_str)
                    neutral_score = sentiment_dict["neu"]
                    weight = neutral_score if race_related else 1
                    sentiments.append(weight)
                    if race_related:
                        print(sent_str, race_word)
                        print(neutral_score, weight)

                # Encode contexts (x) and targets (y) as float32 0/1 rows.
                x_train = np.asarray(
                    [to_one_hot(pair[0], vocab_size) for pair in data], dtype='float32')
                y_train = np.asarray(
                    [to_one_hot(pair[1], vocab_size) for pair in data], dtype='float32')
                t1 += time.time() - s

                try:
                    s = time.time()
                    for i in range(x_train.shape[0]):
                        cbow.train(x_train[i].reshape(1, -1), y_train[i].reshape(1, -1), sentiments[i])
                    t2 += time.time() - s
                except Exception as err:
                    # Best-effort training: report the failure and move on to
                    # the next sentence. Catching Exception (not a bare except)
                    # lets KeyboardInterrupt stop the run.
                    print('Skipping sentence after training error:', repr(err))

            cbow.save_variables()

        # Count every processed line, including in restore mode, so the
        # max_num_lines cap always applies and word2int covers the same lines
        # as the vocabulary cell.
        count += 1

print(t, t1, t2)

39419
tf.Tensor(34.009357, shape=(), dtype=float32)
tf.Tensor(20.313206, shape=(), dtype=float32)
tf.Tensor(30.948393, shape=(), dtype=float32)
tf.Tensor(22.428953, shape=(), dtype=float32)
tf.Tensor(18.610355, shape=(), dtype=float32)
tf.Tensor(16.546389, shape=(), dtype=float32)
tf.Tensor(13.8575535, shape=(), dtype=float32)
tf.Tensor(14.472734, shape=(), dtype=float32)
tf.Tensor(10.543836, shape=(), dtype=float32)
tf.Tensor(17.260178, shape=(), dtype=float32)
tf.Tensor(18.255142, shape=(), dtype=float32)
tf.Tensor(11.117657, shape=(), dtype=float32)
tf.Tensor(13.702992, shape=(), dtype=float32)
Detected Race related word!
individualist emphasises  current
1.0 1.0
Detected Race related word!
individualist emphasises negative  current
0.351 0.351
Detected Race related word!
individualist emphasises negative liberty  current
0.22 0.22
Detected Race related word!
emphasises negative liberty opposing  current
0.22 0.22
Detected Race related word!
restraints free individual, social  curre

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

tf.Tensor(9.85295, shape=(), dtype=float32)
tf.Tensor(11.720285, shape=(), dtype=float32)
tf.Tensor(11.228141, shape=(), dtype=float32)
tf.Tensor(11.482035, shape=(), dtype=float32)
tf.Tensor(11.036827, shape=(), dtype=float32)
tf.Tensor(10.82873, shape=(), dtype=float32)
tf.Tensor(10.816884, shape=(), dtype=float32)
tf.Tensor(10.152488, shape=(), dtype=float32)
tf.Tensor(10.8091, shape=(), dtype=float32)
tf.Tensor(11.841681, shape=(), dtype=float32)
tf.Tensor(10.864321, shape=(), dtype=float32)
tf.Tensor(12.429352, shape=(), dtype=float32)
tf.Tensor(12.763964, shape=(), dtype=float32)
tf.Tensor(12.338929, shape=(), dtype=float32)
tf.Tensor(11.748956, shape=(), dtype=float32)
tf.Tensor(11.172917, shape=(), dtype=float32)
tf.Tensor(4.924219, shape=(), dtype=float32)
tf.Tensor(11.108099, shape=(), dtype=float32)
tf.Tensor(12.761278, shape=(), dtype=float32)
tf.Tensor(13.008567, shape=(), dtype=float32)
Detected Race related word!
advocating navigational improvements sangamon  river
0.566

In [None]:
# Compare the learned embeddings of two vocabulary words.
# The variables are named after the words actually looked up; the previous
# names (`criminal_vector`, `black_vector`) did not match 'black' and 'white'.
black_vector = cbow.vectorized(word2int['black'])
white_vector = cbow.vectorized(word2int['white'])

# Cosine similarity: dot product of the L2-normalised vectors.
normalize_a = tf.nn.l2_normalize(black_vector, 0)
normalize_b = tf.nn.l2_normalize(white_vector, 0)
cos_similarity = tf.reduce_sum(tf.multiply(normalize_a, normalize_b))
print(cos_similarity)
print(black_vector)
print(white_vector)
# Squared Euclidean distance (no square root is taken).
dist = tf.reduce_sum(tf.square(black_vector - white_vector))
print(dist)

In [None]:
# NOTE(review): repeats the NLTK/VADER import and lexicon download from the
# earlier sentiment setup cell; redundant when that cell has already run.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




In [None]:
# Demo: score a short phrase with VADER. polarity_scores returns a dict with
# 'neg', 'neu', 'pos' and 'compound' entries.
sid = SentimentIntensityAnalyzer()
sentiment_dict = sid.polarity_scores("good criminal store")
print(sentiment_dict)

{'neg': 0.466, 'neu': 0.137, 'pos': 0.397, 'compound': -0.128}


In [None]:
# Demo: score a single word with the analyzer created in the previous cell.
sentiment_dict = sid.polarity_scores("nurse")
print(sentiment_dict)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
