<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/English_Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# glue data load

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'

with open(path + '/glue_train.txt', 'r') as f:
  train_sentences = f.readlines()

for i, sen in enumerate(train_sentences):
  train_sentences[i] = sen.rstrip("\n")

with open(path + '/glue_test.txt', 'r') as f:
  test_sentences = f.readlines()

for i, sen in enumerate(test_sentences):
  test_sentences[i] = sen.rstrip("\n")

print("train : {}".format(len(train_sentences)))
print("test : {}".format(len(test_sentences)))

print(train_sentences[:2])
print(test_sentences[:2])




train : 8551
test : 1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up."]
['Bill whistled past the house.', 'The car honked its way down the road.']


## Frequency (count) based embedding

Reference : https://datascienceschool.net/03%20machine%20learning/03.01.03%20Scikit-Learn%EC%9D%98%20%EB%AC%B8%EC%84%9C%20%EC%A0%84%EC%B2%98%EB%A6%AC%20%EA%B8%B0%EB%8A%A5.html

In [89]:
# Bag of Word (BoW)
import numpy as np
from collections import defaultdict


tokens = []
for sen in train_sentences[:3]:
  
  words = sen.split()
  for w in words:
    w = w.lower()
    if w not in tokens:
      tokens.append(w)


vocab = defaultdict(int)
for i, t in enumerate(tokens):
  vocab[t] += i+1


def BoW(sentences_list, vocab):

  BoW = []
  for sen in sentences_list:
    sen_words = sen.split()
    
    temp = []
    for t, i in vocab.items():
      if t in sen_words:
        count = sen_words.count(t)
        temp.append(count)
      else:
        temp.append(0)

    BoW.append(temp)

  return BoW


BoW = BoW(train_sentences[:3], vocab)


print("sentences     : {}".format(train_sentences[:3]))
print("tokens        : {}".format(tokens))
print("vocab         : {}".format(dict(vocab)))
print("sen to vector : {}".format(np.array(BoW)))

sentences     : ["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up.", "One more pseudo generalization or I'm giving up."]
tokens        : ['our', 'friends', "won't", 'buy', 'this', 'analysis,', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose.', 'more', 'pseudo', 'generalization', 'and', "i'm", 'giving', 'up.', 'or']
vocab         : {'our': 1, 'friends': 2, "won't": 3, 'buy': 4, 'this': 5, 'analysis,': 6, 'let': 7, 'alone': 8, 'the': 9, 'next': 10, 'one': 11, 'we': 12, 'propose.': 13, 'more': 14, 'pseudo': 15, 'generalization': 16, 'and': 17, "i'm": 18, 'giving': 19, 'up.': 20, 'or': 21}
sen to vector : [[0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1]]


In [90]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer(token_pattern = string)
vectorizer = CountVectorizer()

# CountVectorizer는 기본적으로 길이가 2이상인 문자만 토큰으로 인식한다.
words_freq = vectorizer.fit_transform(train_sentences[:3]).toarray()

vocab = vectorizer.vocabulary_

print("sentences       : {}".format(np.array(train_sentences[:3])))
print("words frequency : {}".format(words_freq))    
print("vocab           : {}".format(vocab))


sentences       : ["Our friends won't buy this analysis, let alone the next one we propose."
 "One more pseudo generalization and I'm giving up."
 "One more pseudo generalization or I'm giving up."]
words frequency : [[1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1]
 [0 0 1 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0]]
vocab           : {'our': 12, 'friends': 4, 'won': 19, 'buy': 3, 'this': 16, 'analysis': 1, 'let': 7, 'alone': 0, 'the': 15, 'next': 9, 'one': 10, 'we': 18, 'propose': 13, 'more': 8, 'pseudo': 14, 'generalization': 5, 'and': 2, 'giving': 6, 'up': 17, 'or': 11}


In [91]:
# TF-IDF (Term Frequency-Inverse Document Frequency, 단어 빈도-역 문서 빈도)
from math import log

class tf_idf():
  def __init__(self, docs):
    self.docs = docs
    self.vocab = []
    self.nDocs = len(docs)

  def get_vocab(self):
    
    vocab = []
    for doc in self.docs:
      doc = doc.lower()
      words = doc.split(" ")

      for w in words:
        if w not in self.vocab:
          self.vocab.append(w)

    return self.vocab


  def get_tfidf(self):

    # df
    df = defaultdict(int)
    for doc in self.docs:
      doc = doc.lower()
      words = doc.split(" ")

      temp = []
      for t in self.vocab:
        if t in words:
          df[t] += 1


    # df -> idf
    idf = defaultdict(int)
    for t, df_value in df.items():
      idf[t] = log( self.nDocs / (df_value + 1) )



    tf_idf = []
    for doc in self.docs:
      doc = doc.lower()
      words = doc.split(" ")

      temp = []
      for t in self.vocab:
        if t in words:
          tf = words.count(t)              # tf
          result = round(tf * idf[t], 4)
          temp.append(result)

        else:
          temp.append(0)

      tf_idf.append(temp)

    
    return tf_idf


In [92]:

vectorizer = tf_idf(train_sentences[:4])

vocab = vectorizer.get_vocab()
print(vocab)

matrix = vectorizer.get_tfidf()
print("result : {}".format(np.array(matrix)))

# nDocs = 4, df = 3인 경우 log 1 -> 0 이 되기 때문에 오류가 발생하기도 한다. 

['our', 'friends', "won't", 'buy', 'this', 'analysis,', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose.', 'more', 'pseudo', 'generalization', 'and', "i'm", 'giving', 'up.', 'or', 'study', 'verbs,', 'crazier', 'they', 'get.']
result : [[0.6931 0.6931 0.6931 0.6931 0.6931 0.6931 0.6931 0.6931 0.2877 0.6931
  0.     0.2877 0.6931 0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.2877 0.2877 0.6931 0.2877 0.2877 0.2877
  0.     0.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.2877 0.2877 0.     0.2877 0.2877 0.2877
  0.6931 0.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     0.     0.     0.     0.5754 0.
  0.     0.2877 0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.6931 0.6931 0.6931 0.6931 0.6931]]


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(train_sentences[:4]).toarray()
vocab = vectorizer.vocabulary_

print("sentences : {}".format(train_sentences[0]))
print("TF-IDF    : {}".format(tfidf))
print("vocab     : {}".format(vocab))


sentences : Our friends won't buy this analysis, let alone the next one we propose.
TF-IDF    : [[0.29297188 0.29297188 0.         0.29297188 0.         0.29297188
  0.         0.         0.         0.29297188 0.         0.29297188
  0.18700015 0.         0.29297188 0.29297188 0.         0.
  0.23098239 0.         0.29297188 0.         0.         0.23098239
  0.29297188]
 [0.         0.         0.48217603 0.         0.         0.
  0.38015312 0.         0.38015312 0.         0.30776671 0.
  0.30776671 0.         0.         0.         0.38015312 0.
  0.         0.         0.         0.38015312 0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.38015312 0.         0.38015312 0.         0.30776671 0.
  0.30776671 0.48217603 0.         0.         0.38015312 0.
  0.         0.         0.         0.38015312 0.         0.
  0.        ]
 [0.         0.         0.         0.         0.34268741 0.
  0.         0.34268741 0.         0.         0.21873293 0

# Prediction based embedding

### Word2Vec

In [68]:
# CBOW (Continuous Bag Of Words)
# Skip-gram
# Data processing

import numpy as np


def create_vocab(sentences_list):
  
  vocab = []
  for sen in sentences_list:
    sen = sen.lower()
    tokens = sen.split(" ")

    for t in tokens:
      if t not in vocab:
        vocab.append(t)

  length = len(vocab)
  word_to_onehot = {}

  for i, v in enumerate(vocab):
    one_hot = np.zeros(length)
    one_hot[i] = 1
    word_to_onehot[v] = list(one_hot)
  
  return word_to_onehot


def create_word2vec_dataset(sentences_list, onehot_vocab, n = 2):     # n = window size

  contexts_data = []
  centers_data = []
  for sen in sentences_list:
    sen = sen.lower()
    tokens = sen.split(" ")                     # tokenizing

    for t_idx in range(n, len(tokens)-n):       # tokens[n:-n]
      center = tokens[t_idx]

      context = tokens[t_idx - n : t_idx+1 + n]
      context.remove(center)

      context_to_onehot = []
      for c in context:
        context_to_onehot.append(onehot_vocab[c])
        
      center_to_onehot = onehot_vocab[center]


      contexts_data.append(context_to_onehot)
      centers_data.append(center_to_onehot)

  return contexts_data, centers_data


def get_batch(contexts, centers, batch_size = 4):

  iter = len(centers) // batch_size

  batch = []
  idx = 0
  for i in range(iter):

    contexts_batch = np.array(contexts[idx : idx + batch_size])
    centers_batch = np.array(centers[idx : idx + batch_size])

    batch.append((contexts_batch, centers_batch))

    idx = i + batch_size

  return batch


nContext = 2
batch_size = 4

onehot_vocab = create_vocab(train_sentences)

contexts, centers = create_word2vec_dataset(train_sentences[:10], onehot_vocab, nContext)

word2vec_batch = get_batch(contexts, centers, batch_size)

print('contexts shape : {}'.format(np.array(contexts).shape))
print('centers shape : {}'.format(np.array(centers).shape))


contexts shape : (34, 4, 7845)
centers shape : (34, 7845)


In [69]:
# torch CBOW

import torch
from torch import nn
from torch.optim import SGD
import torch.nn.functional as F


class CBOW(nn.Module):
  def __init__(self, vocab_size, embedding_size, context_size): 
    # 어휘 수 (onehot size), 임베딩 벡터 크기 (table size), 문맥 단어의 수 n (context number)
    
    super(CBOW, self).__init__()

    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.context_size = context_size

    self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)         # lookup table / 어휘 수, 뉴런 수
    self.linear1 = nn.Linear(2 * self.context_size * self.embedding_size, 512)
    self.linear2 = nn.Linear(512, self.vocab_size)


  def forward(self, inputs):   

    # inputs : (Batch size, Number of context words, Onehot context)
    # embedded : (Batch size, Number of context words, Onehot context, Onehot to embedding vector)
    # u : (Batch size, Number of context words, Onehot to embedding vector) 
    #   ->(Batch size, Number of context words * Onehot to embedding vector)
    # out : (Batch size, Onehot size embedding vector)
    
    embedded = self.embedding(inputs)

    u = embedded.sum(dim = 2)
    u = u.view(4,-1)

    out = self.linear1(u)
    out = F.relu(out)

    out = self.linear2(out)
    out = F.log_softmax(out, dim = 1)

    return out


In [70]:
# CBOW train

vocab_size = len(onehot_vocab)      # Number of words, 7845
embedding_size = 100                # Table size

losses = []

model = CBOW(vocab_size, embedding_size, nContext)

criterion = nn.CrossEntropyLoss()
optim = SGD(model.parameters(), lr=0.0001)

for epoch in range(10):
  
  batch_loss = 0
  for i, batch in enumerate(word2vec_batch):

    contexts = torch.tensor(batch[0], dtype = torch.long)
    centers = torch.tensor(batch[1], dtype = torch.float32)

    model.zero_grad()
    probs = model(contexts)

    loss = criterion(probs, centers)
    loss.backward()
    optim.step()

    batch_loss = batch_loss + loss.item()

  print('epoch : {}, loss : {}'.format(epoch+1, batch_loss))
  
  losses.append(batch_loss)


epoch : 1, loss : 14532321063.476562
epoch : 2, loss : nan
epoch : 3, loss : nan
epoch : 4, loss : nan
epoch : 5, loss : nan
epoch : 6, loss : nan
epoch : 7, loss : nan
epoch : 8, loss : nan
epoch : 9, loss : nan
epoch : 10, loss : nan


In [87]:
# torch Skip-gram

class Skipgram(nn.Module):
  def __init__(self, vocab_size, embedding_size, context_size): 
    # 어휘 수 (onehot size), 임베딩 벡터 크기 (table size), 문맥 단어의 수 n (context number)
    
    super(Skipgram, self).__init__()

    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.context_size = context_size

    self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)         # 룩업(lookup) : 테이블 / 어휘 수, 뉴런 수
    self.linear1 = nn.Linear(self.embedding_size, 512)
    self.linear2 = nn.Linear(512, 2 * self.context_size * self.vocab_size)


  def forward(self, inputs):   

    # inputs : (Batch size, Onehot center)
    # embedded : (Batch size, Onehot center, Onehot to embedding vector)
    # u : (Batch size, Onehot to embedding vector)
    # out : (Batch size, Onehot size embedding vector)
    #    -> (Batch size, Number of contexts, Onehot vector)
    
    embedded = self.embedding(inputs)

    u = embedded.sum(dim = 1)

    out = self.linear1(u)
    out = F.relu(out)

    out = self.linear2(out)
    out = F.log_softmax(out)

    out = out.view(4, 2 * self.context_size, -1)

    return out



In [88]:
# Skip gram train

vocab_size = len(onehot_vocab)      # 7845
embedding_size = 100

losses = []

model = Skipgram(vocab_size, embedding_size, nContext)

criterion = nn.CrossEntropyLoss()
optim = SGD(model.parameters(), lr=0.0001)

for epoch in range(10):
  total_loss = 0
  for i, batch in enumerate(word2vec_batch):
    contexts = torch.tensor(batch[0], dtype = torch.float32)
    centers = torch.tensor(batch[1], dtype = torch.long)

    model.zero_grad()
    probs = model(centers)

    loss = criterion(probs, contexts)
    loss.backward()
    optim.step()

    batch_loss = batch_loss + loss.item()

  print('epoch : {}, loss : {}'.format(epoch+1, batch_loss))
  
  losses.append(batch_loss)




epoch : 1, loss : nan
epoch : 2, loss : nan
epoch : 3, loss : nan
epoch : 4, loss : nan
epoch : 5, loss : nan
epoch : 6, loss : nan
epoch : 7, loss : nan
epoch : 8, loss : nan
epoch : 9, loss : nan
epoch : 10, loss : nan


In [None]:
# SGNS (Skip-Gram with Negative Sampling)

class SGNS(nn.Module):

  def __init__(self, vocab_size, embedding_size, context_size, nNegs): 
    super(SGNS, self).__init__()

    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.nNegs = nNegs

  
  def forward(self, inputs):
    






### FastText

In [None]:
# FastText
from gensim.models import FastText

model = FastText(sentences = corpus, size = 1000, window = 5, min_count = 5, workers = 4, sg = 1)




# Frequency + Prediction based embedding

In [None]:
# GloVe




# Language model based embedding

In [None]:
# ELMo



