#one hot encoding

In [2]:
## no library
def one_hot(word_list):
  #(1) 단어의 중복을 제거해﻿줍니다.
  word_list = list(set(word_list))
  #(2) 단어의 수만큼 배열을 만들고, 0으로 채워﻿줍니다.
  encoding_matrix = [[0 for col in range(len(word_list))] for row in range(len(word_list))]
  #(3) 해당 단어의 인덱스를 찾고, 그 부분을 1로 만들어﻿줍니다.
  for index, word in enumerate(word_list):
    encoding_matrix[index][index] = 1
  return encoding_matrix

labels = ['cat','dog','rabbit','turtle']

In [3]:
## using pandas
import pandas as pd

label_dict = {'label':['cat','dog','rabbit','turtle']}
#df = pd.DataFrame(label_dict)
one_hot_encoding = pd.get_dummies(label_dict['label'])
print(one_hot_encoding)

   cat  dog  rabbit  turtle
0    1    0       0       0
1    0    1       0       0
2    0    0       1       0
3    0    0       0       1


In [4]:
## using sklearn
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

label_dict = {'label':['cat','dog','rabbit','turtle']}
df = pd.DataFrame(label_dict)
one_hot = OneHotEncoder()
one_hot_encoding = one_hot.fit_transform(df)
print(one_hot_encoding)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0


#skip gram

In [5]:
# convert context to index vector
def make_context_vector(context, word_to_ix):
  idxs = word_to_ix[context]
  return torch.tensor(idxs, dtype=torch.long)

# make dataset function
def make_data(sentence):
  data = []
  for i in range(2, len(example_sentence) - 2):
    context = example_sentence[i]
    target = [example_sentence[i - 2], example_sentence[i - 1], example_sentence[i + 1], example_sentence[i + 2]]
    data.append((context, target))
  return data

In [7]:
import torch
import torch.nn as nn

#(4) Skip-Gram 모델을 정의해 줍니다.
class SKIP_GRAM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, context_size):
    super(SKIP_GRAM, self).__init__()
    self.context_size = context_size
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    self.layer1 = nn.Linear(embedding_dim, 64)
    self.activation1 = nn.ReLU()

    self.layer2 = nn.Linear(64, vocab_size * context_size)
    self.activation2 = nn.LogSoftmax(dim = -1)

  def forward(self, inputs):
    embeded_vector = self.embeddings(inputs)
    output = self.activation1(self.layer1(embeded_vector))
    output = self.activation2(self.layer2(output))
    return output.view(self.context_size,vocab_size)

In [9]:
## using pytorch
import torch
import torch.nn as nn

EMBEDDING_DIM = 128
EPOCHS = 200
CONTEXT_SIZE = 4

with open('../[01]data_set/data_set.txt', 'r') as file: 
    example_sentence = file.readline()

example_sentence = example_sentence.split()
print(example_sentence)



#(1) 입력받은 문장을 단어로 쪼개고, 중복을 제거해줍니다.
vocab = set(example_sentence)
vocab_size = len(example_sentence)

#(2) 단어 : 인덱스, 인덱스 : 단어를 가지는 딕셔너리를 선언해 줍니다.
word_to_index = {word:index for index, word in enumerate(vocab)}
index_to_word = {index:word for index, word in enumerate(vocab)}

#(3) 학습을 위한 데이터를 생성해 줍니다.
data = make_data(example_sentence)



#(5) 모델을 선언해주고, loss function, optimizer등을 선언해줍니다.
model = SKIP_GRAM(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

#(6) 학습을 진행합니다.
for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word_to_index)  
        log_probs = model(context_vector)
        total_loss += loss_function(log_probs, torch.tensor([word_to_index[t] for t in target]))
    print('epoch = ',epoch, ', loss = ',total_loss)
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

#﻿(7) test하고 싶은 문장을 뽑고, test를 진행합니다.
test_data = 'Skip-Gram'
test_vector = make_context_vector(test_data, word_to_index)
result = model(test_vector)
print('Prediction : ', [index_to_word[torch.argmax(r).item()] for r in result])

['In', 'the', 'case', 'of', 'CBOW,', 'one', 'word', 'is', 'eliminated,', 'and', 'the', 'word', 'is', 'predicted', 'from', 'surrounding', 'words.', 'Therefore,', 'it', 'takes', 'multiple', 'input', 'vectors', 'as', 'inputs', 'to', 'the', 'model', 'and', 'creates', 'one', 'output', 'vector.', 'In', 'contrast,', 'Skip-Gram', 'learns', 'by', 'removing', 'all', 'words', 'except', 'one', 'word', 'and', 'predicting', 'the', 'surrounding', 'words', 'in', 'the', 'context', 'through', 'one', 'word.', 'So,', 'it', 'takes', 'a', 'vector', 'as', 'input', 'and', 'produces', 'multiple', 'output', 'vectors.', 'CBOW', 'and', 'Skip-Gram', 'are', 'different.']
epoch =  0 , loss =  tensor(386.0436, grad_fn=<AddBackward0>)
epoch =  1 , loss =  tensor(385.4234, grad_fn=<AddBackward0>)
epoch =  2 , loss =  tensor(384.8090, grad_fn=<AddBackward0>)
epoch =  3 , loss =  tensor(384.2037, grad_fn=<AddBackward0>)
epoch =  4 , loss =  tensor(383.6024, grad_fn=<AddBackward0>)
epoch =  5 , loss =  tensor(383.0038, gr

epoch =  121 , loss =  tensor(298.4211, grad_fn=<AddBackward0>)
epoch =  122 , loss =  tensor(297.4701, grad_fn=<AddBackward0>)
epoch =  123 , loss =  tensor(296.5180, grad_fn=<AddBackward0>)
epoch =  124 , loss =  tensor(295.5649, grad_fn=<AddBackward0>)
epoch =  125 , loss =  tensor(294.6102, grad_fn=<AddBackward0>)
epoch =  126 , loss =  tensor(293.6547, grad_fn=<AddBackward0>)
epoch =  127 , loss =  tensor(292.6964, grad_fn=<AddBackward0>)
epoch =  128 , loss =  tensor(291.7394, grad_fn=<AddBackward0>)
epoch =  129 , loss =  tensor(290.7788, grad_fn=<AddBackward0>)
epoch =  130 , loss =  tensor(289.8176, grad_fn=<AddBackward0>)
epoch =  131 , loss =  tensor(288.8556, grad_fn=<AddBackward0>)
epoch =  132 , loss =  tensor(287.8918, grad_fn=<AddBackward0>)
epoch =  133 , loss =  tensor(286.9265, grad_fn=<AddBackward0>)
epoch =  134 , loss =  tensor(285.9602, grad_fn=<AddBackward0>)
epoch =  135 , loss =  tensor(284.9952, grad_fn=<AddBackward0>)
epoch =  136 , loss =  tensor(284.0273, 