## predict a word is a "location" or not

In [1]:
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

def preprocess_sentence(sentence):
    return sentence.lower().split()

train_sentences = [preprocess_sentence(sent) for sent in corpus]
print(train_sentences)

locations = set(["paris", "australia", "stanford", "taiwan", "ankara","turkey"]) # 标注地点
train_labels = [[1 if word in locations else 0 for word in sent]for sent in train_sentences]
print(train_labels)

[['we', 'always', 'come', 'to', 'paris'], ['the', 'professor', 'is', 'from', 'australia'], ['i', 'live', 'in', 'stanford'], ['he', 'comes', 'from', 'taiwan'], ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]
[[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1, 0, 1]]


In [3]:
voc = set(w for s in train_sentences for w in s) # 词汇表
voc.add("<unk>") #for unknown words
voc.add("<pad>") #for words at the beginning and end
print(voc)

{'from', 'to', 'paris', 'stanford', 'we', 'come', 'always', 'he', 'australia', 'the', 'comes', 'capital', 'ankara', '<pad>', '<unk>', 'is', 'live', 'turkey', 'professor', 'taiwan', 'i', 'in', 'of'}


In [4]:
# to make every window the same length
def pad_window(sentence, window_size, pad_token="<pad>"):
  window = [pad_token] * window_size
  return window + sentence + window

# Show padding example
window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [5]:
ix_to_word = sorted(list(voc)) # 给出索引，能够找到对应的单词
print(ix_to_word)
word_to_ix={word:ix for ix,word in enumerate(ix_to_word)} # 给出单词，能找到对应的索引
print(word_to_ix)

['<pad>', '<unk>', 'always', 'ankara', 'australia', 'capital', 'come', 'comes', 'from', 'he', 'i', 'in', 'is', 'live', 'of', 'paris', 'professor', 'stanford', 'taiwan', 'the', 'to', 'turkey', 'we']
{'<pad>': 0, '<unk>': 1, 'always': 2, 'ankara': 3, 'australia': 4, 'capital': 5, 'come': 6, 'comes': 7, 'from': 8, 'he': 9, 'i': 10, 'in': 11, 'is': 12, 'live': 13, 'of': 14, 'paris': 15, 'professor': 16, 'stanford': 17, 'taiwan': 18, 'the': 19, 'to': 20, 'turkey': 21, 'we': 22}


In [6]:
def convert_token_to_indices(sentence,word_to_ix):
    return [word_to_ix.get(word,word_to_ix["<unk>"]) for word in sentence]# if word isn't in the voc -> <unk>

example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix) # kuwait是<unk>
restored_example = [ix_to_word[ind] for ind in example_indices]

# 验证成功实现了单词和索引间的相互转换
print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [7]:
example_padded_indices = [convert_token_to_indices(s,word_to_ix)for s in train_sentences] # 应用到我们的训练样本上
print(example_padded_indices)

[[22, 2, 6, 20, 15], [19, 16, 12, 8, 4], [10, 13, 11, 17], [9, 7, 8, 18], [19, 5, 14, 21, 12, 3]]


In [9]:
import torch
import torch.nn as nn
dim = 5
embeds = nn.Embedding(len(voc),dim) # 随机初始化
list(embeds.parameters())

[Parameter containing:
 tensor([[ 1.8665e+00,  2.6193e-01,  1.9000e-01, -8.2790e-01, -8.4364e-01],
         [-7.2985e-01,  1.1892e+00,  7.0656e-02,  3.8087e-01,  1.5866e+00],
         [ 7.5231e-01,  1.4707e+00, -3.1502e-01,  5.9916e-01, -1.0444e+00],
         [-1.6701e-01, -1.0700e+00, -7.9754e-02,  2.3688e-01, -2.2711e+00],
         [ 1.7645e+00,  4.8368e-01,  3.2452e-01,  2.9931e-01,  5.5431e-01],
         [-8.2392e-02,  1.5849e+00, -1.4372e+00,  6.4310e-01, -4.4925e-01],
         [ 3.2781e-01,  4.7918e-01,  1.6109e-03,  1.2416e+00,  5.9712e-01],
         [-5.4880e-01, -6.2648e-01,  1.4517e+00, -6.3594e-01,  5.9755e-01],
         [-2.3429e-01,  2.1966e-01,  1.4106e+00, -1.7326e+00, -1.2497e+00],
         [ 4.4186e-01,  7.7119e-02, -1.8368e-02, -1.2357e+00,  6.3652e-01],
         [ 8.3299e-01,  1.0178e+00, -3.4494e-01, -6.1611e-01,  7.2718e-01],
         [ 4.0414e-01, -7.7575e-01,  1.6117e-01, -2.2091e-01,  8.8417e-01],
         [ 6.6395e-02,  7.1163e-01, -2.5933e-01, -1.0118e-01, -3.

In [10]:
# Get the embedding for the word Paris
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long) # nn.Embedding的输入索引必须是torch.long类型
paris_embed = embeds(index_tensor)
print(paris_embed)

# We can also get multiple embeddings at once
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
print(embeddings)

tensor([-0.0810, -0.3722,  2.0939,  0.8163,  0.4850],
       grad_fn=<EmbeddingBackward0>)
tensor([[-0.0810, -0.3722,  2.0939,  0.8163,  0.4850],
        [-0.1670, -1.0700, -0.0798,  0.2369, -2.2711]],
       grad_fn=<EmbeddingBackward0>)


In [11]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch,window_size,word_to_ix):
    x,y = zip(*batch) #split the value and label
    x = [pad_window(sent,window_size) for sent in x]
    x = [convert_token_to_indices(sent,word_to_ix) for sent in x]
    pad_token_ix = word_to_ix["<pad>"]
    x = [torch.LongTensor(xi) for xi in x] # 同样，embedding的索引是torch.long类型
    x_p = nn.utils.rnn.pad_sequence(x,batch_first=True,padding_value=pad_token_ix) # 补齐成相同的长度

    # 对标签做相同的操作
    lengths = [len(label) for label in y]
    lenghts = torch.LongTensor(lengths)
    y = [torch.LongTensor(yi) for yi in y]
    y_p = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_p,y_p,lenghts

In [12]:
data = list(zip(train_sentences,train_labels))
batch_size = 2
shuffle = True
collate_fn = partial(custom_collate_fn,window_size=window_size,word_to_ix=word_to_ix)#only one parameter in DataLoader,so use partial 

loader = DataLoader(data,batch_size=batch_size,shuffle=shuffle,collate_fn=collate_fn)
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0],
        [ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 1, 0]])
Batched Lengths:
tensor([6, 5])

Iteration 1
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0,  0],
        [ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 2
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1]])
Batched Lengths:
tensor([4])



In [13]:
print(f"Original Tensor: ")
print(batched_x)
print("")

# Create the 2 * 2 + 1 chunks
chunk = batched_x.unfold(1, window_size*2 + 1, 1)# step = 1
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])

Windows: 
tensor([[[ 0,  0,  9,  7,  8],
         [ 0,  9,  7,  8, 18],
         [ 9,  7,  8, 18,  0],
         [ 7,  8, 18,  0,  0]]])


In [14]:
class WordWindowClassifer(nn.Module):
    def __init__(self,parameters,voc_size,pad_ix=0):
        super(WordWindowClassifer,self).__init__()
        #hyperparameters
        self.window_size = parameters["window_size"]
        self.embed_dim = parameters["embed_dim"]
        self.hidden_dim = parameters["hidden_dim"]
        self.freeze_embeddings = parameters["freeze_embeddings"]
        
        #create vectors
        self.embeds = nn.Embedding(voc_size,self.embed_dim,padding_idx=pad_ix)

        if self.freeze_embeddings:
            self.embeds.weight.requires_grad = False

        # full window size 
        full_size = 2*window_size+1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_size *self.embed_dim,self.hidden_dim),
            nn.Tanh()
        )
        self.output_layer = nn.Linear(self.hidden_dim,1)
        self.probabilities = nn.Sigmoid()

    def forward(self,input):
        B,L = input.size()

        token_windows = input.unfold(1,2*self.window_size+1,1)
        
        _,adjusted_length,_ = token_windows.size()
        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        embeded_windows = self.embeds(token_windows)
        embeded_windows = embeded_windows.view(B,adjusted_length,-1)

        layer_1 = self.hidden_layer(embeded_windows)
        output = self.output_layer(layer_1)

        output = self.probabilities(output)
        output = output.view(B,-1)

        return output

In [15]:
model_hyperparameters = {
    "batch_size": 2,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}
voc_size = len(word_to_ix)
model = WordWindowClassifer(model_hyperparameters, voc_size)

lr1 = 0.05
optimizer = torch.optim.SGD(model.parameters(),lr = lr1)

# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):
    # Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the number of words in each training example
    loss = loss / batch_lengths.sum().float()  # batch_lengths = the actual length of a sentence

    return loss

In [18]:
def train_epoch(loss_function,optimizer,model,loader):
    total_loss =0
    for batch_inputs,batch_labels,batched_lengths in loader:
        optimizer.zero_grad()
        outputs = model.forward(batch_inputs)
        loss = loss_function(outputs,batch_labels,batched_lengths)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss

def train(loss_function,optimizer,model,loader,num_epochs):
    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loss_function,optimizer,model,loader)
        if epoch %100 == 0: print(f"epoch:{epoch} loss:{epoch_loss:.4f}")

In [19]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs)

epoch:0 loss:0.0035
epoch:100 loss:0.0034
epoch:200 loss:0.0024
epoch:300 loss:0.0028
epoch:400 loss:0.0022
epoch:500 loss:0.0023
epoch:600 loss:0.0015
epoch:700 loss:0.0017
epoch:800 loss:0.0016
epoch:900 loss:0.0015


In [20]:
# Create test sentences
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]    #initialize a random label

# Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data,batch_size=1,shuffle=False,collate_fn=collate_fn)

for test_instance, labels, _ in test_loader:
  outputs = model(test_instance)
  print(labels)
  print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.0096, 0.0803, 0.0027, 0.9985]], grad_fn=<ViewBackward0>)
