# Word Embedding

In [0]:
import pandas
from collections import Counter
import pickle
import itertools
import numpy as np
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Yating\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [0]:
def process(dataframe, words_dict, label_dict, pos_dict, doc_tfidf_values, test=False):
    data = []
    START_TAG = "<START>"
    STOP_TAG = "<STOP>"
    tfidf_values = list(itertools.chain(*doc_tfidf_values))
    index = 0
    for i, row in dataframe.iterrows():
        if row['Sentence'] == "-docstart-":
            continue
        else:
            tmp = []
            sent = [START_TAG] + row['Sentence'].split() + [STOP_TAG]
            current_pos = [START_TAG] + list(list(zip(*nltk.pos_tag(sent)))[1]) + [STOP_TAG]
            tmp_pos = []
            for p in current_pos:
                if p not in pos_dict:
                    pos_dict[p] = len(pos_dict)
                tmp_pos.append(pos_dict[p])
            for word in sent:
                if word not in words_dict:
                    words_dict[word] = len(words_dict)
                tmp.append(words_dict[word])
            if not test:
                tmp_label = []
                for l in [START_TAG] + row['NER'].split() + [STOP_TAG]:
                    if l not in label_dict:
                        label_dict[l] = len(label_dict)
                    tmp_label.append(label_dict[l])
                tmp_label =  tmp_label 
                data.append({"index": index, "raw": sent, "word_index": tmp, "label": tmp_label, "tf_idf": [0] + tfidf_values[index] + [0], "POS": tmp_pos})
            else:
                data.append({"index": index, "raw": sent, "word_index": tmp, "tf_idf": [0] + tfidf_values[index] + [0], "POS": tmp_pos})
            index += 1
    
    #print(data, words_dict, label_dict, pos_dict)
    return data, words_dict, label_dict, pos_dict


def df_to_doc(df):
    """
    Loading dataframe into articles.

    Args:
        df: loaded dataframe.

    Returns:
        docs: A list of articles, each article is a list of sentences, each sentence is a list of words.
    """
    docs = []
    tmp = []
    for index, row in df.iterrows():
        if row['Sentence'] == "-docstart-":
            if len(tmp) > 0:
                docs.append(tmp)
                tmp = []
        else:
            tmp.append(row['Sentence'].split())
    docs.append(tmp)
    #print(docs)
    return docs


def process_weight_to_index(train_docs, eval_docs, test_docs, weight):
    """
    Alter the tfidf of article metrics into sentences. Matching input.
    Args:
        train_docs: list of articles from df_to_doc of training data.
        eval_docs: list of articles from df_to_doc of developing data.
        test_docs: list of articles from df_to_doc of testing data.
        weight: the tf-idf weight metrics.
    Returns:
        train_tfidf: devided tf-idf weight by sentences of training.
        eval_tfidf: devided tf-idf weight by sentences of developing.
        test_tfidf: devided tf-idf weight by sentences of testing.
    """

    doc_index = 0
    train_tfidf, eval_tfidf, test_tfidf = [],[],[]
    for doc in train_docs:
        current_weight = weight[doc_index]
        current_doc_tfidf = []
        start_index = 0
        for sent in doc:
            current_doc_tfidf.append(current_weight[start_index:len(sent) + start_index].tolist())
            start_index += len(sent)
        train_tfidf.append(current_doc_tfidf)
        doc_index += 1
    for doc in eval_docs:
        current_weight = weight[doc_index]
        current_doc_tfidf = []
        start_index = 0
        for sent in doc:
            current_doc_tfidf.append(current_weight[start_index:len(sent) + start_index].tolist())
            start_index += len(sent)
        eval_tfidf.append(current_doc_tfidf)
        doc_index += 1
    for doc in test_docs:
        current_weight = weight[doc_index]
        current_doc_tfidf = []
        start_index = 0
        for sent in doc:
            current_doc_tfidf.append(current_weight[start_index:len(sent) + start_index].tolist())
            start_index += len(sent)
        test_tfidf.append(current_doc_tfidf)
        doc_index += 1
    return train_tfidf, eval_tfidf, test_tfidf

def get_td_idf(train_raw, eval_raw, test_raw):
    """
    Computing tf-idf of data.
    """

    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    train_docs = df_to_doc(train_raw)
    eval_docs = df_to_doc(eval_raw)
    test_docs = df_to_doc(test_raw)

    corpus = []
    text = ""
    for i in itertools.chain(train_docs, eval_docs, test_docs):
        text += " ".join(itertools.chain(*i)) + " "
        corpus.append(text)
        text = ""
    
    vectorizer=CountVectorizer()
    transformer=TfidfTransformer()
    tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight=tfidf.toarray()
    train_tfidf, eval_tfidf, test_tfidf = \
        process_weight_to_index(train_docs, eval_docs, test_docs, weight)
    return train_tfidf, eval_tfidf, test_tfidf


In [0]:
def build_embedding_matrix(worddict, embeddings_file):
    """
    Build an embedding matrix with pretrained weights for a given worddict.

    Args:
        worddict: A dictionary associating words to unique integer indices.
        embeddings_file: A file containing pretrained word embeddings.

    Returns:
        A numpy matrix of size (num_words+4, embedding_dim) containing
        pretrained word embeddings (the +4 is for the padding, BOS, EOS and
        out-of-vocabulary tokens).
    """
    # Load the word embeddings in a dictionnary.
    embeddings = {}
    with open(embeddings_file, 'r', encoding='utf8') as input_data:
        for line in input_data:
            line = line.split()
            try:
                float(line[1])
                word = line[0]
                if word in worddict:
                    embeddings[word] = list(map(float, line[1:]))
            except ValueError:
                continue

    num_words = len(worddict)
    embedding_dim = len(list(embeddings.values())[0])
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in worddict.items():
        if word in embeddings:
            embedding_matrix[i] = np.array(embeddings[word], dtype=float)
        else:
            if word == "<PAD>":
                continue
            # Out of vocabulary words are initialised with random gaussian samples.
            embedding_matrix[i] = np.random.normal(size=(embedding_dim))

    return embedding_matrix

def show_data(data, len_show=5, type='train'):
    print("{} demo".format(type))
    for i in range(len_show):
        print(data[i])

In [0]:
if __name__ == "__main__":
    train_raw = pandas.read_csv("train.csv", sep=",")
    eval_raw = pandas.read_csv("val.csv", sep=",")
    test_raw = pandas.read_csv("test.csv", sep=",")

    # processing tf-idf value
    train_tfidf, eval_tfidf, test_tfidf = get_td_idf(train_raw, eval_raw, test_raw)
    
    words_dict = {"<PAD>": 0, "<OOV>":1}
    label_dict = {"<PAD>": 0}
    pos_dict = {"<PAD>": 0}

    # Processing raw data into index
    train_, words_dict, label_dict, pos_dict = \
        process(train_raw, words_dict, label_dict, pos_dict, train_tfidf)
    eval_, words_dict, label_dict, pos_dict = \
        process(eval_raw, words_dict, label_dict, pos_dict, eval_tfidf)
    test_, words_dict, label_dict, pos_dict = \
        process(test_raw, words_dict, label_dict, pos_dict, test_tfidf, test=True)
    
    # Check if the data is processed correctly
    show_data(train_, len_show=5, type='train')
    show_data(eval_, len_show=5, type='eval')
    show_data(test_, len_show=5, type='test')

    # Saving dict
    print(words_dict, file=open('word_dict.txt', 'w'))
    print(label_dict, file=open('label_dict.txt', 'w'))
    print(pos_dict, file=open('pos_dict.txt', 'w'))

    pickle.dump(train_, open('data/train.pkl', 'wb'))
    pickle.dump(eval_, open('data/val.pkl', 'wb'))
    pickle.dump(test_, open('data/test.pkl', 'wb'))

    # build embedding
    embedding_matrix = build_embedding_matrix(words_dict, 'glove.840B.300d.txt')
    pickle.dump(embedding_matrix, open('data/embedding.pkl', 'wb'))

[[['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'], ['peter', 'blackburn'], ['brussels', '1996-08-22'], ['the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shun', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['germany', "'s", 'representative', 'to', 'the', 'european', 'union', "'s", 'veterinary', 'committee', 'werner', 'zwingmann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['"', 'we', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'commission', "'s", 'chief', 'spokesman', 'nikolaus', 'van', 'der', 'pas', 'told', 'a', 'news', 'briefing', '.'], ['he', 'sai



[{'index': 0, 'raw': ['<START>', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '<STOP>'], 'word_index': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'label': [1, 2, 3, 4, 3, 3, 3, 4, 3, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0], 'POS': [1, 2, 3, 4, 2, 3, 5, 6, 2, 3, 7, 3, 8]}, {'index': 1, 'raw': ['<START>', 'peter', 'blackburn', '<STOP>'], 'word_index': [2, 13, 14, 12], 'label': [1, 6, 6, 5], 'tf_idf': [0, 0.0, 0.0, 0], 'POS': [1, 4, 9, 3, 3, 8]}, {'index': 2, 'raw': ['<START>', 'brussels', '1996-08-22', '<STOP>'], 'word_index': [2, 15, 16, 12], 'label': [1, 7, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0], 'POS': [1, 2, 4, 10, 4, 8]}, {'index': 3, 'raw': ['<START>', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shun', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '<STOP>

[{'index': 0, 'raw': ['<START>', 'cricket', '-', 'leicestershire', 'take', 'over', 'at', 'top', 'after', 'innings', 'victory', '.', '<STOP>'], 'word_index': [2, 2004, 639, 2048, 213, 778, 157, 385, 120, 2093, 1801, 11, 12], 'label': [1, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0], 'POS': [1, 3, 3, 28, 3, 3, 11, 11, 2, 11, 2, 3, 7, 3, 8]}, {'index': 1, 'raw': ['<START>', 'london', '1996-08-30', '<STOP>'], 'word_index': [2, 264, 7916, 12], 'label': [1, 7, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0], 'POS': [1, 2, 2, 2, 3, 8]}, {'index': 2, 'raw': ['<START>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took', 'four', 'for', '38', 'on', 'friday', 'as', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', 'in', 'two', 'days', 'to', 'take', 'over', 'at', 'the', 'head', 'of', 'the', 'county', 'championship', '.', '<STOP>'], 'word_index': [2, 1004, 5479, 2288, 1696, 2051, 767, 2170, 71, 7917, 21, 1097,

train demo
{'index': 0, 'raw': ['<START>', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '<STOP>'], 'word_index': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'label': [1, 2, 3, 4, 3, 3, 3, 4, 3, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0], 'POS': [1, 2, 3, 4, 2, 3, 5, 6, 2, 3, 7, 3, 8]}
{'index': 1, 'raw': ['<START>', 'peter', 'blackburn', '<STOP>'], 'word_index': [2, 13, 14, 12], 'label': [1, 6, 6, 5], 'tf_idf': [0, 0.0, 0.0, 0], 'POS': [1, 4, 9, 3, 3, 8]}
{'index': 2, 'raw': ['<START>', 'brussels', '1996-08-22', '<STOP>'], 'word_index': [2, 15, 16, 12], 'label': [1, 7, 3, 5], 'tf_idf': [0, 0.0, 0.0, 0], 'POS': [1, 2, 4, 10, 4, 8]}
{'index': 3, 'raw': ['<START>', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shun', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', 

# Data Loading

In [0]:
"""
Dataset definition for NER datasets.##输进去RNN的数据是定长的
"""

import torch
from torch.utils.data import Dataset

In [0]:
class NERDataset(Dataset):
    def __init__(self,
           datas,
           padding_idx=0,
           max_sentence_length=None,
           test=False
           ):
        self.lengths = [len(seq["word_index"]) for seq in datas]
        self.max_length = max_sentence_length
        if self.max_length is None:
            self.max_length = max(self.lengths)

        self.num_sequences = len(datas)

        self.data = {
            'word_index': torch.ones(self.num_sequences, self.max_length, dtype=torch.long)* padding_idx,
            'tf_idf': torch.ones(self.num_sequences, self.max_length, dtype=torch.float)* padding_idx,
            'pos': torch.ones(self.num_sequences, self.max_length, dtype=torch.long)* padding_idx,
            'labels': torch.ones(self.num_sequences, self.max_length, dtype=torch.long)* padding_idx, 
            'length': [0] * self.num_sequences
        }

        for i, data in enumerate(datas):
            end = min(len(data["word_index"]), self.max_length)
            self.data["word_index"][i][:end] = torch.tensor(data["word_index"][:end])
            self.data["tf_idf"][i][:end] = torch.tensor(data["tf_idf"][:end])
            self.data["pos"][i][:end] = torch.tensor(data["POS"][:end])
            self.data["length"][i] = min(len(data["word_index"]), self.max_length)
            if not test:
                self.data["labels"][i][:end] = torch.tensor(data["label"][:end])
    def __len__(self):
        return self.num_sequences

    def __getitem__(self, index):
        return {"word_index": self.data["word_index"][index],
                "tf_idf": self.data["tf_idf"][index],
                "pos": self.data["pos"][index],
                "length": self.data["length"][index],
                "labels": self.data["labels"][index]}


# Model

In [8]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [11]:
pwd

'/content'

In [0]:
import torch
import torch.nn as nn
# from layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention
from torchcrf import CRF
# from utils import get_mask
import math
import torch.nn.functional as F

START_TAG = '<START>'
STOP_TAG = '<STOP>'

In [12]:

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def sort_by_seq_lens(batch, sequences_lengths, descending=True):
    sorted_seq_lens, sorting_index =\
        sequences_lengths.sort(0, descending=descending)

    sorted_batch = batch.index_select(0, sorting_index)

    idx_range =\
        sequences_lengths.new_tensor(torch.arange(0, len(sequences_lengths)))
    _, reverse_mapping = sorting_index.sort(0, descending=False)
    restoration_index = idx_range.index_select(0, reverse_mapping)

    return sorted_batch, sorted_seq_lens, sorting_index, restoration_index

In [0]:
def get_mask(sequences_batch, sequences_lengths):
    batch_size = sequences_batch.size()[0]
    max_length = torch.max(sequences_lengths)
    mask = torch.ones(batch_size, max_length, dtype=torch.float)
    mask[sequences_batch[:, :max_length] == 0] = 0.0

    return mask

In [0]:
class RNNDropout(nn.Dropout):
    def forward(self, sequences_batch):
        ones = sequences_batch.data.new_ones(sequences_batch.shape[0],
                                             sequences_batch.shape[-1])
        dropout_mask = nn.functional.dropout(ones, self.p, self.training,
                                             inplace=False)
        return dropout_mask.unsqueeze(1) * sequences_batch

In [0]:
class Seq2SeqEncoder(nn.Module):
    def __init__(self,
                 rnn_type,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 bias=True,
                 dropout=0.0,
                 bidirectional=False):
        assert issubclass(rnn_type, nn.RNNBase)
        super(Seq2SeqEncoder, self).__init__()
        self.rnn_type = rnn_type
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.dropout = dropout
        self.bidirectional = bidirectional

        self._encoder = nn.LSTM(input_size,
                     hidden_size,
                     num_layers=num_layers,
                     bias=bias,
                     batch_first=True,
                     dropout=dropout,
                     bidirectional=bidirectional)

    def forward(self, sequences_batch, sequences_lengths):
        sorted_batch, sorted_lengths, _, restoration_idx = \
            sort_by_seq_lens(sequences_batch, sequences_lengths)
        packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch,
                                 sorted_lengths,
                                 batch_first=True)
        outputs, hidden = self._encoder(packed_batch, None)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs,
                                batch_first=True)
        reordered_outputs = outputs.index_select(0, restoration_idx)
        hidden = (hidden[0].index_select(1, restoration_idx),
                  hidden[1].index_select(1, restoration_idx))
        return reordered_outputs, hidden


In [0]:
class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, mask=None):
        key, value = query, query
        batch_size = query.size(0)
        mask = (mask > 0).unsqueeze(1).repeat(1, mask.size(1), 1).unsqueeze(1)
        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)

In [0]:
class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) \
                 / math.sqrt(query.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

In [0]:
class LSTM_Attention(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layer, 
                 dropout=0.0):
        super().__init__()
        self.encoder = Seq2SeqEncoder(nn.LSTM,
                                      input_size,
                                      hidden_size,
                                      num_layers=num_layer,
                                      bidirectional=True)
        self.m = torch.nn.Softmax(dim=-1)
        self.w = torch.nn.Parameter(torch.randn(hidden_size * 2))

    def forward(self, input, length):
        encoder_output, state = self.encoder(input, length)
        encoder_output = encoder_output.matmul(self.w)
        encoder_output = self.m(encoder_output)
        encoder_output = encoder_output.unsqueeze(-1).repeat(1, 1, input.shape[-1])
        return input * encoder_output, state

In [0]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, args, vocab_size, tag_to_ix, embeddings, pos_size):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = args.embedding_dim
        self.hidden_dim = args.hidden
        self.pos_dim = args.pos_dim
        self.args = args
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.cell = nn.LSTM 

        self.tagset_size = len(tag_to_ix)
        if args.self_attention:
            self.self_attentin = MultiHeadedAttention(args.num_head, self.hidden_dim * 2)
            self.lstm2 = Seq2SeqEncoder(self.cell,
                            self.hidden_dim*2,
                            self.hidden_dim,
                            num_layers = args.num_layers,
                            bidirectional=args.bidirectional)
        if args.LSTM_attention:
            self.lstm3 = LSTM_Attention(self.hidden_dim * 2, self.hidden_dim, args.num_layers)
        if args.from_pretrain_embedding:
            self.word_embeds = nn.Embedding(vocab_size,
                                self.embedding_dim,
                                padding_idx=0,
                                _weight=embeddings)
        else:
            self.word_embeds = nn.Embedding(vocab_size,
                                self.embedding_dim)
        self.rnn_input_dim = self.embedding_dim
        if args.use_pos:
            self.rnn_input_dim += self.pos_dim
        if args.use_tfidf:
            self.rnn_input_dim += 1
        self.lstm = Seq2SeqEncoder(self.cell,
                                  self.rnn_input_dim,
                                  self.hidden_dim,
                                  num_layers = args.num_layers,
                                  bidirectional=args.bidirectional)
        self.pos_embeds = nn.Embedding(pos_size, self.pos_dim)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim * (2 if args.bidirectional else 1), self.tagset_size)
        self.crf = CRF(self.tagset_size)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _make_embeddings(self, sentence):
        a,b,c = sentence
        embeds = self.word_embeds(a) #B*N*H_EMB
        pos = self.pos_embeds(c) #B*N*H_POS
        tf_idf = b #B*H
        if self.args.use_pos and self.args.use_tfidf:
            all_ = torch.cat([embeds, pos, tf_idf.unsqueeze(-1)], dim=-1)
        elif self.args.use_pos and not self.args.use_tfidf:
            all_ = torch.cat([embeds, pos], dim=-1)
        elif not self.args.use_pos and self.args.use_tfidf:
            all_ = torch.cat([embeds, tf_idf.unsqueeze(-1)], dim=-1)
        else:
            all_ = embeds
        return all_

    def _get_lstm_features(self, sentence, length, mask):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        embeds = self._make_embeddings(sentence)
        # print(embeds.shape, length.shape)
        lstm_out, _ = self.lstm(embeds, length)
        if self.args.self_attention and not self.args.LSTM_attention:
            attention_out = self.self_attentin(lstm_out, mask)
            lstm_out, _ = self.lstm2(attention_out, length)
        elif self.args.self_attention and self.args.LSTM_attention:
            attention_out = self.self_attentin(lstm_out, mask)
            lstm_out, _ = self.lstm3(attention_out, length)

        elif self.args.LSTM_attention:
            lstm_out, _ = self.lstm3(lstm_out, length)

        # lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score


    def forward(self, sentence, length, mask, label):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, length, mask)
        # Find the best path, given the features.
        score = - self.crf(lstm_feats.permute(1,0,2), label.permute(1,0), mask=mask.permute(1,0))
        predict = self.crf.decode(lstm_feats.permute(1,0,2), mask=mask.permute(1,0))
        return score, predict


# Train

In [0]:
import torch
import pickle
from torch.utils.data import DataLoader
import os
import time
from tqdm import tqdm
import numpy as np
import torch.nn as nn
import itertools
from sklearn.metrics import f1_score

In [0]:
def train(model, dataloader, optimizer, max_gradient_norm, device='cpu'):
    model.train()

    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0

    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, batch in enumerate(tqdm_batch_iterator): #从迭代器获取数据
        batch_start = time.time()

        # Move input and output data to the GPU if it is used.
        wid = batch['word_index'].to(device)
        tf_idf = batch['tf_idf'].to(device)
        pos = batch['pos'].to(device)
        length = batch['length'].to(device)
        labels = batch['labels'].to(device)[:, :max(length)] #切
        optimizer.zero_grad()
        mask = get_mask(wid, length).bool().to(device)

        loss, pred = model((wid, tf_idf, pos), length, mask, labels)
        # loss = criterionlogits, labels)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        pred = torch.tensor(list(itertools.chain(*pred))).to(device)
        labels = labels[mask]
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += f1_score(pred.detach().cpu(), labels.detach().cpu(), average='micro')

        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}, accu: {}" \
            .format(batch_time_avg / (batch_index + 1),
                    running_loss / (batch_index + 1),
                    f1_score(pred.detach().cpu(), labels.detach().cpu(), average='micro'))
        tqdm_batch_iterator.set_description(description)

    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader)

    return epoch_time, epoch_loss, epoch_accuracy

In [0]:
def validate(model, dataloader, device='cpu'): ## 不需要梯度裁剪和梯度，只需要模型的输出
    model.eval()

    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    total_length = 0
    all_preds, all_labels = [], []

    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, batch in enumerate(tqdm_batch_iterator):
        batch_start = time.time()

        # Move input and output data to the GPU if it is used.
        wid = batch['word_index'].to(device)
        tf_idf = batch['tf_idf'].to(device)
        pos = batch['pos'].to(device)
        length = batch['length'].to(device)
        labels = batch['labels'].to(device)[:, :max(length)]
        mask = get_mask(wid, length).bool().to(device)

        loss, pred = model((wid, tf_idf, pos), length, mask, labels)
        pred = torch.tensor(list(itertools.chain(*pred))).to(device)
        labels = labels[mask]
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        all_preds = all_preds + [pred]
        all_labels = all_labels + [labels]
        total_length += torch.sum(length,dtype=torch.float32).item()
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}, accu: {}" \
            .format(batch_time_avg / (batch_index + 1),
                    running_loss / (batch_index + 1),
                    f1_score(pred.detach().cpu(), labels.detach().cpu(), average='micro'))
        tqdm_batch_iterator.set_description(description)
    all_preds = torch.cat(all_preds,dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = f1_score(all_preds.detach().cpu(), all_labels.detach().cpu(), average='micro')

    return epoch_time, epoch_loss, epoch_accuracy

In [0]:
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(args.train_file, 'rb') as pkl:
        train_data = NERDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size)

    print("\t* Loading validation data...")
    with open(args.valid_file, 'rb') as pkl:
        valid_data = NERDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=args.batch_size)

    # -------------------- Model definition ------------------- #
    print('\t* Building model...')
    with open(args.embeddings_file, 'rb') as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float) \
            .to(device)
    word_to_ix = eval(open(args.word_dict_file).readline())
    tag_to_ix = eval(open(args.tag_file).readline())
    pos_to_ix = eval(open(args.pos_file).readline())



    model = BiLSTM_CRF(args, len(word_to_ix), tag_to_ix, embeddings, len(pos_to_ix)).to(device)
    print(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
   
    best_dev = 0
    for i in range(20):
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, args.max_gradient_norm, device=device)
        print("train accuracy:", epoch_accuracy)
        epoch_time, epoch_loss, epoch_accuracy = validate(model, valid_loader, device=device)
        print("eval accuracy:", epoch_accuracy)
        if best_dev < epoch_accuracy:
            best_dev = epoch_accuracy
            torch.save(model.state_dict(), os.path.join(args.target_dir, "best_model.pt"))
    print("BEST", best_dev)

In [0]:
from collections import namedtuple

if __name__ == "__main__":
    
    import argparse
    ArgsStruct = namedtuple('args', 'hidden pos_dim learning_rate target_dir batch_size epoch max_gradient_norm num_layers bidirectional embedding_dim from_pretrain_embedding use_pos use_tfidf self_attention LSTM_attention num_head train_file valid_file test_file embeddings_file tag_file pos_file word_dict_file ')
    args = ArgsStruct(hidden=300,
              pos_dim=50,
              learning_rate=1e-3,
              target_dir="model/layer5",
              batch_size=32,
              epoch=10,
              max_gradient_norm=5.0,
              num_layers=5,
              bidirectional=True,
              embedding_dim=300,
              from_pretrain_embedding=True,
              use_pos=True,
              use_tfidf=True,
              self_attention=True,
              LSTM_attention=True,
              num_head=2,
              train_file='/train.pkl',
              valid_file='/eval.pkl',
              test_file="",
              embeddings_file='/embedding.pkl',
              tag_file='/label_dict.txt',
              pos_file='/pos_dict.txt',
              word_dict_file='/word_dict.txt')
    main(args)

	* Loading training data...
	* Loading validation data...
	* Building model...


  0%|          | 0/88 [00:00<?, ?it/s]

BiLSTM_CRF(
  (self_attentin): MultiHeadedAttention(
    (linear_layers): ModuleList(
      (0): Linear(in_features=600, out_features=600, bias=True)
      (1): Linear(in_features=600, out_features=600, bias=True)
      (2): Linear(in_features=600, out_features=600, bias=True)
    )
    (output_linear): Linear(in_features=600, out_features=600, bias=True)
    (attention): Attention()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lstm2): Seq2SeqEncoder(
    (_encoder): LSTM(600, 300, num_layers=5, batch_first=True, bidirectional=True)
  )
  (lstm3): LSTM_Attention(
    (encoder): Seq2SeqEncoder(
      (_encoder): LSTM(600, 300, num_layers=5, batch_first=True, bidirectional=True)
    )
    (m): Softmax(dim=-1)
  )
  (word_embeds): Embedding(13975, 300, padding_idx=0)
  (lstm): Seq2SeqEncoder(
    (_encoder): LSTM(351, 300, num_layers=5, batch_first=True, bidirectional=True)
  )
  (pos_embeds): Embedding(44, 50)
  (hidden2tag): Linear(in_features=600, out_features=8, bias=True)
  (c

  
Avg. batch proc. time: 4.3113s, loss: 507.0462, accu: 0.8155844155844156: 100%|██████████| 88/88 [06:19<00:00,  4.32s/it]
  0%|          | 0/21 [00:00<?, ?it/s]

train accuracy: 0.7633194864757272


Avg. batch proc. time: 0.6141s, loss: 377.2921, accu: 0.8422222222222222: 100%|██████████| 21/21 [00:12<00:00,  1.62it/s]
  0%|          | 0/88 [00:00<?, ?it/s]

eval accuracy: 0.8006772009029344


Avg. batch proc. time: 4.9732s, loss: 296.3501, accu: 0.8531645569620253: 100%|██████████| 88/88 [07:18<00:00,  4.98s/it]
  0%|          | 0/21 [00:00<?, ?it/s]

train accuracy: 0.8711212581597785


Avg. batch proc. time: 0.6615s, loss: 201.7592, accu: 0.9311111111111111: 100%|██████████| 21/21 [00:13<00:00,  1.50it/s]


eval accuracy: 0.8951467268623025


Avg. batch proc. time: 4.8888s, loss: 190.8340, accu: 0.9535398230088495:  56%|█████▌    | 49/88 [03:59<03:08,  4.84s/it]