In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import random
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import time

torch.manual_seed(1)
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")

def make_data_point(sent):
    """
        Creates a dictionary from String to an Array of Strings representing the data.  The dictionary items are:
        dic['tokens'] = Tokens padded with <START> and <STOP>
        dic['pos'] = POS tags padded with <START> and <STOP>
        dic['NP_chunk'] = Tags indicating noun phrase chunks, padded with <START> and <STOP>
        dic['gold_tags'] = The gold tags padded with <START> and <STOP>
    :param sent: String.  The input CoNLL format string
    :return: Dict from String to Array of Strings.
    """
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic

def read_data(filename):
    """
    Reads the CoNLL 2003 data into an array of dictionaries (a dictionary for each data point).
    :param filename: String
    :return: Array of dictionaries.  Each dictionary has the format returned by the make_data_point function.
    """
    data = []
    with open(filename, 'r') as f:
        sent = []
        for line in f.readlines():
            if line.strip():
                sent.append(line)
            else:
                data.append(make_data_point(sent))
                sent = []
        data.append(make_data_point(sent))

    return data


def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
        
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_embedding_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

        self.char_embedding = nn.Embedding(10, char_embedding_dim)
        self.char_cnn = nn.Conv2d(in_channels=1, out_channels=char_embedding_dim, kernel_size=(1, char_embedding_dim))

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size).to(forward_var.device)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1).to(forward_var.device)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]].to(forward_var.device)
        alpha = log_sum_exp(terminal_var)
        return alpha
    
    def get_char_indices(self, word_idx):
        """
        Extracts character indices using nltk.word_tokenize.
        """
        word = train_data[word_idx]['tokens']  # Assuming 'tokens' key holds the word
        char_indices = [word_to_ix[char] for char in word]  # Assuming word_to_ix exists
        return char_indices

    def _get_lstm_features_cnn(self, sentence):
        self.hidden = self.init_hidden()

        sentence = sentence.to(self.word_embeds.weight.device)

        # Character Embeddings (assuming you have a character embedding layer)
        char_embeddings = self.char_embedding.to(sentence.device)  # Assuming `self.char_embedding` exists

        # Get character indices for each word (replace with your logic)
        char_ids = []
        for word_idx in sentence:
            # Convert word index to a list of character indices based on your vocabulary
            chars = self.get_char_indices(word_idx)# Your logic to get character indices from word index
            char_ids.append(torch.tensor(chars).to(sentence.device)) 

        # Pad sequences to have the same length (optional, but recommended)
        char_ids = pad_sequence(char_ids, batch_first=True, padding_value=0)  # 0 for padding

        self.conv1 = nn.Conv1d(in_channels=char_embeddings.num_embeddings,  # Assuming embedding dim
                            out_channels=11,  # Adjust as needed
                            kernel_size=4,  # Experiment with kernel size
                            padding=1)  # Pad to maintain input size

        # Pass character embeddings through CNN
        cnn_out = self.conv1(char_embeddings(char_ids))
        

        lstm_out = torch.max(F.relu(cnn_out), dim=2)[0]  # ReLU activation and max pooling

        # Reshape for LSTM
        lstm_out = lstm_out.view(len(sentence), -1)

        # LSTM layers
        lstm_out, self.hidden = self.lstm(lstm_out, self.hidden)

        # Final output
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim * 2)
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _get_lstm_features(self, sentence): 
        self.hidden = self.init_hidden()
        
        sentence = sentence.to(self.word_embeds.weight.device)
        
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        
        self.hidden = tuple(h.to(embeds.device) for h in self.hidden)
        
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden) 
        
        
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats



    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1, device=feats.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq



train_data = read_data('ner.train')  # Assume this returns a list of dictionaries
dev_data = read_data('ner.dev')
test_data = read_data('ner.test')
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 40
HIDDEN_DIM = 40
CHAR_EMBEDDING_DIM = 4

word_to_ix = {word: i for i, word in enumerate(set([word for sentence in train_data + dev_data + test_data for word in sentence['tokens']]))}
tag_to_ix = {tag: i for i, tag in enumerate(set([tag for sentence in train_data + dev_data + test_data for tag in sentence['gold_tags']]))}

# Model, optimizer and loss initialization
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(train_data[0]['tokens'], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in train_data[0]['gold_tags']], dtype=torch.long).to(device)
    print(model(precheck_sent))

def generate_minibatches(training_data, batch_size):
    random.shuffle(training_data)
    minibatches = []
    for i in range(0, len(training_data), batch_size):
        minibatch = training_data[i:i + batch_size]
        minibatches.append(minibatch)
    return minibatches


def make_data_point(sent):
    """
        Creates a dictionary from String to an Array of Strings representing the data.  The dictionary items are:
        dic['tokens'] = Tokens padded with <START> and <STOP>
        dic['pos'] = POS tags padded with <START> and <STOP>
        dic['NP_chunk'] = Tags indicating noun phrase chunks, padded with <START> and <STOP>
        dic['gold_tags'] = The gold tags padded with <START> and <STOP>
    :param sent: String.  The input CoNLL format string
    :return: Dict from String to Array of Strings.
    """
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic



writer = SummaryWriter()

batch_size = 64
start_time = time.time()
for epoch in tqdm(range(10)):  # again, normally you would NOT do 300 epochs, it is toy data
    minibatches = generate_minibatches(train_data, batch_size)
    for i, minibatch in enumerate(minibatches):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentences_in = [prepare_sequence(sentence['tokens'], word_to_ix) for sentence in minibatch]
        sentences_in = pad_sequence(sentences_in, batch_first=True)
        targets = [torch.tensor([tag_to_ix[t] for t in sentence['gold_tags']], dtype=torch.long) for sentence in minibatch]
        targets = pad_sequence(targets, batch_first=True)

        # Step 3. Run our forward pass.
        loss = 0
        for sentence_in, target in zip(sentences_in, targets):
            loss += model.neg_log_likelihood(sentence_in, target)
        writer.add_scalar('Loss/train', loss.item(), epoch * len(minibatches) + i)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
    end_time = time.time()
    if epoch < 1:
        elapsed_time = end_time - start_time
        print(f"Training time for one epoch with batch size {batch_size}: {elapsed_time:.2f} seconds")
    torch.save(model.state_dict(), f'bilstm_crf_model_epoch_{epoch}.pth')

# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(train_data[0]['tokens'], word_to_ix).to(device)
    print(model(precheck_sent))

output_file = "dev_predictions.txt"
loaded_model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM)
loaded_model.load_state_dict(torch.load('bilstm_crf_model_epoch_9.pth'))
loaded_model.to(device)

# Open a file to write the predicted tags
with open('dev_predictions.txt', 'w') as f:
    # Check predictions on dev data after training
    with torch.no_grad():
        for example in dev_data:
            # Prepare the sequence from the current example in dev data
            dev_sent = prepare_sequence(example['tokens'], word_to_ix).to(device)
            # Pass the sequence to the model for prediction
            _, predicted_tags = loaded_model(dev_sent)
            
            # Convert predicted tags from tensor to a list of tag strings
            predicted_tags = [list(tag_to_ix.keys())[list(tag_to_ix.values()).index(tag_id)] for tag_id in predicted_tags]
            
            # Write the predicted tags to the file
            f.write(' '.join(predicted_tags) + '\n')

output_file = "test_predictions.txt"

# Open a file to write the predicted tags
with open('test_predictions.txt', 'w') as f:
    # Check predictions on dev data after training
    with torch.no_grad():
        for example in test_data:
            # Prepare the sequence from the current example in dev data
            dev_sent = prepare_sequence(example['tokens'], word_to_ix).to(device)
            
            # Pass the sequence to the model for prediction
            _, predicted_tags = loaded_model(dev_sent)
            
            # Convert predicted tags from tensor to a list of tag strings
            predicted_tags = [list(tag_to_ix.keys())[list(tag_to_ix.values()).index(tag_id)] for tag_id in predicted_tags]
            
            # Write the predicted tags to the file
            f.write(' '.join(predicted_tags) + '\n')

2024-03-27 20:16:48.071000: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-27 20:16:48.130131: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(tensor(27.4154, device='cuda:0'), [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1])


 10%|████████                                                                        | 1/10 [44:55<6:44:22, 2695.86s/it]

Training time for one epoch with batch size 64: 2695.85 seconds


100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [7:22:33<00:00, 2655.33s/it]


(tensor(-11257.4043, device='cuda:0'), [6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])


In [13]:
def read_iob_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Split the line into tags and filter out empty strings
            tags = line.strip().split()
            if tags:  # only add non-empty lists
                data.append(tags)
    return data

# Using the function to read a file and store data in a list
# Note: Replace 'your_file.txt' with your actual file name
file_path = './dev_predictions.txt'
iob_tags = read_iob_file(file_path)

gold_tags_list=[]
for example in dev_data:
    gold_tags_list.append(example['gold_tags'])


from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(gold, pred):
    # Flatten the list of tags into a single list for each set
    gold_flat = [item for sublist in gold for item in sublist]
    pred_flat = [item for sublist in pred for item in sublist]

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_flat, pred_flat, average='macro', labels=['I-PER', 'I-LOC', 'I-ORG']
    )

    return precision, recall, f1
calculate_metrics(gold_tags_list,iob_tags)

(0.7303628572335515, 0.7140596222866256, 0.7213724655391789)

In [12]:
def read_iob_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Split the line into tags and filter out empty strings
            tags = line.strip().split()
            if tags:  # only add non-empty lists
                data.append(tags)
    return data

# Using the function to read a file and store data in a list
# Note: Replace 'your_file.txt' with your actual file name
file_path = './test_predictions.txt'
iob_tags = read_iob_file(file_path)

gold_tags_list=[]
for example in test_data:
    gold_tags_list.append(example['gold_tags'])


from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(gold, pred):
    # Flatten the list of tags into a single list for each set
    gold_flat = [item for sublist in gold for item in sublist]
    pred_flat = [item for sublist in pred for item in sublist]

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_flat, pred_flat, average='macro', labels=['I-PER', 'I-LOC', 'I-ORG']
    )

    return precision, recall, f1
calculate_metrics(gold_tags_list,iob_tags)

(0.6551379235768838, 0.6066772753114434, 0.6286521666352259)