In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
pip install contractions fasttext

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/85/41/c3dfd5feb91a8d587ed1a59f553f07c05f95ad4e5d00ab78702fbf8fe48a/contractions-0.0.24-py2.py3-none-any.whl
Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 2.0MB/s 
[?25hCollecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 8.1MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0

In [0]:
import string
from collections import Counter, OrderedDict
import re
import numpy as np
import pandas as pd
from contractions import contractions_dict
import unicodedata
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
import nltk
from nltk.corpus import stopwords
import time
import fasttext
from gensim.models.fasttext import FastText

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torchtext import data as data_t
torch.manual_seed(1)

<torch._C.Generator at 0x7ff249aa1cb0>

In [0]:
COLUMN_LIST = ["tweet_id", "begin", "end", "type", "extraction", "drug", "tweet", "meddra_code", "meddra_term"]

In [0]:
# Column names of the dataset TSV files (the same names appear in COLUMN_LIST).
TWEET_COL = "tweet"
TWEET_ID = "tweet_id"
BEGIN_COL = "begin"
END_COL = "end"
TYPE_COL = "type"
EXTRACTION_COL = "extraction"
DRUG_COL = "drug"
MEDDRA_CODE = "meddra_code"
MEDDRA_TERM = "meddra_term"
# Placeholder values written into missing extraction / drug cells.
NON_ADR = "NO-ADR"
NO_DRUG = "NO-DRUG"
# Per-token labels used for sequence tagging.
DRUG_TAG = "D"
ADR_TAG = "A"
OTHER_TAG = "O"
# Special tokens / tags: START_TAG and STOP_TAG delimit a tag sequence in the
# CRF, UNKNOWN is the vocabulary entry used for out-of-vocabulary words.
PAD_TOKEN = "PAD_T"
START_TAG = "START-T"
STOP_TAG = "STOP-T"
UNKNOWN = "UNK-T"
# Separator used whenever tokens are joined back into a single string.
BLANK_SPACE = " "

In [0]:
spacy_model = spacy.load("en_core_web_sm")

In [0]:
# get stopwords from nltk
nltk.download('stopwords')
NLTK_STOP = list(set(stopwords.words("english")))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# load the training and validation sets from the mounted Google Drive
# The training TSV is restricted to COLUMN_LIST; the validation TSV is read
# with every column it contains.
data = pd.read_csv("/content/gdrive/My Drive/task3_training.tsv", sep="\t", usecols=COLUMN_LIST)
data_val = pd.read_csv("/content/gdrive/My Drive/task3_validation.tsv", sep="\t")

In [0]:
# accented char conversion, use if english
def removeAccented(tweet):
  """Return `tweet` with every non-ASCII character removed.

  The text is NFKD-normalised first, so an accented letter is split into its
  base letter plus combining marks; only the ASCII part survives the encode.
  """
  decomposed = unicodedata.normalize('NFKD', tweet)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

In [0]:
# expand all contractions in given sentence
def expandContr(tweet, c_dict=contractions_dict):
  """Replace every whitespace-separated token found in `c_dict` by its expansion.

  Tokens are looked up exactly as they appear (no case folding, no stripping
  of attached punctuation); anything not in `c_dict` is kept unchanged. The
  result is re-joined with single spaces.
  """
  expanded_tweet = [c_dict[token] if token in c_dict else token
                    for token in tweet.split()]
  return BLANK_SPACE.join(expanded_tweet)

In [0]:
HANDLE_MENTION = True
HANDLE_HASHTAG = True
REMOVE_DIGITS = True

In [0]:
# custom punctuation removal 
def removePunct(tweet):
  """Strip non-letter characters from every whitespace-separated token.

  * With HANDLE_MENTION / HANDLE_HASHTAG enabled, a token starting with "@" /
    "#" only loses that first character; the rest of the token is kept
    verbatim (it is not passed through the character filter below).
  * Every other token keeps only its alphabetic characters when
    REMOVE_DIGITS is set, otherwise only its alphanumeric characters.

  Tokens that end up empty are still emitted, so the returned string may
  contain consecutive spaces.
  """
  keep_char = str.isalpha if REMOVE_DIGITS else str.isalnum
  tokens = []
  for token in str(tweet).split():
    is_mention = HANDLE_MENTION and token[0] == "@"
    is_hashtag = HANDLE_HASHTAG and token[0] == "#"
    if is_mention or is_hashtag:
      # drop only the leading marker
      tokens.append(token[1:])
    else:
      tokens.append("".join(filter(keep_char, token)))
  return BLANK_SPACE.join(tokens)

In [0]:
# perform lemmatization
def lemmatize(tweet):
  """Run `tweet` through the spaCy pipeline and join the token lemmas with spaces.

  Tokens whose lemma is the "-PRON-" placeholder keep their original text.
  """
  lemmas = []
  for token in spacy_model(tweet):
    lemmas.append(token.text if token.lemma_ == "-PRON-" else token.lemma_)
  return BLANK_SPACE.join(lemmas)

In [0]:
N = 30
USE_NLTK = True

In [0]:
# generate top N custom stopwords 
def generateCustomStop(data):
  """Return the (up to) N most frequent corpus tokens that are known stop words.

  Tokens are obtained by whitespace-splitting every entry of data[TWEET_COL]
  and are compared to the stop-word list exactly as they appear (no case
  folding). The reference list is NLTK_STOP when USE_NLTK is set, otherwise
  spaCy's STOP_WORDS.

  Returns:
    A list of stop words ordered from most to least frequent. It holds fewer
    than N entries when the corpus contains fewer than N distinct stop words
    (previously this case raised IndexError from popping an empty list).
  """
  # a set gives constant-time membership tests
  stop_set = set(NLTK_STOP if USE_NLTK else STOP_WORDS)

  count_words = Counter()
  for tweet in data[TWEET_COL]:
    count_words.update(tweet.split())

  custom_stopwords = []
  # most_common() is already sorted by descending frequency
  for word, _ in count_words.most_common():
    if len(custom_stopwords) == N:
      break
    if word in stop_set:
      custom_stopwords.append(word)
  return custom_stopwords

In [0]:
# Switches for the individual steps of cleanTweet.
R_ACC = True        # strip accented / non-ASCII characters (removeAccented)
CONTR = True        # expand contractions (expandContr)
R_URL = True        # delete http... URLs
R_PUNCT = True      # remove punctuation (removePunct)
LEMMATIZE = True    # lemmatize with spaCy (lemmatize)
R_STOP = True       # remove stop words
CUSTOM_STOP = True  # use the corpus-derived stop-word list instead of spaCy's STOP_WORDS
# The custom list is built from the tweets as loaded, i.e. before any cleaning.
if CUSTOM_STOP:
  custom_stopwords = generateCustomStop(data)

In [0]:
# clean up
def cleanTweet(tweet):
  """Normalise one tweet and return it as a single-space separated string.

  Steps, each controlled by its module-level switch, are applied in this
  order: lower-casing (always), accent removal, contraction expansion, URL
  removal, punctuation removal, lemmatization, stop-word removal. Stop words
  come from `custom_stopwords` when CUSTOM_STOP is set, else from STOP_WORDS.
  """
  cleaned = tweet.lower()

  if R_ACC:
    cleaned = removeAccented(cleaned)
  if CONTR:
    cleaned = expandContr(cleaned)
  if R_URL:
    cleaned = re.sub(r"http\S+", "", cleaned)
  if R_PUNCT:
    cleaned = removePunct(cleaned)
  if LEMMATIZE:
    cleaned = lemmatize(cleaned)
  if R_STOP:
    blocked = custom_stopwords if CUSTOM_STOP else STOP_WORDS
    cleaned = BLANK_SPACE.join(tok for tok in cleaned.split() if tok not in blocked)

  # tweet = BLANK_SPACE.join([dict_drugs[i] if i in dict_drugs else i for i in tweet.split()])
  # collapse whatever whitespace the previous steps left behind
  return BLANK_SPACE.join(tok for tok in cleaned.split() if len(tok) != 0)

In [0]:
def cleanData(data, column_name):
  """Return a copy of `data` whose `column_name` column has been passed through cleanTweet.

  The input frame itself is left untouched.
  """
  cleaned_column = data[column_name].apply(cleanTweet)
  result = data.copy()
  result[column_name] = cleaned_column
  return result

In [0]:
def getColumnAsList(data, column_name):
  """Return the values stored under `column_name` in `data` as a plain list."""
  return [value for value in data[column_name]]

In [0]:
# Fill missing extraction / drug cells with their placeholders and lower-case
# the drug names. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection operates on an intermediate
# object and is not guaranteed to update `data` (it does not under pandas
# Copy-on-Write).
data[EXTRACTION_COL] = data[EXTRACTION_COL].fillna(NON_ADR)
data[DRUG_COL] = data[DRUG_COL].fillna(NO_DRUG).str.lower()

In [0]:
# clean the data
data = cleanData(data, TWEET_COL)
data = cleanData(data, EXTRACTION_COL)

In [0]:
data = data.drop_duplicates([TWEET_COL])

In [0]:
# data_overload = cleanData(data_overload, TWEET_COL)

In [0]:
# get tag to vector
# tag_list = [ADR_TAG, DRUG_TAG, OTHER_TAG, START_TAG, STOP_TAG]
# Tags known to the model; each tag is mapped to its position in this list.
tag_list = [ADR_TAG, OTHER_TAG, START_TAG, STOP_TAG]
tag_to_vector = dict(zip(tag_list, range(len(tag_list))))

In [0]:
# get word list
def getWordNERList(data):
  """Tokenise every tweet in `data` and tag each token as ADR or OTHER.

  A token (obtained with split(" ")) is tagged ADR_TAG when it also occurs
  among the space-separated words of the row's EXTRACTION_COL value, and
  OTHER_TAG otherwise.

  Returns a 5-tuple:
    word_ner      -- list of (tokens, tags) pairs, one per row
    words_list    -- flat list of all tokens, in row order
    sentence_list -- list of the raw TWEET_COL strings
    tag_list      -- flat list of all tags, aligned with words_list
    words_list    -- the same flat token list again (kept for compatibility
                     with existing callers that unpack five values)
  """
  word_ner = []
  words_list = []
  sentence_list = []
  tag_list = []
  for _, row in data.iterrows():
    tweet = row[TWEET_COL]
    # split the extraction once per row instead of once per token
    extraction_words = set(row[EXTRACTION_COL].split(" "))
    sentence_word = tweet.split(" ")
    sentence_tag = [ADR_TAG if word in extraction_words else OTHER_TAG
                    for word in sentence_word]

    sentence_list.append(tweet)
    words_list.extend(sentence_word)
    tag_list.extend(sentence_tag)
    word_ner.append((sentence_word, sentence_tag))
  return word_ner, words_list, sentence_list, tag_list, words_list

In [0]:
sentence_tags, all_words, all_sentences, all_tags, all_sentence_words = getWordNERList(data)

In [0]:
# dump the cleaned sentences, one per line, for fastText training
with open('/content/gdrive/My Drive/model_sentences.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % sentence for sentence in all_sentences)

In [0]:
len(all_words)

In [0]:
def getWordVecs(data):
  """Build a word -> integer index mapping from an iterable of words.

  UNKNOWN always receives index 0; every other word gets the next free index
  the first time it is seen.
  """
  word_to_ix = {UNKNOWN: 0}
  for word in data:
    # setdefault leaves already-indexed words untouched
    word_to_ix.setdefault(word, len(word_to_ix))
  return word_to_ix

In [0]:
word_to_ix = getWordVecs(all_words)
len(word_to_ix)

In [0]:
# Same preprocessing as for the training frame. The filled columns are
# assigned back explicitly: fillna(inplace=True) on a column selection works
# on an intermediate object and is not guaranteed to update `data_val` (it
# does not under pandas Copy-on-Write).
data_val[EXTRACTION_COL] = data_val[EXTRACTION_COL].fillna(NON_ADR)
data_val[DRUG_COL] = data_val[DRUG_COL].fillna(NO_DRUG).str.lower()
data_val = cleanData(data_val, TWEET_COL)
data_val = cleanData(data_val, EXTRACTION_COL)
data_val = data_val.drop_duplicates([TWEET_COL])

sentence_tags_valll, all_words_val, all_sentences_val, all_tags_val, all_sentence_words_val = getWordNERList(data_val)

In [0]:
print(len(all_words_val))

In [0]:
# parameters
EMBEDDING_DIM = 100    # size of the fastText vectors and of the model's embedding layer
HIDDEN_DIM = 64        # BiLSTM output size (split evenly over the two directions)
WINDOW_SIZE = 10       # fastText context window (`ws`)
MIN_WORD = 5           # fastText minimum word count (`minCount`)
DOWN_SAMPLING = 1e-2   # fastText sampling threshold (`t`)

In [0]:
# fasttext model
# Trains unsupervised skip-gram embeddings on the sentence file written above.
fasttext_model = fasttext.train_unsupervised('/content/gdrive/My Drive/model_sentences.txt', model='skipgram', dim=EMBEDDING_DIM,
                                    ws=WINDOW_SIZE, minCount=MIN_WORD, t=DOWN_SAMPLING)

In [0]:
# get embeddings for each word
# Row i of the matrix must hold the vector of the word whose index in
# word_to_ix is i, because the model looks embeddings up by those indices.
# (Iterating over set(all_words) produced rows in arbitrary set order, which
# does not match the word_to_ix indices.) The UNKNOWN row stays all-zero.
tensor_list = [[0.0] * EMBEDDING_DIM for _ in range(len(word_to_ix))]
for word, idx in word_to_ix.items():
  if word == UNKNOWN:
    continue
  tensor_list[idx] = list(fasttext_model[word])
embeddings_fasttext = torch.Tensor(tensor_list)

In [0]:
embeddings_fasttext.size()

In [0]:
def argmax(vec):
    """Return, as a Python int, the column index of the maximum of a one-row 2-D tensor."""
    best_index = torch.max(vec, 1)[1]
    return best_index.item()

# convert word to positional vector representation
def prepare_sequence(seq, to_ix):
    """Map each word of `seq` to its index in `to_ix` and return a long tensor.

    Words missing from `to_ix` are mapped to the index stored under UNKNOWN.
    """
    idxs = []
    for token in seq:
        if token in to_ix:
            idxs.append(to_ix[token])
        else:
            idxs.append(to_ix[UNKNOWN])
    return torch.tensor(idxs, dtype=torch.long)


# compute log sum exponent for the forward algorithm
def log_sum_exp(vec):
    """Return log(sum(exp(vec))) of a one-row 2-D tensor as a 0-dim tensor.

    Uses torch.logsumexp, which performs the max-shift for numerical
    stability internally and avoids the device-to-host `.item()` round trip
    of the hand-written version. The result stays attached to the autograd
    graph of `vec`.
    """
    # reduce over the tag dimension, then drop the leading size-1 dimension
    return torch.logsumexp(vec, dim=1).squeeze(0)

# calc epoch time
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

In [0]:
import torch.nn as nn
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM tagger with a CRF layer on top.

    The BiLSTM turns a sequence of word indices into per-token tag scores
    (emissions); a learned transition matrix scores consecutive tag pairs.
    `neg_log_likelihood` is the training loss and `forward` returns the
    Viterbi-decoded best tag sequence. The model processes one sentence at a
    time (batch dimension fixed to 1).
    """

    def __init__(self, vocab_size, tag_to_vector, embedding_dim, hidden_dim):
        """Build the embedding, BiLSTM, projection and transition parameters.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_vector: mapping tag -> index; must contain START_TAG and
                STOP_TAG.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the concatenated BiLSTM output; each direction
                gets hidden_dim // 2 units, so an even value is expected
                (an odd one would break the reshape in _get_lstm_features).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_vector = tag_to_vector
        self.tagset_size = len(tag_to_vector)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        # debug output: prints the vocabulary size on construction
        print(self.vocab_size)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning to 'i' from 'j'.
        # CRF graph for transitions amongst different labels/states
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # So that we never transfer to the start tag backward
        # and we never transfer from the stop tag forward
        self.transitions.data[tag_to_vector[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_vector[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh (h0, c0) pair of random tensors of shape (2, 1, hidden_dim // 2).

        NOTE(review): the states are drawn with torch.randn on every call, and
        _get_lstm_features calls this for every sentence, so two passes over
        the same input can differ (also in eval mode).
        """
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """Run the CRF forward recursion over the emission scores `feats`.

        Returns the log-sum-exp of the final forward variables plus the
        transition scores into STOP_TAG, i.e. the normaliser used by
        neg_log_likelihood.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_vector[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence and find which feature
        # has how much probability of changing to which state
        # based on lstm features, general transition score
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # emission score of next_tag, broadcast over all previous tags
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # transition score into next_tag from every previous tag
                trans_score = self.transitions[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_vector[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Compute emission scores for `sentence` (a tensor of word indices).

        Returns a tensor of shape (len(sentence), tagset_size). The LSTM
        state is re-initialised (randomly, see init_hidden) on every call.
        """
        self.hidden = self.init_hidden()
        # print(self.hidden[0].size())
        # shape the embeddings as (seq_len, batch=1, embedding_dim) for the LSTM
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # reshape for next layer: drop the batch dimension
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of the tag sequence `tags` (long tensor) under `feats`.

        The score sums, for every position, the transition score from the
        previous tag and the emission score of the current tag, plus the
        final transition into STOP_TAG. The result has shape (1,).
        """
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        # prepend START_TAG so tags[i] is the predecessor of tags[i + 1]
        tags = torch.cat([torch.tensor([self.tag_to_vector[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_vector[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag sequence for the emission scores `feats`.

        Returns:
            (path_score, best_path): the score of the best path and the path
            itself as a list of tag indices, one per row of `feats`.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_vector[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # emission scores are added once per step, after the max over predecessors
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_vector[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag
        start = best_path.pop()
        assert start == self.tag_to_vector[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: forward-algorithm score minus the score of the gold `tags`."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence): 
        """Tag `sentence` (tensor of word indices); returns (path score, list of tag indices)."""
        # print("Forward called!")
        # Get the  scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_vector, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# start from the pretrained fastText vectors
model.word_embeds.weight.data.copy_(embeddings_fasttext)
# training stops early once the mean per-sentence loss of an epoch drops below this
loss_threshold = 0.1
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(sentence_tags[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_vector[t] for t in sentence_tags[0][1]], dtype=torch.long)
    print(model(precheck_sent))

for epoch in range(100):
    start_time = time.time()
    epoch_loss = 0.0

    # one optimisation step per sentence
    for sentence, tags in sentence_tags:
        # Pytorch accumulates gradients.
        model.zero_grad()

        # create tensor of sentences
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_vector[t] for t in tags], dtype=torch.long)

        loss = model.neg_log_likelihood(sentence_in, targets)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report and stop on the mean loss of the whole epoch. The loss of the
    # last sentence alone (used previously) is noisy and can end training
    # because a single easy sentence happened to come last.
    mean_loss = epoch_loss / max(len(sentence_tags), 1)

    # calc time to each epoch
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print("Loss:", mean_loss)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    if mean_loss < loss_threshold:
        break
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(sentence_tags[0][0], word_to_ix)
    print(model(precheck_sent))

In [0]:
# TEXT.vocab.vectors.size()

In [0]:
val_sentence_tags, val_words, val_sentences, tag_val_all, words_val_all = getWordNERList(data_val)

In [0]:
val_words_sent = [[word for word in sentence.split()] for sentence in val_sentences]

In [0]:
all_original_tags = [[tag_to_vector[tag] for tag in sentence[1]] for sentence in val_sentence_tags]

In [0]:
LENGTH_VAL = len(val_sentence_tags)
print(len(tag_val_all))

In [0]:
print(all_original_tags[0])
print(sentence_tags[0][0])

In [0]:
# prediction: decode every validation sentence and keep it next to its gold tags
compare_list = []
total = 0
model.eval()
with torch.no_grad():
  for idx in range(LENGTH_VAL):
    val_tokens = val_sentence_tags[idx][0]
    val_prediction = model(prepare_sequence(val_tokens, word_to_ix))
    compare_list.append((val_prediction, all_original_tags[idx]))
    # running count of validation tokens
    total += len(val_sentences[idx].split())

In [0]:
from sklearn.metrics import f1_score, precision_score, plot_precision_recall_curve, recall_score

In [0]:
predictions = [pred[1] for pred, orig in compare_list]

In [0]:
# flatten all tags into two aligned token-level lists
tag_pairs = [(gold, guess)
             for gold_seq, guess_seq in zip(all_original_tags, predictions)
             for gold, guess in zip(gold_seq, guess_seq)]
final_true = [gold for gold, _ in tag_pairs]
final_pred = [guess for _, guess in tag_pairs]

In [0]:
# macro-averaged token-level scores over all tags
print("Final metric: ")
for label, metric in (("F1-score:", f1_score),
                      ("Precision:", precision_score),
                      ("Recall:", recall_score)):
  print(label, metric(final_true, final_pred, zero_division=1, average='macro'))

In [0]:
# compare_length = len(compare_list)
# f1_acc = 0
# prec_acc = 0
# recall_acc = 0

# # calculating accuracy for each sentence and averaging it 
# for pred, y in compare_list:
#   f1_acc += f1_score(y, pred[1], zero_division=1, average='macro')
#   prec_acc += precision_score(y, pred[1], zero_division=1, average='macro')
#   recall_acc += recall_score(y, pred[1], zero_division=1, average='macro')
# print(f1_acc/compare_length)
# print(prec_acc/compare_length)
# print(recall_acc/compare_length)