In [1]:
import re
import numpy as np
import pandas as pd
import csv
import sys
import codecs

import tensorflow as tf
from tensorflow.keras.preprocessing.text import text_to_word_sequence

import warnings
warnings.filterwarnings('ignore')

# Helper functions

In [2]:
def annotate_tweets(data_ids, data_ann, i):
  #split tweet into a list of words and punctuation
  tweet = data_ids["tweet"][i]
  tweet = re.sub(r'\n', ' ', tweet) #remove newlines from tweets
  tweet = re.sub(r'"', '', tweet) #remove quotes from tweets
  to_tokenize = '!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t'
  tweet = re.sub(r'(['+to_tokenize+'])', r' \1 ', tweet) #add spaces in between punctuations
  seq = text_to_word_sequence(tweet, filters='', split=' ', lower=False) #convert tweet to list

  #find the adr associated with the tweet from the annotation file
  tweet_id = data_ids["id"][i]
  adrs = data_ann[data_ann["id"]==tweet_id]["adr"].tolist()
  adrs_list = []
  for adr in adrs:
    adr = re.sub(r'\n', ' ', adr) #remove newlines from tweets
    to_tokenize = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t'
    adr = re.sub(r'(['+to_tokenize+'])', r' \1 ', adr) #add spaces in between punctuations
    adr = text_to_word_sequence(adr, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t', split=' ', lower=False)
    adrs_list.append(adr)

  #build the ner labels
  ner_annotation = []
  for s in seq:
    ner_annotation.append(assign_bio(s, adrs_list))
  
  return(ner_annotation)

In [3]:
def assign_bio(word, adrs_list):
  for adrs in adrs_list:
    for i,adr in enumerate(adrs):
      if word.lower()==adr.lower():
        if i==0:
          return(word+'\tB')
        else:
          return(word+'\tI')
  return(word+'\tO')

# Training data

In [4]:
DATA_DIR = "../data/twitter_ner/ner_tweet_dataset/"
OUTPUT_DIR = "../datasets/NER/"

In [5]:
#load training data
train_ids =  pd.read_csv(DATA_DIR+'/train_ids_valid.csv')
train_ids.head()

Unnamed: 0,index,tweet_id,user_id,id,tweet,url,adr_present
0,1,344616533332467713,173701851,vyvanse-51b7c4c75378b9555a2f1ab8,"Vyvanse, commonly known as OCD in a pill.",http://twitter.com/173701851/status/3446165333...,1
1,2,344630199217958912,385562257,vyvanse-51b7d1815378b9555a2f1b0c,@MTV When are you going to do True Life: I am ...,http://twitter.com/385562257/status/3446301992...,1
2,3,342117282585133056,1110475417,cymbalta-51aead545378f924d02efd7c,@upasbook Great read as always. I was on Cymba...,http://twitter.com/1110475417/status/342117282...,1
3,9,351446999637299200,1061562434,seroquel-51d09e2a53785f584a9ae686,I wonder if seroquel /then/ food still equals ...,http://twitter.com/1061562434/status/351446999...,1
4,12,348262745730338817,97567146,rivaroxaban-51c5089353785f584a9a91e5,Rivaroxaban diary day 22. Last tablet taken Tu...,http://twitter.com/97567146/status/34826274573...,1


In [6]:
train_ids['adr_present'].value_counts()

1    360
0    352
Name: adr_present, dtype: int64

In [7]:
train_ann =  pd.read_csv(DATA_DIR+'/train_tweet_annotations.tsv', delimiter='\t', lineterminator='\n', header=None)
train_ann.columns = ["id","start","end","type","adr","drug1","drug2"]
train_ann.head()

Unnamed: 0,id,start,end,type,adr,drug1,drug2
0,baclofen-518bf599ac6ab35b4d48099f,60,66,ADR,tired,baclofen,baclofen\r
1,baclofen-518bf599ac6ab35b4d48099f,67,73,ADR,sleepy,baclofen,baclofen\r
2,baclofen-518bf599ac6ab35b4d48099f,74,77,ADR,fog,baclofen,baclofen\r
3,baclofen-51905f6cac6ab35b4d483a24,69,75,ADR,gorked,baclofen,baclofen\r
4,baclofen-51ae70ba5378f924d02efb33,23,47,ADR,make me a bigger asshole,baclofen,baclofen\r


In [8]:
#annotate train tweets with ner labels
train_annotations = []
num_data_ids = train_ids.shape[0]
for i in range(num_data_ids):
  train_annotations.append(annotate_tweets(train_ids, train_ann, i))

In [11]:
#save full train dataset to csv file
full_train_tweet_ids_ner = pd.concat([train_ids, pd.DataFrame(np.array(train_annotations),columns=['ner'])], axis=1)
full_train_tweet_ids_ner.to_csv(DATA_DIR+'/full_train_tweet_ids_ner.csv', index=False)

In [12]:
#split into train dataset to csv file
train_tweet_ids_ner = full_train_tweet_ids_ner
train_tweet_ids_ner.to_csv(DATA_DIR+'train_tweet_ids_ner.csv', index=False)

In [13]:
train_annotations

[['Vyvanse\tO',
  ',\tO',
  'commonly\tO',
  'known\tO',
  'as\tO',
  'OCD\tB',
  'in\tO',
  'a\tO',
  'pill\tO',
  '.\tO'],
 ['@\tO',
  'MTV\tO',
  'When\tO',
  'are\tO',
  'you\tO',
  'going\tO',
  'to\tO',
  'do\tO',
  'True\tO',
  'Life\tO',
  ':\tO',
  'I\tO',
  'am\tO',
  'addicted\tB',
  'to\tO',
  'Vyvanse\tO',
  '?\tO'],
 ['@\tO',
  'upasbook\tO',
  'Great\tO',
  'read\tO',
  'as\tO',
  'always\tO',
  '.\tO',
  'I\tO',
  'was\tO',
  'on\tO',
  'Cymbalta\tO',
  'for\tO',
  '5\tO',
  'days\tO',
  '.\tO',
  'Cold\tO',
  'turkey\tO',
  'had\tO',
  'sweats\tB',
  ',\tO',
  'migraine\tB',
  ',\tO',
  'tremors\tB',
  'while\tO',
  'on\tO',
  '&\tO',
  '3\tO',
  'days\tO',
  'after\tO',
  '.\tO'],
 ['I\tO',
  'wonder\tO',
  'if\tO',
  'seroquel\tO',
  '/\tO',
  'then\tO',
  '/\tO',
  'food\tO',
  'still\tO',
  'equals\tO',
  'restless\tB',
  'legs\tI'],
 ['Rivaroxaban\tO',
  'diary\tO',
  'day\tO',
  '22\tO',
  '.\tO',
  'Last\tO',
  'tablet\tO',
  'taken\tO',
  'Tuesday\tO',
  '18th\

In [20]:
train_tweet_ids_ner['ner'].tolist()[0]

['Vyvanse\tO',
 ',\tO',
 'commonly\tO',
 'known\tO',
 'as\tO',
 'OCD\tB',
 'in\tO',
 'a\tO',
 'pill\tO',
 '.\tO']

In [37]:
#split into train and dev dataset ner labels to tsv file
with open(OUTPUT_DIR+'train.tsv', 'w', newline='\n', encoding='utf-8') as f_output:
  tsv_output = csv.writer(f_output, delimiter='\n', quoting = csv.QUOTE_NONE, escapechar='\\')
  for annot in train_tweet_ids_ner['ner'].tolist():
    tsv_output.writerow(annot)
    tsv_output.writerow('')

In [38]:
train_ner = pd.read_csv(DATA_DIR+'train_tweet_ner_labels.tsv', delimiter='\t', header=None)
train_ner[1].value_counts()

O    14592
B      504
I      449
Name: 1, dtype: int64

# Load test data

In [None]:
#load test data
test_ids = pd.read_csv(DATA_DIR+'/test_ids_valid.csv')
test_ids.shape[0]

In [None]:
test_ids['adr_present'].value_counts()

In [None]:
test_ann =  pd.read_csv(DATA_DIR+'/test_tweet_annotations.tsv', delimiter='\t', lineterminator='\n', header=None)
test_ann.columns = ["id","start","end","type","adr","drug1","drug2"]
test_ann.head()

In [None]:
#annotate test tweets with ner labels
test_annotations = []
num_data_ids = test_ids.shape[0]
for i in range(num_data_ids):
  test_annotations.append(annotate_tweets(test_ids, test_ann, i))

In [None]:
#save test dataset to csv file
test_tweet_ids_ner = pd.concat([test_ids, pd.DataFrame(np.array(test_annotations),columns=['ner'])], axis=1)
test_tweet_ids_ner.to_csv(DATA_DIR+'test_tweet_ids_ner.csv', index=False)

In [None]:
#save test ner labels to tsv file
with open(OUTPUT_DIR+'test.tsv', 'w', newline='\n', encoding='utf-8') as f_output:
  tsv_output = csv.writer(f_output, delimiter='\n')
  for annot in test_annotations:
    annot = [word.encode(sys.stdout.encoding, errors='replace') for word in annot]
    tsv_output.writerow(annot)
    tsv_output.writerow('')