In [1]:
# Colab magic: select the TensorFlow 1.x runtime before TensorFlow is imported.
%tensorflow_version 1.x
import tensorflow as tf
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow 1.x selected.
Found GPU at: /device:GPU:0


In [2]:
# Mount Google Drive at /content/drive/; every dataset and model path used below lives under it.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.7MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [0]:
import ast
import csv
import json
import os
import re

import numpy as np
import pandas as pd
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

import sklearn_crfsuite
from itertools import chain

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Model, Sequential
#import transformers as ppb
import torch

import warnings
warnings.filterwarnings('ignore')

In [0]:
# Work from the project folder on Drive; the relative ./clinical-BioBERT/run_ner.py path used later resolves against it.
os.chdir("/content/drive/My Drive/MIDS/W266/")
!ls

# Prepare dataset

Dataset from: http://diego.asu.edu/Publications/ADRMine.html

Helper functions

In [0]:
#punctuation characters that are split off into their own tokens
_PUNCTUATION_TO_TOKENIZE = "…!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t"


def _normalize_text(text):
  """Remove newlines and quotes from `text` and pad punctuation with spaces.

  The same cleaning is applied to tweets and to ADR mentions so that their
  tokens can be compared one to one.
  """
  text = re.sub(r'\n', ' ', text) #remove newlines
  text = re.sub(r'"|”', '', text) #remove double quotes
  text = re.sub(r"'|’", '', text) #remove single quotes / apostrophes
  #add spaces in between punctuations
  return re.sub(r'(['+_PUNCTUATION_TO_TOKENIZE+'])', r' \1 ', text)


def annotate_tweets(data_ids, data_ann, i):
  """Tokenize the i-th tweet and tag every token with a B/I/O label.

  Args:
    data_ids: DataFrame with at least the columns "tweet" and "id".
    data_ann: DataFrame of annotations with at least the columns "id" and "adr".
    i: zero-based row position of the tweet in `data_ids`. The row is taken
      positionally (`.iloc`), so the frame does not need a default index.

  Returns:
    A list with one "token<TAB>label" string per token of the tweet, as
    produced by `assign_bio`.
  """
  #split tweet into a list of words and punctuation (punctuation is kept as tokens)
  tweet = _normalize_text(data_ids["tweet"].iloc[i])
  seq = text_to_word_sequence(tweet, filters='', split=' ', lower=False) #convert tweet to list

  #find the adrs associated with the tweet from the annotation file
  tweet_id = data_ids["id"].iloc[i]
  adrs = data_ann[data_ann["id"]==tweet_id]["adr"].to_list()
  #tokenize each adr the same way, but drop punctuation tokens from the mention
  adrs_list = [
      text_to_word_sequence(_normalize_text(adr), filters=_PUNCTUATION_TO_TOKENIZE, split=' ', lower=False)
      for adr in adrs
  ]

  #build the ner labels
  ner_annotation = [assign_bio(s, adrs_list) for s in seq]

  return(ner_annotation)

In [0]:
def assign_bio(word, adrs_list):
  """Return `word` followed by a tab and its B/I/O tag.

  The ADR token lists are scanned in order; the first token that equals
  `word` (case-insensitively) decides the tag: 'B' when it is the first
  token of its ADR, 'I' otherwise. Without any match the tag is 'O'.
  """
  matching_positions = (
      position
      for adr_tokens in adrs_list
      for position, adr_token in enumerate(adr_tokens)
      if word.lower()==adr_token.lower()
  )
  first_match = next(matching_positions, None)
  if first_match is None:
    return(word+'\tO')
  if first_match==0:
    return(word+'\tB')
  return(word+'\tI')

Training data

In [0]:
DATA_DIR = "/content/drive/My Drive/MIDS/W266/ner_tweet_dataset"

In [9]:
#load training data
train_ids =  pd.read_csv(DATA_DIR+'/train_ids_valid.csv')
train_ids.head()

Unnamed: 0,index,tweet_id,user_id,id,tweet,url,adr_present
0,1,344616533332467713,173701851,vyvanse-51b7c4c75378b9555a2f1ab8,"Vyvanse, commonly known as OCD in a pill.",http://twitter.com/173701851/status/3446165333...,1
1,2,344630199217958912,385562257,vyvanse-51b7d1815378b9555a2f1b0c,@MTV When are you going to do True Life: I am ...,http://twitter.com/385562257/status/3446301992...,1
2,3,342117282585133056,1110475417,cymbalta-51aead545378f924d02efd7c,@upasbook Great read as always. I was on Cymba...,http://twitter.com/1110475417/status/342117282...,1
3,9,351446999637299200,1061562434,seroquel-51d09e2a53785f584a9ae686,I wonder if seroquel /then/ food still equals ...,http://twitter.com/1061562434/status/351446999...,1
4,12,348262745730338817,97567146,rivaroxaban-51c5089353785f584a9a91e5,Rivaroxaban diary day 22. Last tablet taken Tu...,http://twitter.com/97567146/status/34826274573...,1


In [10]:
train_ids.shape[0]

712

In [17]:
#load the training annotations; the TSV has no header row, so the columns are named here
train_ann = pd.read_csv(
    DATA_DIR+'/train_tweet_annotations.tsv',
    sep='\t',
    lineterminator='\n',
    header=None,
)
train_ann.columns = ["id","start","end","type","adr","drug1","drug2"]
#keep only the rows whose type is ADR
train_ann = train_ann.loc[train_ann["type"].eq("ADR")]
train_ann.head()

Unnamed: 0,id,start,end,type,adr,drug1,drug2
0,baclofen-518bf599ac6ab35b4d48099f,60,66,ADR,tired,baclofen,baclofen
1,baclofen-518bf599ac6ab35b4d48099f,67,73,ADR,sleepy,baclofen,baclofen
2,baclofen-518bf599ac6ab35b4d48099f,74,77,ADR,fog,baclofen,baclofen
3,baclofen-51905f6cac6ab35b4d483a24,69,75,ADR,gorked,baclofen,baclofen
4,baclofen-51ae70ba5378f924d02efb33,23,47,ADR,make me a bigger asshole,baclofen,baclofen


In [0]:
#annotate train tweets with ner labels (one list of "token<TAB>label" strings per tweet)
num_data_ids = train_ids.shape[0]
train_annotations = [annotate_tweets(train_ids, train_ann, i) for i in range(num_data_ids)]

In [0]:
#save full train dataset to csv file
#the per-tweet label lists have different lengths, so they are wrapped in a Series
#(one list per row) instead of going through np.array, which rejects ragged input
full_train_tweet_ids_ner = pd.concat([train_ids, pd.Series(train_annotations, name='ner')], axis=1)
full_train_tweet_ids_ner.to_csv(DATA_DIR+'/full_train_tweet_ids_ner.csv', index=False)

In [22]:
#stratified 80/20 split of the annotated tweets into train and dev, each saved to its own csv file
train_tweet_ids_ner, dev_tweet_ids_ner = train_test_split(
    full_train_tweet_ids_ner,
    test_size=0.2,
    random_state=0,
    stratify=full_train_tweet_ids_ner["adr_present"],
)
train_tweet_ids_ner.to_csv(DATA_DIR+'/train_tweet_ids_ner.csv', index=False)
dev_tweet_ids_ner.to_csv(DATA_DIR+'/dev_tweet_ids_ner.csv', index=False)
len(train_tweet_ids_ner), len(dev_tweet_ids_ner)

(569, 143)

In [23]:
train_tweet_ids_ner['adr_present'].value_counts()

1    288
0    281
Name: adr_present, dtype: int64

In [24]:
dev_tweet_ids_ner['adr_present'].value_counts()

1    72
0    71
Name: adr_present, dtype: int64

In [0]:
#save full train ner labels to tsv file
#one "token<TAB>label" entry per line and a blank line after every tweet; the lines are
#written directly so each one ends in a plain '\n' (csv.writer would terminate rows with '\r\n')
with open(DATA_DIR+'/full_train_tweet_ner_labels.tsv', 'w', newline='\n', encoding='utf-8') as f_output:
  for annot in train_annotations:
    f_output.write('\n'.join(annot)+'\n\n')

In [0]:
#split into train and dev dataset ner labels to tsv file
#one "token<TAB>label" entry per line and a blank line after every tweet; the lines are
#written directly so each one ends in a plain '\n' (csv.writer would terminate rows with '\r\n')
for split_file, split_frame in (('/train.tsv', train_tweet_ids_ner), ('/dev.tsv', dev_tweet_ids_ner)):
  with open(DATA_DIR+split_file, 'w', newline='\n', encoding='utf-8') as f_output:
    for annot in split_frame['ner'].to_list():
      f_output.write('\n'.join(annot)+'\n\n')

In [27]:
train_ner = pd.read_csv(DATA_DIR+'/train.tsv', delimiter='\t', header=None)
train_ner[1].value_counts()

O    11627
B      348
I      345
Name: 1, dtype: int64

In [28]:
train_ner[1].value_counts()/train_ner[1].value_counts().sum()

O    0.943750
B    0.028247
I    0.028003
Name: 1, dtype: float64

In [29]:
train_ner.shape[0]

12320

In [30]:
dev_ner = pd.read_csv(DATA_DIR+'/dev.tsv', delimiter='\t', header=None)
dev_ner[1].value_counts()

O    3054
I      93
B      88
Name: 1, dtype: int64

In [31]:
dev_ner[1].value_counts()/dev_ner[1].value_counts().sum()

O    0.944049
I    0.028748
B    0.027202
Name: 1, dtype: float64

In [32]:
dev_ner.shape[0]

3235

Test data

In [33]:
#load test data
test_ids =  pd.read_csv(DATA_DIR+'/test_ids_valid.csv')
test_ids.shape[0]

253

In [34]:
test_ids['adr_present'].value_counts()

1    140
0    113
Name: adr_present, dtype: int64

In [35]:
#load the test annotations; the TSV has no header row, so the columns are named here
test_ann = pd.read_csv(
    DATA_DIR+'/test_tweet_annotations.tsv',
    sep='\t',
    lineterminator='\n',
    header=None,
)
test_ann.columns = ["id","start","end","type","adr","drug1","drug2"]
#keep only the rows whose type is ADR
test_ann = test_ann.loc[test_ann["type"].eq("ADR")]
test_ann.head()

Unnamed: 0,id,start,end,type,adr,drug1,drug2
0,avelox-51c3e5a853785f584a9a8c01,76,93,ADR,connective tissue,avelox,avelox
1,avelox-51c3e5a853785f584a9a8c01,94,99,ADR,lungs,avelox,avelox
2,avelox-51c3e5a853785f584a9a8c01,104,111,ADR,thyroid,avelox,avelox
3,baclofen-51b35e355378b9555a2f0709,78,84,ADR,drowsy,baclofen,baclofen
4,baclofen-51b35e355378b9555a2f0709,110,114,ADR,high,baclofen,baclofen


In [0]:
#annotate test tweets with ner labels (one list of "token<TAB>label" strings per tweet)
num_data_ids = test_ids.shape[0]
test_annotations = [annotate_tweets(test_ids, test_ann, i) for i in range(num_data_ids)]

In [0]:
#save test dataset to csv file
#the per-tweet label lists have different lengths, so they are wrapped in a Series
#(one list per row) instead of going through np.array, which rejects ragged input
test_tweet_ids_ner = pd.concat([test_ids, pd.Series(test_annotations, name='ner')], axis=1)
test_tweet_ids_ner.to_csv(DATA_DIR+'/test_tweet_ids_ner.csv', index=False)

In [0]:
#save test ner labels to tsv file
#one "token<TAB>label" entry per line and a blank line after every tweet; the lines are
#written directly so each one ends in a plain '\n' (csv.writer would terminate rows with '\r\n')
with open(DATA_DIR+'/test.tsv', 'w', newline='\n', encoding='utf-8') as f_output:
  for annot in test_annotations:
    f_output.write('\n'.join(annot)+'\n\n')

In [39]:
test_ner = pd.read_csv(DATA_DIR+'/test.tsv', delimiter='\t', header=None)
test_ner[1].value_counts()

O    5081
B     160
I     117
Name: 1, dtype: int64

In [40]:
test_ner[1].value_counts()/test_ner[1].value_counts().sum()

O    0.948302
B    0.029862
I    0.021837
Name: 1, dtype: float64

In [41]:
test_ner.shape[0]

5358

# Fine tune with BERT

NER task code adapted from: https://github.com/arnavbhandari/clinical-BioBERT

In [0]:
# Directory of the pre-trained BERT checkpoint (vocab.txt, bert_config.json, bert_model.ckpt.*).
BERT_BASE_DIR="/content/drive/My Drive/MIDS/W266/W266/model/cased_L-12_H-768_A-12"
# Dataset folder; same value as the DATA_DIR set in the data-preparation section.
DATA_DIR="/content/drive/My Drive/MIDS/W266/ner_tweet_dataset"
# Output directory passed to run_ner.py for the fine-tuned model and its predictions.
TRAINED_CLASSIFIER="/content/drive/My Drive/MIDS/W266/trained_models/ner/cased_epoch3"

In [0]:
# Create the output directory (and any missing parents); a no-op when it already exists.
os.makedirs(TRAINED_CLASSIFIER, exist_ok=True)

In [390]:
!ls "$BERT_BASE_DIR"

bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta


In [224]:
!ls "$DATA_DIR"

BIO_labels			 test.tsv
BO_labels			 test_tweet_annotations.tsv
dev.tsv				 test_tweet_ids_ner.csv
dev_tweet_ids_ner.csv		 train_ids_valid.csv
dev_tweet_ner_labels.tsv	 train.tsv
full_train_tweet_ids_ner.csv	 train_tweet_annotations.tsv
full_train_tweet_ner_labels.tsv  train_tweet_ids_ner.csv
test_ids_valid.csv		 train_tweet_ner_labels.tsv


In [0]:
!ls "$TRAINED_CLASSIFIER"

In [0]:
# Run the adapted run_ner.py script with training, evaluation and prediction enabled,
# initialised from the checkpoint in BERT_BASE_DIR, reading data from DATA_DIR and
# writing its outputs to TRAINED_CLASSIFIER.
!python ./clinical-BioBERT/run_ner.py \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --do_lower_case=false \
  --vocab_file="$BERT_BASE_DIR/vocab.txt" \
  --bert_config_file="$BERT_BASE_DIR/bert_config.json" \
  --init_checkpoint="$BERT_BASE_DIR/bert_model.ckpt" \
  --num_train_epochs=3 \
  --data_dir="$DATA_DIR" \
  --output_dir="$TRAINED_CLASSIFIER"

Calculate BERT results

In [0]:
# Directory whose token_test.txt / label_test.txt are evaluated below.
# NOTE(review): this is a different run directory (uncased_epoch10) from the one the training
# cell above writes to (cased_epoch3) — confirm this is the run that should be analysed.
TRAINED_CLASSIFIER = "/content/drive/My Drive/MIDS/W266/trained_models/ner/uncased_epoch10"

In [0]:
# Test-set tokens written by the NER run, one per line, with surrounding whitespace removed.
with open(TRAINED_CLASSIFIER+"/token_test.txt") as f:
  tokens_raw = [line.strip() for line in f]

In [0]:
# Predicted labels written by the NER run, one per line, with surrounding whitespace removed.
with open(TRAINED_CLASSIFIER+"/label_test.txt") as f:
  predict_raw = [line.strip() for line in f]

In [0]:
# Pair every token (column 0) with its predicted label (column 1).
predict = pd.DataFrame([[tokens_raw[idx], predict_raw[idx]] for idx in range(len(tokens_raw))])
# Drop the rows labelled with the [CLS] / [SEP] markers.
predict = predict[~predict[1].isin(['[CLS]', '[SEP]'])]
# Drop every token containing '##' so that only one row per original word remains.
predict = predict[['##' not in token for token in predict[0]]]
predict = predict.reset_index(drop=True)

In [0]:
# Gold test labels: column 0 is the word, column 1 the tag.
labels = pd.read_csv(DATA_DIR+"/test.tsv", sep="\t", lineterminator='\n', header=None)
# Rows without a tag are the separator lines between tweets.
labels = labels[labels[1].notnull()]
# Remove stray whitespace (e.g. a trailing carriage return) from the tags.
labels[1] = labels[1].map(lambda tag: tag.strip())
labels = labels.reset_index(drop=True)

In [496]:
labels.shape == predict.shape

True

In [497]:
labels[1].value_counts()

O    5081
B     160
I     117
Name: 1, dtype: int64

In [498]:
predict[1].value_counts()

O    5117
B     153
I      88
Name: 1, dtype: int64

In [499]:
predict[predict[1]=='X']

Unnamed: 0,0,1


In [0]:
#if there were any predictions as X, change them to O
#only the label column (1) is rewritten; the token column (0) of those rows stays untouched
predict.loc[predict[1]=='X', 1] = 'O'

In [501]:
#check that every predicted token is contained in the ground-truth word at the same position
for i in range(predict.shape[0]):
  predicted_token = predict[0][i].lower()
  truth_word = labels[0][i].lower()
  if predicted_token not in truth_word:
    #report the position together with both rows
    print(i)
    print(predict[0][i], predict[1][i])
    print(labels[0][i], labels[1][i])
    print("")
print("check completed")

check completed


In [0]:
# Side-by-side table of ground truth (word, truth) and model output (token, predict).
results = pd.concat([labels, predict], axis=1)
results.columns = ['word','truth','token','predict']
# 1 where the predicted label differs from the true label, 0 otherwise.
results["mismatch"] = np.where(results['truth'] != results['predict'], 1, 0)

In [503]:
#test f-score
np.round(f1_score(results['truth'], results['predict'], average="macro"),3)

0.72

In [506]:
#test accuracy
np.round(accuracy_score(results['truth'], results['predict']),3)

0.965

In [507]:
# The predictions are passed as the first argument, so the rows of cm correspond to
# predicted labels and the columns to true labels, as named in the table below.
# NOTE(review): the hard-coded names assume the label order B, I, O — confirm.
cm = confusion_matrix(results['predict'], results['truth'])
# Show every cell as a percentage of all tokens.
pd.DataFrame((cm/cm.sum())*100, index=["pred B","pred I", "pred O"], columns=["true B","true I", "true O"]).round(2).astype(str).add('%')

Unnamed: 0,true B,true I,true O
pred B,2.07%,0.19%,0.6%
pred I,0.06%,0.9%,0.69%
pred O,0.86%,1.1%,93.54%


In [508]:
pd.DataFrame(cm, index=["pred B","pred I", "pred O"], columns=["true B","true I", "true O"])

Unnamed: 0,true B,true I,true O
pred B,111,10,32
pred I,3,48,37
pred O,46,59,5012


Error analysis


In [0]:
#combine B and I into one category (every 'I' becomes 'B', other labels are kept)
results["truth2"] = results['truth'].where(results['truth'] != 'I', 'B')
results["predict2"] = results['predict'].where(results['predict'] != 'I', 'B')

In [510]:
cm2 = confusion_matrix(results['predict2'], results['truth2'])
pd.DataFrame((cm2/cm2.sum())*100, index=["pred B/I","pred O"], columns=["true B/I", "true O"]).round(2).astype(str).add('%')

Unnamed: 0,true B/I,true O
pred B/I,3.21%,1.29%
pred O,1.96%,93.54%


In [511]:
pd.DataFrame(cm2, index=["pred B/I", "pred O"], columns=["true B/I", "true O"])

Unnamed: 0,true B/I,true O
pred B/I,172,69
pred O,105,5012


In [0]:
# Word counts for tokens that are B/I in the ground truth and predicted as B/I.
# The output folder is derived from TRAINED_CLASSIFIER (instead of repeating the absolute
# path) and created on demand so the export does not fail on a fresh run directory.
error_analysis_dir = TRAINED_CLASSIFIER+"/error_analysis"
os.makedirs(error_analysis_dir, exist_ok=True)
tB_pB = pd.DataFrame(results[(results["truth2"]=='B')&(results["predict2"]=='B')]["word"].value_counts())
tB_pB.to_csv(error_analysis_dir+"/tB_pB.csv")

In [0]:
# Word counts for tokens that are B/I in the ground truth but predicted as O.
# The output path is derived from TRAINED_CLASSIFIER instead of repeating the absolute path.
tB_pO = pd.DataFrame(results[(results["truth2"]=='B')&(results["predict2"]=='O')]["word"].value_counts())
tB_pO.to_csv(TRAINED_CLASSIFIER+"/error_analysis/tB_pO.csv")

In [0]:
# Word counts for tokens that are O in the ground truth but predicted as B/I.
# The output path is derived from TRAINED_CLASSIFIER instead of repeating the absolute path.
tO_pB = pd.DataFrame(results[(results["truth2"]=='O')&(results["predict2"]=='B')]["word"].value_counts())
tO_pB.to_csv(TRAINED_CLASSIFIER+"/error_analysis/tO_pB.csv")

In [0]:
# Word counts for tokens that are O in the ground truth and predicted as O.
# The output path is derived from TRAINED_CLASSIFIER instead of repeating the absolute path.
tO_pO = pd.DataFrame(results[(results["truth2"]=='O')&(results["predict2"]=='O')]["word"].value_counts())
tO_pO.to_csv(TRAINED_CLASSIFIER+"/error_analysis/tO_pO.csv")

In [0]:
# "word label" string for every training token, used for the frequency tables below.
train_ner["combine"] = [word+" "+tag for word, tag in zip(train_ner[0], train_ner[1])]

In [0]:
# Frequency of every "word label" pair among the training tokens labelled O.
# The output path is derived from TRAINED_CLASSIFIER instead of repeating the absolute path.
training_O = train_ner[train_ner[1]=="O"]["combine"].value_counts()
training_O.to_csv(TRAINED_CLASSIFIER+"/error_analysis/training_O.csv")

In [0]:
# Frequency of every "word label" pair among the training tokens labelled B or I.
# The output path is derived from TRAINED_CLASSIFIER instead of repeating the absolute path.
training_BI = train_ner[train_ner[1]!="O"]["combine"].value_counts()
training_BI.to_csv(TRAINED_CLASSIFIER+"/error_analysis/training_BI.csv")

In [0]:
test_tweet_ids_ner = pd.read_csv(DATA_DIR+'/test_tweet_ids_ner.csv')

In [0]:
def restore_string_to_list(data):
  """Rebuild the per-tweet word lists from the stringified 'ner' column.

  After a CSV round trip every 'ner' cell is the text form of a Python list of
  "word<TAB>label" strings. Each cell is parsed with `ast.literal_eval`, which
  decodes escape sequences and handles either quote style, and the label part
  of every entry is dropped.

  Args:
    data: DataFrame with a 'ner' column holding those stringified lists.

  Returns:
    A list with one list of words per row of `data`, in row order.
  """
  all_str_to_list = []
  for serialized in data['ner']:
    labelled_tokens = ast.literal_eval(serialized)
    #keep only the word, i.e. everything before the first tab
    all_str_to_list.append([entry.split('\t', 1)[0] for entry in labelled_tokens])
  return(all_str_to_list)

In [0]:
test_tweet_ids_ner["ner"] = restore_string_to_list(test_tweet_ids_ner)

In [524]:
test_tweet_ids_ner["ner"]

0      [@, catthoma, Right, ,, some, SSRIs, are, used...
1      [@, dsymons, @, MelissaDee, _, I, was, only, o...
2      [great, ., ., I, have, hives, up, and, down, m...
3      [Taking, trazodone, and, literally, passing, o...
4      [@, Joanne, _, _, Howe, enbrel, is, a, miracle...
                             ...                        
248    [YOUNG, NIGGA, JUST, GAVE, ME, A, 70MG, VYVANS...
249    [03, ., 25, day, 13, Rivaroxaban, diary, ., Ne...
250    [@, kitschmagnet, I, was, on, tysabri, for, 50...
251    [@, craigsdaughter, no, joke, !, #, tysabri, w...
252    [@, Jules, _, Clarke, banana, ?, Hot, milk, ?,...
Name: ner, Length: 253, dtype: object

In [0]:
def match_tweet_results(data, results):
  """Cut the flat per-token `results` table back into per-tweet pieces.

  Consecutive windows of `results` are taken, each as long as the matching
  tweet's token list in data["ner"].

  Returns:
    A tuple of three lists with one entry per tweet: the number of tokens,
    the sum of the "mismatch" column over the tweet's rows, and the list of
    "word truth predict" strings for the tweet's rows.
  """
  tweet_lengths, ner_mismatches, ner_result_compares = [], [], []
  window_start = 0

  for row_number in range(data.shape[0]):
    n_tokens = len(data["ner"][row_number])
    window_stop = window_start + n_tokens
    tweet_rows = results[window_start:window_stop]

    tweet_lengths.append(n_tokens)
    ner_mismatches.append(tweet_rows["mismatch"].sum())
    ner_result_compares.append([
        row["word"]+' '+row["truth"]+' '+row["predict"]
        for _, row in tweet_rows.iterrows()
    ])

    #the next tweet starts where this one ended
    window_start = window_stop

  return(tweet_lengths, ner_mismatches, ner_result_compares)

In [0]:
tweet_lengths, ner_mismatches, ner_result_compares = match_tweet_results(test_tweet_ids_ner, results)

In [0]:
test_tweet_ids_ner["tweet_lengths"] = tweet_lengths
test_tweet_ids_ner["ner_mismatches"] = ner_mismatches
test_tweet_ids_ner["ner_result_compares"] = ner_result_compares

In [528]:
#number of tweets with no mismatches
test_tweet_ids_ner[test_tweet_ids_ner['ner_mismatches']==0].shape[0]

171

In [529]:
#number of tweets with at least one mismatch
test_tweet_ids_ner[test_tweet_ids_ner['ner_mismatches']>0].shape[0]

82

In [530]:
#number of ner mismatches
test_tweet_ids_ner['ner_mismatches'].sum()

187

# CRF baseline model

code adapted from: https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

In [0]:
def _neighbour_features(offset_tag, neighbour):
    """Features of an adjacent token, keyed with the given offset prefix ('-1' or '+1')."""
    return {
        offset_tag + ':word.lower()': neighbour.lower(),
        offset_tag + ':word.istitle()': neighbour.istitle(),
        offset_tag + ':word.isupper()': neighbour.isupper(),
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position `i` of `sent`.

    The dict describes the token itself (lower-cased form, last three and
    last two characters, case and digit flags) plus the lower-cased form and
    case flags of the previous and next tokens. 'BOS' / 'EOS' are set instead
    when the token has no previous / next neighbour.
    """
    token = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    has_previous = i > 0
    if has_previous:
        features.update(_neighbour_features('-1', sent[i-1]))
    else:
        features['BOS'] = True

    has_next = i < len(sent)-1
    if has_next:
        features.update(_neighbour_features('+1', sent[i+1]))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the feature dicts of all tokens of `sent`, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

In [0]:
def get_label_list(sentences, label_table):
  """Split the flat label column into one list of labels per sentence.

  Column 1 of `label_table` is consumed in consecutive windows, each as long
  as the matching entry of `sentences`.

  Returns:
    A list with one list of labels per sentence, in order.
  """
  labels = []
  offset = 0

  for row_number in range(sentences.shape[0]):
    next_offset = offset + len(sentences[row_number])
    labels.append(label_table[1][offset:next_offset].to_list())
    #the next sentence starts where this one ended
    offset = next_offset

  return(labels)

In [0]:
# Gold training labels: column 0 is the word, column 1 the tag.
train_labels = pd.read_csv(DATA_DIR+"/train.tsv", sep="\t", lineterminator='\n', header=None)
# Rows without a tag are the separator lines between tweets.
train_labels = train_labels[train_labels[1].notnull()]
# Remove stray whitespace (e.g. a trailing carriage return) from the tags.
train_labels[1] = train_labels[1].map(lambda tag: tag.strip())
train_labels = train_labels.reset_index(drop=True)

In [0]:
train_tweet_ids_ner = pd.read_csv(DATA_DIR+'/train_tweet_ids_ner.csv')
train_tweet_ids_ner["ner"] = restore_string_to_list(train_tweet_ids_ner)
train_tweet_ids_ner["label"] = get_label_list(train_tweet_ids_ner["ner"], train_labels)

In [0]:
# One list of feature dicts and one list of labels per training tweet.
train_X = [sent2features(tweet_tokens) for tweet_tokens in train_tweet_ids_ner["ner"]]
train_y = train_tweet_ids_ner["label"].tolist()

In [0]:
# Gold test labels: column 0 is the word, column 1 the tag.
test_labels = pd.read_csv(DATA_DIR+"/test.tsv", sep="\t", lineterminator='\n', header=None)
# Rows without a tag are the separator lines between tweets.
test_labels = test_labels[test_labels[1].notnull()]
# Remove stray whitespace (e.g. a trailing carriage return) from the tags.
test_labels[1] = test_labels[1].map(lambda tag: tag.strip())
test_labels = test_labels.reset_index(drop=True)

In [0]:
test_tweet_ids_ner = pd.read_csv(DATA_DIR+'/test_tweet_ids_ner.csv')
test_tweet_ids_ner["ner"] = restore_string_to_list(test_tweet_ids_ner)
test_tweet_ids_ner["label"] = get_label_list(test_tweet_ids_ner["ner"], test_labels)

In [0]:
# One list of feature dicts and one list of labels per test tweet.
test_X = [sent2features(tweet_tokens) for tweet_tokens in test_tweet_ids_ner["ner"]]
test_y = test_tweet_ids_ner["label"].tolist()

In [478]:
# Baseline: a CRF trained on the hand-crafted token features built above.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation coefficients of the
# lbfgs trainer — confirm against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_X, train_y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [0]:
pred_y = crf.predict(test_X)

In [0]:
# Flatten the per-tweet label lists into one token-level sequence each for scoring.
pred_y_flat = list(chain.from_iterable(pred_y))
test_y_flat = list(chain.from_iterable(test_y))

In [481]:
#test f-score
np.round(f1_score(test_y_flat, pred_y_flat, average='macro'),3)

0.502

In [482]:
#test accuracy
np.round(accuracy_score(test_y_flat, pred_y_flat),3)

0.951

In [31]:
len(test_X[0])

26

In [35]:
#example of features for CRF
test_X[0][0]

{'+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'catthoma',
 'BOS': True,
 'bias': 1.0,
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': '@',
 'word[-2:]': '@',
 'word[-3:]': '@'}