In [94]:
# imports

import re

%matplotlib inline
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import spacy
from spacymoji import Emoji

import scipy.stats

import sklearn
from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
# from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import pandas as pd

# matplot
plt.style.use('ggplot')

In [95]:
# file read

def conll_read(path: str):
    words = []
    bio_tags = []
    i = 0
    with open(path) as f:
        for line in f:
            i += 1
            splitted = line.strip().split('\t')
            if len(splitted) == 1:
                if splitted[0] == '':
                    words.append('\n')
                    bio_tags.append('\n')
                else:
                    # special case for last line of dev set
                    # maybe we just skip it altogether
                    words.append(splitted[0])
                    bio_tags.append('O')
            else:
                words.append(splitted[0])
                bio_tags.append(splitted[1])
    return words, bio_tags

In [96]:
# calls nltk 
def add_pos_tags(tokens: list):
    tagged_train = nltk.pos_tag(tokens)
    return zip(*tagged_train)

In [97]:
# create the sentences with their features
def create_sents(seqs):
    seperators = [i for i, item in enumerate(seqs) if item[0] == '\n']
    sents = []
    for idx, pos in enumerate(seperators):
        start = seperators[idx - 1] + 1
        end = seperators[idx]
        
        if idx == 0:
            start = 0
            end = pos
    
        sequence = seqs[start: end]
        sents.append(sequence)
    return sents

In [98]:
# train set
words, bio_tags = conll_read('W-NUT_data/wnut17train.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
train_sequences = create_sents(complete)

# dev set
words, bio_tags = conll_read('W-NUT_data/emerging.dev.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
dev_sequences = create_sents(complete)

# test set
words, bio_tags = conll_read('W-NUT_data/emerging.test.annotated')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
test_sequences = create_sents(complete)

In [108]:
# extra features
is_hashtag_regex = re.compile(r"#(\w+)?")
is_mention_regex = re.compile(r"^@(\w+)?")
is_money_regex = re.compile(r"^$(\w+)?")
is_url_regex = re.compile(r"(https?:\/\/(?:www\.|(?!www))|www\.|www\.)")
is_punct_reg = re.compile(r"^[\.\,!\?\"\':;_\-]$")
is_repeated_punct_reg = re.compile(r"^[\.\,!\?\"\':;_\-]{2,}$")
is_first_capital_reg = re.compile(r"^[A-Z][a-z]+")
stop_words_set = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('emoji', first=True)

lemmatizer = WordNetLemmatizer()

def is_hashtag(word: str) -> bool:
    return bool(is_hashtag_regex.match(word))

def is_mention(word: str) -> bool:
    return bool(is_mention_regex.match(word))

def is_money(word: str) -> bool:
    return bool(is_money_regex.match(word))

def is_url(word: str) -> bool:
    return bool(is_url_regex.match(word))

def is_punct(word: str) -> bool:
    return bool(is_punct_reg.match(word))

def is_repeated_punct(word: str) -> bool:
    return bool(is_repeated_punct_reg.match(word))

def is_stopword(word: str) -> bool:
    return word.lower() in stop_words_set

def is_first_capital(word: str) -> bool:
    return bool(is_first_capital_reg.match(word))

def has_emoji(word: str) -> bool:
    doc = nlp(word)
    return doc._.has_emoji

In [109]:
feature_regex = {
                    'is_mention': is_mention_regex,
                    'is_money': is_money_regex,
                    'is_url': is_url_regex,
                    'is_hashtag': is_hashtag_regex,
                    'is_punct': is_punct_reg,
                    'is_repeated_punct': is_repeated_punct_reg,
                    'is_first_capital': is_first_capital_reg,
                }

def for_features(sent, i, features, features_add, context):
    
    for feat in features_add:
        add_in = ''
        word = sent[i][0]
        features = add_feature(word, features, feat, add_in)
        for c in context:
            if c == 0:
                continue
            if c <= i:
                word = sent[i - c][0]
                add_in = f'-{c}:'
                features = add_feature(word, features, feat, add_in)
            if i < (len(sent) - c):
                word = sent[i + c][0]
                add_in = f'+{c}:'
                features = add_feature(word, features, feat, add_in)
        
    return features

def add_feature(word, features, add_feature, context):
    if add_feature == 'has_emoji':
        feature_value = has_emoji(word)
    elif add_feature == 'is_stopword':
        feature_value = is_stopword(word)
    elif add_feature == 'lemma':
        feature_value = lemmatizer.lemmatize(word)
    else:
        reg = feature_regex[add_feature]
        feature_value = bool(reg.match(word))
    
    features.update({f'{context}word.{add_feature}': feature_value})
    return features

In [103]:
def preword2feat(sent, i, extended, extra=None):
    features = word2features(sent, i)
    if extended:
        features = extended_context(features=features, sent=sent, i=i)
    if extra:
        context = extra["context"]
        extra_features = extra["features"]
        features = for_features(sent, i, features, extra_features, context)
    return features

# def add_extra_features(features, sent, i):
#     word = sent[i][0]

#     features.update({
#                         'word.is_hashtag': is_hashtag(word),
#                         'word.is_mention': is_mention(word),
#                         'word.is_money': is_money(word),
#                         'word.is_stopword': is_stopword(word),
#                         'word.is_url': is_url(word),
#                         'word.is_punct': is_punct(word),
#                         'word.is_repeated_punct': is_repeated_punct(word),
#                         'word.is_first_capital': is_first_capital(word),
#                         'word.has_emoji': has_emoji(word),
#                         'word.stem': ps.stem(word),
#                         'word.lemma': lemmatizer.lemmatize(word)
#                     })
#     return features

def extended_context(features, sent, i):

    if i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    return features

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent, extended=False, extra=None):
    return [preword2feat(sent, i, extended=extended, extra=extra) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [8]:
# create X and y for train, dev, test
X_train = [sent2features(s) for s in train_sequences]
y_train = [sent2labels(s) for s in train_sequences]

X_dev = [sent2features(s) for s in dev_sequences]
y_dev = [sent2labels(s) for s in dev_sequences]

X_test = [sent2features(s) for s in test_sequences]
y_test = [sent2labels(s) for s in test_sequences]

In [16]:
def hyper_param_opt(x: list, y: list, labels: list, params: dict, hyper_params: dict):
    crf = sklearn_crfsuite.CRF(
                                algorithm=params["algorithm"],
                                max_iterations=params["max_iter"],
                                all_possible_transitions=params["poss_trans"]
                              )

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)
    
    # search
    rs = RandomizedSearchCV(crf, hyper_params,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer,
                            random_state=1)

    rs.fit(x, y)

    return rs

In [11]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
labels

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.13829694021844366

In [17]:
rs_params = {
                "algorithm": 'lbfgs',
                "max_iter": 100,
                "poss_trans": True
            }

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = hyper_param_opt(X_dev, y_dev, labels, rs_params, params_space)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best_c1 = rs.best_params_["c1"]
best_c2 = rs.best_params_["c2"]

y_pred = rs.best_estimator_.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   22.2s finished


best params: {'c1': 0.010529268793042638, 'c2': 0.05466491579572729}
best CV score: 0.35136944932248887
model size: 0.62M


0.1921589462256364

In [18]:
# extended context
X_train_cont = [sent2features(s, True) for s in train_sequences]
y_train_cont = [sent2labels(s) for s in train_sequences]

X_dev_cont = [sent2features(s, True) for s in dev_sequences]
y_dev_cont = [sent2labels(s) for s in dev_sequences]

X_test_cont = [sent2features(s, True) for s in test_sequences]
y_test_cont = [sent2labels(s) for s in test_sequences]

In [20]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=best_c1,
    c2=best_c2,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train_cont, y_train_cont)

y_pred = crf.predict(X_test_cont)
metrics.flat_f1_score(y_test_cont, y_pred,
                      average='weighted', labels=labels)

0.13523076229715575

In [104]:
# all extended features
extra = {
        "context": [1],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct', 'is_repeated_punct', 'is_first_capital', 'is_stopword', 'has_emoji']
        }


X_train_ext = [sent2features(s, True, extra) for s in train_sequences]
y_train_ext = [sent2labels(s) for s in train_sequences]

X_dev_ext = [sent2features(s, True, extra) for s in dev_sequences]
y_dev_ext = [sent2labels(s) for s in dev_sequences]

X_test_ext = [sent2features(s, True, extra) for s in test_sequences]
y_test_ext = [sent2labels(s) for s in test_sequences]

In [35]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=best_c1,
    c2=best_c2,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train_ext, y_train_ext)

y_pred = crf.predict(X_test_ext)
metrics.flat_f1_score(y_test_ext, y_pred,
                      average='weighted', labels=labels)

0.15784783065542354

In [36]:
rs = hyper_param_opt(X_dev_ext, y_dev_ext, labels, rs_params, params_space)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best_c1_ext = rs.best_params_["c1"]
best_c2_ext = rs.best_params_["c2"]

y_pred = rs.best_estimator_.predict(X_test_ext)
metrics.flat_f1_score(y_test_ext, y_pred,
                      average='weighted', labels=labels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   35.4s finished


best params: {'c1': 0.022660139525943226, 'c2': 0.02455232191882188}
best CV score: 0.38031014853886297
model size: 0.79M


0.21292233712721764

In [111]:
import gensim

In [118]:
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-twitter-25')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [127]:
sents_only = []

for sent in train_sequences:
    sent_words = []
    for w in sent:
        sent_words.append(w[0])
    sents_only.append(sent_words)

In [139]:
from gensim.models import Word2Vec

model = gensim.models.Word2Vec(sentences=sents_only, vector_size=25)

In [166]:
glove_vectors.save('models/gensim-twitter-25.bin')

In [174]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

glove_file = datapath('glove-twitter-25')
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)


  _ = glove2word2vec(glove_file, tmp_file)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/andreassavva/miniforge3/envs/crf/lib/python3.10/site-packages/gensim/test/test_data/glove-twitter-25'

In [170]:
model_2 = gensim.models.KeyedVectors.load_word2vec_format('test_word2vec.txt', binary=False, unicode_errors='ignore')

FileNotFoundError: [Errno 2] No such file or directory: 'test_word2vec.txt'