In [30]:
# imports

import re

%matplotlib inline
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import spacy
from spacymoji import Emoji

import scipy.stats

import sklearn
from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
# from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.utils._testing import ignore_warnings

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec
import gensim.downloader
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# matplot
plt.style.use('ggplot')

In [31]:
# file read

def conll_read(path: str):
    words = []
    bio_tags = []
    i = 0
    with open(path) as f:
        for line in f:
            i += 1
            splitted = line.strip().split('\t')
            if len(splitted) == 1:
                if splitted[0] == '':
                    words.append('\n')
                    bio_tags.append('\n')
                else:
                    # special case for last line of dev set
                    # maybe we just skip it altogether
                    words.append(splitted[0])
                    bio_tags.append('O')
            else:
                words.append(splitted[0])
                bio_tags.append(splitted[1])
    return words, bio_tags

In [32]:
# calls nltk 
def add_pos_tags(tokens: list):
    tagged_train = nltk.pos_tag(tokens)
    return zip(*tagged_train)

In [33]:
# create the sentences with their features
def create_sents(seqs):
    seperators = [i for i, item in enumerate(seqs) if item[0] == '\n']
    sents = []
    for idx, pos in enumerate(seperators):
        start = seperators[idx - 1] + 1
        end = seperators[idx]
        
        if idx == 0:
            start = 0
            end = pos
    
        sequence = seqs[start: end]
        sents.append(sequence)
    return sents

In [34]:
# train set
words, bio_tags = conll_read('W-NUT_data/wnut17train.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
train_sequences = create_sents(complete)

# dev set
words, bio_tags = conll_read('W-NUT_data/emerging.dev.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
dev_sequences = create_sents(complete)

# test set
words, bio_tags = conll_read('W-NUT_data/emerging.test.annotated')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
test_sequences = create_sents(complete)

In [35]:
# extra features
is_hashtag_regex = re.compile(r"#(\w+)?")
is_mention_regex = re.compile(r"^@(\w+)?")
is_money_regex = re.compile(r"^$(\w+)?")
is_url_regex = re.compile(r"(https?:\/\/(?:www\.|(?!www))|www\.|www\.)")
is_punct_reg = re.compile(r"^[\.\,!\?\"\':;_\-]$")
is_repeated_punct_reg = re.compile(r"^[\.\,!\?\"\':;_\-]{2,}$")
is_first_capital_reg = re.compile(r"^[A-Z][a-z]+")
stop_words_set = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('emoji', first=True)

lemmatizer = WordNetLemmatizer()

def is_hashtag(word: str) -> bool:
    return bool(is_hashtag_regex.match(word))

def is_mention(word: str) -> bool:
    return bool(is_mention_regex.match(word))

def is_money(word: str) -> bool:
    return bool(is_money_regex.match(word))

def is_url(word: str) -> bool:
    return bool(is_url_regex.match(word))

def is_punct(word: str) -> bool:
    return bool(is_punct_reg.match(word))

def is_repeated_punct(word: str) -> bool:
    return bool(is_repeated_punct_reg.match(word))

def is_stopword(word: str) -> bool:
    return word.lower() in stop_words_set

def is_first_capital(word: str) -> bool:
    return bool(is_first_capital_reg.match(word))

def has_emoji(word: str) -> bool:
    doc = nlp(word)
    return doc._.has_emoji

In [36]:
# for word embeddings models

sents_only = []

for sent in train_sequences:
    sent_words = []
    for w in sent:
        processed_word = lemmatizer.lemmatize(w[0].lower())
        sent_words.append(processed_word)
    sents_only.append(sent_words)

In [37]:
# pre-trained model, glove-twitter-25
glove_vectors = gensim.downloader.load('glove-twitter-25')

# custom model
vector_size = 5
model = Word2Vec(vector_size=vector_size)
model.build_vocab(sents_only)  # prepare the model vocabulary
model.train(sents_only, total_examples=model.corpus_count, epochs=model.epochs)

(166654, 313650)

In [38]:
def apply_w2v(word: str):
    processed_word = lemmatizer.lemmatize(word.lower())
    if word in model.wv:
        pred = model.wv.get_vector(processed_word)
        if vector_size == 1:
            return pred[0]
        
        return pred
        
    return np.zeros(vector_size)


def apply_glove(word: str):
    processed_word = lemmatizer.lemmatize(word.lower())
    if word in glove_vectors:
        pred = glove_vectors[processed_word]
        if vector_size == 1:
            return pred[0]
        
        return pred
        
    return np.zeros(vector_size)

In [39]:
feature_regex = {
                    'is_mention': is_mention_regex,
                    'is_money': is_money_regex,
                    'is_url': is_url_regex,
                    'is_hashtag': is_hashtag_regex,
                    'is_punct': is_punct_reg,
                    'is_repeated_punct': is_repeated_punct_reg,
                    'is_first_capital': is_first_capital_reg,
                }

def for_features(sent, i, features, features_add, context):
    
    for feat in features_add:
        add_in = ''
        word = sent[i][0]
        features = add_feature(word, features, feat, add_in)
        for c in context:
            if c == 0:
                continue
            if c <= i:
                word = sent[i - c][0]
                add_in = f'-{c}:'
                features = add_feature(word, features, feat, add_in)
            if i < (len(sent) - c):
                word = sent[i + c][0]
                add_in = f'+{c}:'
                features = add_feature(word, features, feat, add_in)
        
    return features

def add_feature(word, features, add_feature, context):
    if add_feature == 'has_emoji':
        feature_value = has_emoji(word)
    elif add_feature == 'is_stopword':
        feature_value = is_stopword(word)
    elif add_feature == 'lemma':
        feature_value = lemmatizer.lemmatize(word)
    elif add_feature == 'w2v':
        feature_values = apply_w2v(word)
        for index, feature_value in enumerate(feature_values):
            features.update({f'{context}word.{add_feature}{index}': feature_value})
        return features
    
    elif add_feature == 'glove':
        feature_values = apply_glove(word)
        for index, feature_value in enumerate(feature_values):
            features.update({f'{context}word.{add_feature}{index}': feature_value})
        return features
            
    else:
        reg = feature_regex[add_feature]
        feature_value = bool(reg.match(word))
    
    features.update({f'{context}word.{add_feature}': feature_value})
    return features

In [40]:
def preword2feat(sent, i, extended, extra=None):
    features = word2features(sent, i)
    if extended:
        features = extended_context(features=features, sent=sent, i=i)
    if extra:
        context = extra["context"]
        extra_features = extra["features"]
        features = for_features(sent, i, features, extra_features, context)
    return features

def extended_context(features, sent, i):

    if i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    return features

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent, extended=False, extra=None):
    return [preword2feat(sent, i, extended=extended, extra=extra) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [41]:

def hyper_param_opt(x_train: list, y_train: list,x_dev: list, y_dev: list, labels: list, params: dict, hyper_params: dict):
  crf = sklearn_crfsuite.CRF(
                              algorithm=params["algorithm"],
                              max_iterations=params["max_iter"],
                              all_possible_transitions=params["poss_trans"]
                            )

  # use the same metric for evaluation
  f1_scorer = make_scorer(metrics.flat_f1_score,
                      average='weighted', labels=labels)

  train_set_x = x_train + x_dev
  train_set_y = y_train + y_dev

  test_fold = [-1 if i < len(x_train) else 0 for i in range(len(train_set_x))]
  ps = PredefinedSplit(test_fold)
  
  # search
  rs = RandomizedSearchCV(crf, hyper_params,
                          cv=ps,
                          verbose=0,
                          n_jobs=-1,
                          n_iter=50,
                          scoring=f1_scorer,
                          random_state=1)

  rs.fit(train_set_x, train_set_y)

  return rs

### Baseline

In [42]:
# create X and y for train, dev, test
X_train = [sent2features(s) for s in train_sequences]
y_train = [sent2labels(s) for s in train_sequences]

X_dev = [sent2features(s) for s in dev_sequences]
y_dev = [sent2labels(s) for s in dev_sequences]

X_test = [sent2features(s) for s in test_sequences]
y_test = [sent2labels(s) for s in test_sequences]

In [43]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
labels

y_pred = crf.predict(X_test)

print('F1-Score:', metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

F1-Score: 0.13829694021844366
                 precision    recall  f1-score   support

     B-location      0.357     0.200     0.256       150
     I-location      0.304     0.074     0.120        94
        B-group      0.318     0.042     0.075       165
  B-corporation      0.000     0.000     0.000        66
       B-person      0.570     0.133     0.216       429
B-creative-work      0.250     0.021     0.039       142
      B-product      0.500     0.024     0.045       127
       I-person      0.583     0.214     0.313       131
I-creative-work      0.286     0.037     0.065       218
  I-corporation      0.000     0.000     0.000        22
        I-group      0.400     0.086     0.141        70
      I-product      0.214     0.048     0.078       126

      micro avg      0.420     0.089     0.147      1740
      macro avg      0.315     0.073     0.112      1740
   weighted avg      0.386     0.089     0.138      1740



In [44]:
rs_params = {
                "algorithm": 'lbfgs',
                "max_iter": 100,
                "poss_trans": True
            }

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = hyper_param_opt(X_train, y_train, X_dev, y_dev, labels, rs_params, params_space)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best_c1 = rs.best_params_["c1"]
best_c2 = rs.best_params_["c2"]

y_pred = rs.best_estimator_.predict(X_test)

print('F1-Score:', metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))



best params: {'c1': 0.44495286986762533, 'c2': 0.06015838309715876}
best CV score: 0.17831900525216518
model size: 0.69M
F1-Score: 0.2057116353635765
                 precision    recall  f1-score   support

     B-location      0.357     0.267     0.305       150
     I-location      0.333     0.128     0.185        94
        B-group      0.300     0.036     0.065       165
  B-corporation      0.400     0.030     0.056        66
       B-person      0.609     0.273     0.377       429
B-creative-work      0.500     0.063     0.112       142
      B-product      0.389     0.055     0.097       127
       I-person      0.564     0.237     0.333       131
I-creative-work      0.378     0.064     0.110       218
  I-corporation      0.333     0.045     0.080        22
        I-group      0.227     0.071     0.109        70
      I-product      0.214     0.071     0.107       126

      micro avg      0.452     0.145     0.220      1740
      macro avg      0.384     0.112     0.161    

### Extended Context

In [45]:
# extended context
X_train_cont = [sent2features(s, True) for s in train_sequences]
y_train_cont = [sent2labels(s) for s in train_sequences]

X_dev_cont = [sent2features(s, True) for s in dev_sequences]
y_dev_cont = [sent2labels(s) for s in dev_sequences]

X_test_cont = [sent2features(s, True) for s in test_sequences]
y_test_cont = [sent2labels(s) for s in test_sequences]

In [46]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=best_c1,
    c2=best_c2,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train_cont, y_train_cont)

y_pred = crf.predict(X_test_cont)

print('F1-Score:', metrics.flat_f1_score(y_test_cont, y_pred,
                      average='weighted', labels=labels))

print(metrics.flat_classification_report(y_test_cont, y_pred, labels=labels, digits=3))

F1-Score: 0.15152805308205766
                 precision    recall  f1-score   support

     B-location      0.449     0.267     0.335       150
     I-location      0.433     0.138     0.210        94
        B-group      0.320     0.048     0.084       165
  B-corporation      0.000     0.000     0.000        66
       B-person      0.512     0.145     0.225       429
B-creative-work      0.222     0.028     0.050       142
      B-product      0.250     0.008     0.015       127
       I-person      0.462     0.229     0.306       131
I-creative-work      0.256     0.046     0.078       218
  I-corporation      0.000     0.000     0.000        22
        I-group      0.375     0.086     0.140        70
      I-product      0.308     0.032     0.058       126

      micro avg      0.423     0.102     0.165      1740
      macro avg      0.299     0.086     0.125      1740
   weighted avg      0.359     0.102     0.152      1740



  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
rs = hyper_param_opt(X_train_cont, y_train_cont, X_dev_cont, y_dev_cont, labels, rs_params, params_space)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

# best_c1 = rs.best_params_["c1"]
# best_c2 = rs.best_params_["c2"]

y_pred = rs.best_estimator_.predict(X_test_cont)

print('F1-Score:', metrics.flat_f1_score(y_test_cont, y_pred,
                      average='weighted', labels=labels))

print(metrics.flat_classification_report(y_test_cont, y_pred, labels=labels, digits=3))



best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.17996364460883024
model size: 0.80M
F1-Score: 0.2304239705395028
                 precision    recall  f1-score   support

     B-location      0.442     0.280     0.343       150
     I-location      0.415     0.181     0.252        94
        B-group      0.238     0.030     0.054       165
  B-corporation      0.500     0.045     0.083        66
       B-person      0.576     0.310     0.403       429
B-creative-work      0.333     0.056     0.096       142
      B-product      0.250     0.039     0.068       127
       I-person      0.514     0.290     0.371       131
I-creative-work      0.415     0.101     0.162       218
  I-corporation      0.250     0.045     0.077        22
        I-group      0.556     0.071     0.127        70
      I-product      0.312     0.119     0.172       126

      micro avg      0.470     0.169     0.249      1740
      macro avg      0.400     0.131     0.184    

### Extended Features Test

In [52]:
def create_xy(sequences, extended, extra):
    x = [sent2features(s, extended, extra) for s in sequences]
    y = [sent2labels(s) for s in sequences]
    return x, y


def run_train(train_x, train_y):
    crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=best_c1,
                c2=best_c2,
                max_iterations=100,
                all_possible_transitions=True
                )

    crf.fit(train_x, train_y)

    return crf

def run_test(extra, labels, fname):
    X_train_ext, y_train_ext = create_xy(train_sequences, True, extra)
    X_dev_ext, y_dev_ext = create_xy(dev_sequences, True, extra)
    X_test_ext, y_test_ext = create_xy(test_sequences, True, extra)
    crf = run_train(X_train_ext, y_train_ext)

    y_pred = crf.predict(X_test_ext)

    train_f1 = metrics.flat_f1_score(y_test_ext, y_pred, average='weighted', labels=labels)

    train_report = metrics.flat_classification_report(y_test_ext, y_pred, labels=labels, digits=3, output_dict=True)

    rs = hyper_param_opt(X_train_ext, y_train_ext, X_dev_ext, y_dev_ext, labels, rs_params, params_space)

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

    best_c1 = rs.best_params_["c1"]
    best_c2 = rs.best_params_["c2"]

    y_pred = rs.best_estimator_.predict(X_test_ext)

    hpo_f1 = metrics.flat_f1_score(y_test_ext, y_pred, average='weighted', labels=labels)

    hpo_report = metrics.flat_classification_report(y_test_ext, y_pred, labels=labels, digits=3, output_dict=True)

    df_train = pd.DataFrame(train_report).transpose()
    df_train  = df_train.reindex(df_train.index.values.tolist()+['HPO'])
    df_hpo = pd.DataFrame(hpo_report).transpose()
    final = pd.concat([df_train, df_hpo])
    final.to_csv(f'results/{fname}')

In [53]:
# all extended features
all_features = ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct', 'is_repeated_punct', 'is_first_capital', 'is_stopword', 'has_emoji', 'lemma', 'w2v', 'glove']

# test features individually
for feat in all_features:
        extra = {
                "context": [0],
                "features": [feat]
                }
        
        run_test(extra, labels, fname=f'0-{feat}')



best params: {'c1': 0.27166968530912255, 'c2': 0.057793985485471114}
best CV score: 0.18468059963870465
model size: 1.01M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.17996364460883024
model size: 0.80M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.5927393689073626, 'c2': 0.026737072845138946}
best CV score: 0.18094767007406312
model size: 0.68M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.42679602002848455, 'c2': 0.007933879076308518}
best CV score: 0.18467416655568344
model size: 0.75M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.25348058103431914, 'c2': 0.009037391614426122}
best CV score: 0.1825561213874408
model size: 0.90M




best params: {'c1': 0.27166968530912255, 'c2': 0.057793985485471114}
best CV score: 0.18270998097458646
model size: 1.02M




best params: {'c1': 0.16968094082387417, 'c2': 0.006964745469403186}
best CV score: 0.19832554972071642
model size: 0.95M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.42679602002848455, 'c2': 0.007933879076308518}
best CV score: 0.18369063409328557
model size: 0.74M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.17996364460883024
model size: 0.80M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.44495286986762533, 'c2': 0.06015838309715876}
best CV score: 0.18068208176721123
model size: 1.03M




best params: {'c1': 0.5886282974499543, 'c2': 0.01895447350848429}
best CV score: 0.1807484386328053
model size: 0.67M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.27166968530912255, 'c2': 0.057793985485471114}
best CV score: 0.20400492781078583
model size: 1.01M


In [54]:
extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct', 'is_repeated_punct', 'is_first_capital', 'is_stopword']
        }

run_test(extra, labels, fname='0-is_regex')



best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.19672434353563523
model size: 0.78M


In [55]:
extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'w2v']
        }

run_test(extra, labels, fname='0-is_regex_w2v')



best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.2045543114447928
model size: 0.78M


In [56]:
extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'lemma']
        }

run_test(extra, labels, fname='0-is_regex_lemma')

  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.27166968530912255, 'c2': 0.057793985485471114}
best CV score: 0.19147617477254542
model size: 1.20M


In [57]:
extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'glove']
        }

run_test(extra, labels, fname='0-is_regex_glove')



best params: {'c1': 0.2622398782739819, 'c2': 0.01352663045303994}
best CV score: 0.2251858760682679
model size: 0.87M


In [58]:
extra = {
        "context": [1],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct', 'is_repeated_punct', 'is_first_capital', 'is_stopword']
        }

run_test(extra, labels, fname='1-is_regex')

extra = {
        "context": [1],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'w2v']
        }

run_test(extra, labels, fname='1-is_regex_w2v')

extra = {
        "context": [1],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'lemma']
        }

run_test(extra, labels, fname='1-is_regex_lemma')

extra = {
        "context": [1],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'glove']
        }

run_test(extra, labels, fname='1-is_regex_glove')



best params: {'c1': 0.07935479759733695, 'c2': 0.004844193582686673}
best CV score: 0.19967063495601692
model size: 1.08M




best params: {'c1': 0.18801930645448509, 'c2': 0.05893517492215625}
best CV score: 0.19809596075586017
model size: 1.14M


  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.2622398782739819, 'c2': 0.01352663045303994}
best CV score: 0.1995374087309951
model size: 1.23M




best params: {'c1': 0.16968094082387417, 'c2': 0.006964745469403186}
best CV score: 0.2315624445996689
model size: 0.92M


In [59]:
vector_size = 25

model = Word2Vec(vector_size=vector_size)
model.build_vocab(sents_only)
model.train(sents_only, total_examples=model.corpus_count, epochs=model.epochs)

glove_file = 'models/glove.twitter.27B.25d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

model.build_vocab([list(glove_vectors.index_to_key)], update=True)
model.wv.vectors_lockf = np.ones(len(model.wv))
model.wv.intersect_word2vec_format(tmp_file, binary=False)

total_examples = model.corpus_count
model.train(sents_only, total_examples=total_examples, epochs=model.epochs)

extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct',  'is_repeated_punct', 'is_first_capital', 'is_stopword', 'w2v']
        }

run_test(extra, labels, fname='0-is_regex_w2v_finetuned')

  _ = glove2word2vec(glove_file, tmp_file)


best params: {'c1': 0.2622398782739819, 'c2': 0.01352663045303994}
best CV score: 0.1978379783387171
model size: 0.87M


In [60]:
extra = {
        "context": [0],
        "features": ['w2v']
        }

run_test(extra, labels, fname='0-is_w2v_finetuned')

  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.44495286986762533, 'c2': 0.06015838309715876}
best CV score: 0.18602764494418067
model size: 0.86M


In [62]:
extra = {
        "context": [1],
        "features": ['w2v']
        }

run_test(extra, labels, fname='1-w2v_finetuned')

  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.338216973284279, 'c2': 0.0027419547244570343}
best CV score: 0.19626975768350075
model size: 0.79M


In [63]:
extra = {
        "context": [0],
        "features": ['is_mention', 'is_money', 'is_url', 'is_hashtag', 'is_punct', 'is_first_capital', 'is_stopword', 'has_emoji', 'lemma', 'w2v']
        }

run_test(extra, labels, fname='0-everything_finetuned')

  _warn_prf(average, modifier, msg_start, len(result))


best params: {'c1': 0.25348058103431914, 'c2': 0.009037391614426122}
best CV score: 0.19453053063297387
model size: 1.03M
