In [17]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import pandas as pd
import numpy as np
import math
import thinc.extra.datasets

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import spacy
from spacy.util import minibatch, compounding, decaying

In [2]:
# Upper bound on the number of training epochs: the training loop iterates
# `range(n_iter)` and may stop earlier via its early-stopping check.
n_iter = 20

In [3]:
def get_data(validation_ratio=0.1, path='germeval2018.training.txt', seed=None):
    """Load the tab-separated training file and split it into train/validation parts.

    The file is read without a header row as three columns (text, coarse
    label, fine-grained label); the fine-grained column is dropped. Rows are
    shuffled and the first ``ceil(n_rows * validation_ratio)`` shuffled rows
    become the validation set.

    Parameters
    ----------
    validation_ratio : float
        Fraction of rows assigned to the validation set (rounded up).
    path : str or path-like
        Location of the training file. Defaults to the file name the notebook
        has always used, resolved against the current working directory.
    seed : int or None
        When given, the shuffle uses a private ``RandomState(seed)`` so the
        split is reproducible. When ``None`` the global NumPy RNG is used,
        which is the original behaviour.

    Returns
    -------
    tuple
        ``((train_texts, train_labels), (val_texts, val_labels))`` where the
        texts are tuples of strings and the labels are lists of
        ``{'OFFENSE': bool}`` dicts aligned with the texts.
    """
    df_trn = pd.read_csv(
        path, sep='\t', header=None, names=['text', 'bin', 'detail']
    ).drop('detail', axis=1)

    # Shuffle positional indices rather than the frame itself so the split is
    # a simple pair of slices over one permutation.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = np.arange(len(df_trn))
    rng.shuffle(idx)
    val_size = int(math.ceil(len(df_trn) * validation_ratio))

    val_df = df_trn.iloc[idx[:val_size]]
    trn_df = df_trn.iloc[idx[val_size:]]

    def to_cats(frame):
        # One dict per row, flagging whether the coarse label is 'OFFENSE'.
        return [{'OFFENSE': bool(x == 'OFFENSE')} for x in frame['bin'].values]

    return (
        (tuple(trn_df['text'].values), to_cats(trn_df)),
        (tuple(val_df['text'].values), to_cats(val_df)),
    )
    

In [4]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F1 of ``textcat`` on gold-labelled texts.

    Each text is tokenized, passed through ``textcat.pipe`` and every label in
    the resulting ``doc.cats`` that also appears in the matching gold dict is
    compared at a 0.5 threshold. Labels missing from the gold dict are skipped.

    Counts are kept exact and zero denominators are handled explicitly. The
    earlier version seeded every counter with 1e-8, which reported a precision
    of 0.5 when the model produced no true and no false positives.

    Parameters
    ----------
    tokenizer : callable
        Maps a text to the object ``textcat.pipe`` consumes.
    textcat : object
        Must expose ``pipe(docs)`` yielding objects with a ``cats`` mapping of
        label -> score.
    texts : iterable of str
        Texts to score.
    cats : sequence of dict
        Gold annotations, one ``{label: value}`` dict per text, index-aligned
        with ``texts``.

    Returns
    -------
    dict
        ``{'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}``;
        each value is 0.0 when its denominator would be zero.
    """
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # true positives
    fp = 0.0  # false positives
    fn = 0.0  # false negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted = score >= 0.5
            actual = gold[label] >= 0.5
            if predicted and actual:
                tp += 1.0
            elif predicted:
                fp += 1.0
            elif actual:
                fn += 1.0
            # True negatives do not enter precision/recall, so they are not counted.

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if precision + recall > 0:
        f_score = 2 * (precision * recall) / (precision + recall)
    else:
        f_score = 0.0
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}


In [5]:
# Seed NumPy's global RNG before splitting: get_data() shuffles with
# np.random, so without a fixed seed every kernel run evaluates on a different
# dev set and scores from separate runs cannot be compared.
np.random.seed(0)
(train_texts, train_cats), (dev_texts, dev_cats) = get_data()

print("Using examples ({} training, {} evaluation)".format(len(train_texts), len(dev_texts)))

# Pair every training text with its annotation dict in the {'cats': {...}}
# layout that the training loop unpacks and passes to nlp.update().
train_data = [(text, {'cats': cats}) for text, cats in zip(train_texts, train_cats)]
train_data[3:4]

Using examples (4508 training, 501 evaluation)


[('@Frank_Pasemann @KrahMax 😂😂...in Deutschland ist alles normal....wer tatsächlich noch Respekt den deutschen gegenüber bringt gilt schon als Aussätziger.... |LBR| Wir hätten die Juden in Ruhe lassen sollen...wir hätten den Krieg niemals verlieren dürfen.',
  {'cats': {'OFFENSE': False}})]

In [6]:
class TwitterHandleMatcher(object):
    """Pipeline component that re-tags Twitter handles.

    Every token whose text starts with '@' is wrapped in a one-token span and
    merged with the part-of-speech value stored in ``pos_value``.
    """

    def __init__(self, nlp):
        # `nlp` is accepted for a uniform constructor signature but is not used.
        # NOTE(review): 95 is meant to be spaCy's PROPN symbol id; confirm it
        # matches the installed spaCy version.
        self.pos_value = 95 # PROPN

    def __call__(self, doc):
        """Merge each '@'-prefixed token with ``pos_value`` and return ``doc``."""
        for index, token in enumerate(doc):
            if not token.text.startswith('@'):
                continue
            # Single-token span, so merging does not change the token count
            # while the document is being iterated.
            handle = doc[index:index + 1]
            handle.merge(pos=self.pos_value)
        return doc
                

In [15]:
# Base model: Twitter word vectors stored on the author's machine.
# NOTE(review): this absolute path is machine-specific; point it at your own copy.
# Alternative kept from the original notebook:
#nlp = spacy.load('de_core_news_md')
nlp = spacy.load('/Users/michel/innoq/machinelearning/germeval2018/twitter_vec_200')

# Append newly created tagger, parser and NER components, in that order.
DEFAULT_PIPES = ['tagger', 'parser', 'ner']
for pipe_name in DEFAULT_PIPES:
    nlp.add_pipe(nlp.create_pipe(pipe_name), last=True)
print(f'loaded model {nlp.lang}')

# The handle matcher sets POS values, so it is placed right after the tagger.
twitter = TwitterHandleMatcher(nlp)
nlp.add_pipe(twitter, after='tagger')

# Reuse an existing text categorizer if the model ships one, otherwise add one.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

# Single binary label for the coarse offensive-language task.
textcat.add_label('OFFENSE')

print(f'pipeline {nlp.pipe_names}')

loaded model de
pipeline ['tagger', 'TwitterHandleMatcher', 'parser', 'ner', 'textcat']


In [8]:
#doc = nlp(train_texts[3])

In [20]:
%xmode Verbose
# Train only the text categorizer: every other pipeline component is disabled
# for the duration of the `with` block.
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    # NOTE(review): presumably the gradient-clipping threshold of the
    # optimizer; confirm against the thinc optimizer in use.
    optimizer.max_grad_norm = 0.6
    print("Training the model...")
    # Header for the per-epoch table; the leading tab leaves room for the
    # epoch index that each row prints first.
    print('\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    best_f = 0  # highest dev F-score seen so far
    last_f = 0  # dev F-score of the previous epoch
    n_iter_nogain = 0  # consecutive epochs without any improvement (see below)
    # Dropout schedule shared across all epochs: one value is consumed per
    # batch. Presumably starts at 0.45 and decays towards 0.2, per
    # spacy.util.decaying; confirm against the installed spaCy version.
    dropout = decaying(0.45, 0.2, 1e-4)
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        # The batch-size generator is recreated every epoch, so the size
        # schedule restarts from its first value each time.
        batches = minibatch(train_data, size=compounding(1, 8, 1.01))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
        # Evaluate with the optimizer's averaged parameters swapped in.
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in get_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        # Early-stopping bookkeeping: the counter resets when the F-score beats
        # either the best so far or the previous epoch, and grows otherwise.
        if scores['textcat_f'] > best_f:
            best_f = scores['textcat_f']
            n_iter_nogain = 0
        elif scores['textcat_f'] > last_f:
            n_iter_nogain = 0
        else:
            n_iter_nogain += 1
        last_f = scores['textcat_f']
        # Row layout: epoch index, accumulated textcat loss, precision, recall, F.
        print('{4}\t{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f'], i))
        # Stop after four consecutive epochs that improved on neither the best
        # nor the previous F-score.
        if n_iter_nogain > 3:
            print('early stopping')
            break


Exception reporting mode: Verbose
Training the model...
	LOSS 	  P  	  R  	  F  
0	86.283	0.759	0.434	0.552
1	64.526	0.727	0.534	0.616
2	43.696	0.683	0.503	0.579
3	32.662	0.676	0.508	0.580
4	26.190	0.680	0.540	0.602
5	19.169	0.690	0.529	0.599
6	15.901	0.669	0.524	0.588
7	14.385	0.671	0.519	0.585
8	11.039	0.667	0.529	0.590
9	10.520	0.647	0.534	0.586
10	10.945	0.641	0.529	0.580
11	8.094	0.660	0.566	0.610
12	7.519	0.664	0.534	0.592
13	5.268	0.639	0.524	0.576
14	6.019	0.669	0.545	0.601
15	4.473	0.652	0.534	0.587
16	5.199	0.647	0.534	0.586
17	4.056	0.633	0.529	0.576
18	4.486	0.647	0.524	0.579
19	4.686	0.645	0.529	0.581
