 построить  feed forward NN модель на pytorch для задачи NER из 4 дз. разрешается использовать эмбеддинги. Необходимо побить бейзлайны.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [4]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [5]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [6]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])

In [7]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)


CPU times: user 28.2 s, sys: 300 ms, total: 28.5 s
Wall time: 12 s


In [8]:
%%time

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])


CPU times: user 5.79 s, sys: 1.49 s, total: 7.29 s
Wall time: 7.32 s


feed forward NN

In [9]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import pandas as pd
import torch as tt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from tqdm import tqdm_notebook
from tensorboardX import SummaryWriter

In [10]:
class Demo1(nn.Module):
    
    def __init__(self, p):
        super(Demo1, self).__init__()
        self.drop = nn.Dropout(p)
        self.fc1 = nn.Linear(X_train.shape[1], 100)
        
        # out dim = 10 because we have only 10 digits
        self.fc2 = nn.Linear(100, 17)

    def forward(self, x):
        
        # flatten
        x = x.view(x.size(0), -1)
        
        # 1st layer
        x = self.fc1(x.float())
        x = F.elu(x)
        
        # dropout layer
        x = self.drop(x)
        
        # 2nd layer
        x = self.fc2(x)
        return x

In [11]:
def train_nn(model, train_loader, val_loader, n_batches, optimizer, criterion, n_epochs=20, 
             device=tt.device('cpu'),
             mu=0.9, 
             logdir=None,
             checkdir=None,
             reduce_lr_patience=2,
             early_stopping=4,
             verbose=True
            ):
    if logdir:
        sw = SummaryWriter(logdir)
    else:
        sw = None
        
    early_stopping_epochs = 0
    prev_loss = 100500
    history = []
    
    if reduce_lr_patience > 0:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=reduce_lr_patience, verbose=verbose)
    
    for epoch in range(n_epochs):
        
        model.train()
        running_loss = 0
        
        if verbose:
            batch_iter = tqdm_notebook(enumerate(train_loader), total=n_batches, desc='epoch %d' % (epoch + 1), leave=True)
        else:
            batch_iter = enumerate(train_loader)
            
        for i, (X, y) in batch_iter:
            
            X = X.to(device)
            y = y.to(device)
            
            optimizer.zero_grad()
            
            prediction = model(X)
            
            loss = criterion(prediction, y)
            
            loss.backward()
            optimizer.step()
            
            current_loss = loss.data.detach().item()
            running_loss = running_loss * mu + current_loss * (1-mu)
            
            if verbose:
                batch_iter.set_postfix(loss='%.4f' % running_loss)
                
            niter = epoch * n_batches + i
            
            if sw:
                sw.add_scalar('Train/Loss', current_loss, niter)
                
                
        # validation on epoch
        model.eval()
        val_loss = []

        with tt.no_grad():
            for X, y in val_loader:
                X = X.to(device)
                y = y.to(device)

                prediction = model(X)
                loss = criterion(prediction, y)
                loss = loss.data.detach().item()
                val_loss.append( loss )

        val_loss = np.mean(val_loss)

        if verbose:
            print('validation loss=%.4f' % val_loss)

        if sw:
            sw.add_scalar('Validation/Loss', val_loss, epoch)

        if reduce_lr_patience > 0:
            scheduler.step(val_loss)

        if checkdir:
            tt.save(model.state_dict(), checkdir + 'epoch_%d_val_loss_%f' % (epoch, val_loss))


        history.append({
            'epoch': epoch,
            'train_loss': running_loss,
            'val_loss': val_loss,
        })

        if early_stopping > 0:
            if val_loss > prev_loss:
                early_stopping_epochs = 1
            else:
                early_stopping_epochs = 0

            if early_stopping_epochs >= early_stopping:
                if verbose:
                    print('Early stopping, best val_loss=%.4f' % prev_loss)
                break

            prev_loss = min(prev_loss, val_loss)

    return pd.DataFrame(history)

In [12]:
train = TensorDataset(tt.tensor(X_train.toarray()), tt.tensor(y_train))
train_loader = DataLoader(train, batch_size=32, shuffle=True)

val = TensorDataset(tt.tensor(X_test.toarray()), tt.tensor(y_test))
val_loader = DataLoader(val, batch_size=32, shuffle=True)

In [14]:
dropout_rate = 0


device = tt.device('cpu')

model = Demo1(dropout_rate).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

criterion = nn.CrossEntropyLoss()

n_batches = int(np.ceil(len(train_loader.dataset) / 32))

In [15]:
train_nn(model, train_loader, val_loader, n_batches, optimizer, criterion, 
         n_epochs=10,
         device=device
        )


validation loss=0.1854



validation loss=0.1444



validation loss=0.1239



validation loss=0.1140



validation loss=0.1012



validation loss=0.0925



validation loss=0.0917



validation loss=0.0887



validation loss=0.0929



validation loss=0.0924


Unnamed: 0,epoch,train_loss,val_loss
0,0,0.186302,0.185375
1,1,0.133275,0.144405
2,2,0.085888,0.123917
3,3,0.060294,0.114022
4,4,0.077882,0.101222
5,5,0.023641,0.092546
6,6,0.031795,0.09171
7,7,0.025458,0.088666
8,8,0.031854,0.092859
9,9,0.014345,0.092395


In [19]:
def predict(X_test, model):
    xt = tt.tensor(X_test.todense(), dtype=tt.float)
    pred = model.forward(xt)
    pred = tt.softmax(pred, dim=-1)
    pred = pred.detach().numpy()
    predicted_y = np.argmax(pred, axis=1)
    return predicted_y

In [20]:
y_pred = predict(X_test, model)

In [21]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average='macro')

0.8219913233093631

Бейзлайн 0.8122 побит