In [39]:
import os
import sys
import time
import pickle

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
import numpy as np

import allennlp
# from allennlp.common.testing import AllenNlpTestCase, ModelTestCase
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
# from Module import 

In [44]:
raw = pickle.load(open('rnn_input.txt', 'rb'))

In [45]:
print(raw[0:2])

[{'verbs': (('worked_with',), ['shortened', 'stylized_as', 'is', 'is_owned_by']), 'label': 'SUPPORTS'}, {'verbs': (('worked_with',), ['played', 'appearing_as', 'intended_as']), 'label': 'SUPPORTS'}]


In [46]:
x_train, y_train = [], []

In [47]:
for d in raw:
    x_train.append(d['verbs'])
    y_train.append(d['label'])

In [48]:
print(y_train[:100])

['SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES', 'REFUTES', 'REFUTES', 'REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', '

In [49]:
y_train = np.array(y_train)
print(y_train[:20])

['SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'REFUTES'
 'NOT ENOUGH INFO' 'SUPPORTS' 'SUPPORTS' 'NOT ENOUGH INFO'
 'NOT ENOUGH INFO' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS'
 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS']


In [50]:
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y_train)
print(integer_encoded[:20])

[2 2 2 2 2 1 0 2 2 0 0 2 2 2 2 2 2 2 2 2]


In [52]:
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded[:20])

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [53]:
print(x_train[0])

(('worked_with',), ['shortened', 'stylized_as', 'is', 'is_owned_by'])


In [54]:
clf = SVC(gamma='auto')

In [4]:
from nltk.tokenize import word_tokenize
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, LabelField
from typing import Iterator, List, Dict
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.modules.token_embedders import Embedding
from allennlp.data.iterators import BucketIterator,BasicIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

import torch.optim as optim
import numpy as np
import pickle
import torch

class VerbDatasetReader(DatasetReader):

    def __init__(self,sentence_indexers:Dict[str,TokenIndexer]=None )-> None:
        super().__init__(lazy=False)
        self.sentence_indexers=sentence_indexers or {"sentence":SingleIdTokenIndexer()}

    def text_to_instance(self, sentence:List[List],labels:str = None)->Instance:
        sent_tokenized=[]
        for sent in sentence:
            for word in word_tokenize(sent):
                sent_tokenized.append(Token(word))
        sentence_field=TextField(sent_tokenized,self.sentence_indexers)
        fields={'sentence':sentence_field,'labels':LabelField(labels)}
        return Instance(fields)

    def _read(self, file_path: str)->Iterator[Instance]:
        mlinput_merge=pickle.load(open(file_path,'rb'))
        for entry in mlinput_merge[:2000]:
            sentence_input=[entry['claim']]
            for sent in entry['evidence']:
                full_sent=' '.join(sent)
                sentence_input.append(full_sent)
            yield self.text_to_instance(sentence_input,entry['label'])

class Lstm(Model):
    def __init__(self,
                 word_embeddings:TextFieldEmbedder,
                 encoder:Seq2SeqEncoder,
                 vocab:Vocabulary)->None:
        super().__init__(vocab)
        self.word_embeddings=word_embeddings
        self.encoder=encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence:Dict[str,torch.Tensor],
                labels:torch.Tensor==None)->Dict[str,torch.Tensor]:
        mask=get_text_field_mask(sentence)
        print(len(sentence),len(labels))
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

reader = VerbDatasetReader()
train_dataset = reader.read('mlinput_merge.txt')

vocab=Vocabulary.from_instances(train_dataset)

2000it [00:02, 878.27it/s]
100%|██████████| 2000/2000 [00:00<00:00, 25349.12it/s]


In [12]:
vars(vars(train_dataset[0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x12b2ec5c0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [16]:
vocab.get_vocab_size('sentence')

2

In [17]:
vars(vocab)['_token_to_index']

_TokenToIndexDefaultDict(None,
                         {'tokens': {'@@PADDING@@': 0,
                           '@@UNKNOWN@@': 1,
                           ',': 2,
                           '.': 3,
                           'the': 4,
                           'and': 5,
                           'in': 6,
                           'of': 7,
                           'a': 8,
                           'is': 9,
                           '-RRB-': 10,
                           '-LRB-': 11,
                           'N': 12,
                           'O': 13,
                           'The': 14,
                           'was': 15,
                           'by': 16,
                           'for': 17,
                           'as': 18,
                           'to': 19,
                           'film': 20,
                           'on': 21,
                           'an': 22,
                           '``': 23,
                           "'s": 24,
                  

In [18]:
EMBEDDING_DIM = 64
HIDDEN_DIM = 64

# token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('sentence'),
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('sentence'),
                            embedding_dim=EMBEDDING_DIM, padding_index=0)

In [19]:
type(token_embedding)

allennlp.modules.token_embedders.embedding.Embedding

In [20]:
vars(token_embedding)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict([('weight', Parameter containing:
               tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
                         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                       [-0.1245, -0.1443,  0.1845,  0.0281,  0.1360, -0.0480, -0.2813, -0.2301,

In [21]:
word_embeddings = BasicTextFieldEmbedder({"sentence": token_embedding})

In [22]:
vars(word_embeddings)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('token_embedder_sentence', Embedding())]),
 'training': True,
 '_token_embedders': {'sentence': Embedding()},
 '_embedder_to_indexer_map': None,
 '_allow_unmatched_keys': False}

In [23]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [24]:
vars(lstm)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('_module', LSTM(64, 64, batch_first=True))]),
 'training': True,
 'stateful': False,
 '_states': None,
 '_is_bidirectional': False,
 '_num_directions': 1}

In [25]:
model = Lstm(word_embeddings, lstm, vocab)

In [26]:
vars(model)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('word_embeddings', BasicTextFieldEmbedder(
                 (token_embedder_sentence): Embedding()
               )), ('encoder', PytorchSeq2SeqWrapper(
                 (_module): LSTM(64, 64, batch_first=True)
               )), ('hidden2tag',
               Linear(in_features=64, out_features=3, bias=True))]),
 'training': True,
 'vocab': Vocabulary with namespaces:  tokens, Size: 12927 || labels, Size: 3 || Non Padded Namespaces: {'*tags', '*labels'},
 '_regularizer': None,
 'accuracy': <allennlp.training.metrics.categorical_accuracy.CategoricalAccuracy at 0x135413eb8>}

In [27]:
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [28]:
vars(optimizer)

{'defaults': {'lr': 0.1,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False},
 'state': defaultdict(dict, {}),
 'param_groups': [{'params': [Parameter containing:
    tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
              0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
            [-0.1245, -0.1443,  0.1845,  0.0281,  0.1360, -0.0480, -0.2813, -0.2301,
              0.0536, -0.1116,  0.1333,  0.217

In [29]:
iterator = BucketIterator(batch_size=32, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

In [35]:
vars(iterator)

{'vocab': Vocabulary with namespaces:  tokens, Size: 12927 || labels, Size: 3 || Non Padded Namespaces: {'*tags', '*labels'},
 '_batch_size': 32,
 '_max_instances_in_memory': None,
 '_instances_per_epoch': None,
 '_maximum_samples_per_batch': None,
 '_cache_instances': False,
 '_cache': defaultdict(list, {}),
 '_track_epoch': False,
 '_epochs': defaultdict(int, {}),
 '_cursors': {},
 '_sorting_keys': [('sentence', 'num_tokens')],
 '_padding_noise': 0.1,
 '_biggest_batch_first': False}

In [36]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=train_dataset,
                  patience=10,
                  num_epochs=1000)

In [37]:
vars(trainer)['train_data']

[<allennlp.data.instance.Instance at 0x136baf2e8>,
 <allennlp.data.instance.Instance at 0x136bafc50>,
 <allennlp.data.instance.Instance at 0x136bb7780>,
 <allennlp.data.instance.Instance at 0x136bb7be0>,
 <allennlp.data.instance.Instance at 0x136bb7e10>,
 <allennlp.data.instance.Instance at 0x136bbccc0>,
 <allennlp.data.instance.Instance at 0x136bbcf98>,
 <allennlp.data.instance.Instance at 0x136bc11d0>,
 <allennlp.data.instance.Instance at 0x136bc1978>,
 <allennlp.data.instance.Instance at 0x136bc62b0>,
 <allennlp.data.instance.Instance at 0x136bc6d68>,
 <allennlp.data.instance.Instance at 0x136bce048>,
 <allennlp.data.instance.Instance at 0x136bd24a8>,
 <allennlp.data.instance.Instance at 0x136bd7198>,
 <allennlp.data.instance.Instance at 0x136bd7eb8>,
 <allennlp.data.instance.Instance at 0x136bdbe10>,
 <allennlp.data.instance.Instance at 0x136be0a58>,
 <allennlp.data.instance.Instance at 0x136be3710>,
 <allennlp.data.instance.Instance at 0x136be3c50>,
 <allennlp.data.instance.Instan

In [44]:
vars(vars(vars(trainer)['train_data'][0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x12b2ec5c0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [124]:
trainer.train()




  0%|          | 0/63 [00:00<?, ?it/s][A[A[A

1 32


RuntimeError: index out of range at ../aten/src/TH/generic/THTensorEvenMoreMath.cpp:193

In [19]:
vars(vars(vars(trainer)['train_data'][0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x130a934e0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [None]:
print("start saving the weights")
with open("test.th", 'wb') as f:
    torch.save(model.state_dict(), f)
print("saving finished")
print("start saving vocab")
vocab.save_to_files("vocabulary")
print("saving finished")

In [None]:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

print('start doing prediction')
tag_logits = predictor.predict(['The Khmer Empire was not weak.','The Khmer Empire , officially the Angkor Empire , the predecessor state to modern Cambodia -LRB- `` Kampuchea '' or `` Srok Khmer '' to the Khmer people -RRB- , was a powerful Hindu-Buddhist empire in Southeast Asia .'])['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

print("predicting finished")