In [39]:
import os
import sys
import time
import pickle

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
import numpy as np

import allennlp
# from allennlp.common.testing import AllenNlpTestCase, ModelTestCase
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
# from Module import 

In [44]:
raw = pickle.load(open('rnn_input.txt', 'rb'))

In [45]:
print(raw[0:2])

[{'verbs': (('worked_with',), ['shortened', 'stylized_as', 'is', 'is_owned_by']), 'label': 'SUPPORTS'}, {'verbs': (('worked_with',), ['played', 'appearing_as', 'intended_as']), 'label': 'SUPPORTS'}]


In [46]:
x_train, y_train = [], []

In [47]:
for d in raw:
    x_train.append(d['verbs'])
    y_train.append(d['label'])

In [48]:
print(y_train[:100])

['SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES', 'REFUTES', 'REFUTES', 'REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', '

In [49]:
y_train = np.array(y_train)
print(y_train[:20])

['SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'REFUTES'
 'NOT ENOUGH INFO' 'SUPPORTS' 'SUPPORTS' 'NOT ENOUGH INFO'
 'NOT ENOUGH INFO' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS'
 'SUPPORTS' 'SUPPORTS' 'SUPPORTS' 'SUPPORTS']


In [50]:
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y_train)
print(integer_encoded[:20])

[2 2 2 2 2 1 0 2 2 0 0 2 2 2 2 2 2 2 2 2]


In [52]:
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded[:20])

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [53]:
print(x_train[0])

(('worked_with',), ['shortened', 'stylized_as', 'is', 'is_owned_by'])


In [54]:
clf = SVC(gamma='auto')

In [265]:
from nltk.tokenize import word_tokenize
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, LabelField
from typing import Iterator, List, Dict
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.modules.token_embedders import Embedding
from allennlp.data.iterators import BucketIterator,BasicIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

import torch.optim as optim
import numpy as np
import pickle
import torch

class VerbDatasetReader(DatasetReader):

    def __init__(self,sentence_indexers:Dict[str,TokenIndexer]=None )-> None:
        super().__init__(lazy=False)
        self.sentence_indexers=sentence_indexers or {"sentence":SingleIdTokenIndexer()}

    def text_to_instance(self, sentence:List[List],labels:str = None)->Instance:
        sent_tokenized=[]
        for sent in sentence:
            for word in word_tokenize(sent):
                sent_tokenized.append(Token(word))
        sentence_field=TextField(sent_tokenized,self.sentence_indexers)
        fields={'sentence':sentence_field,'labels':LabelField(labels)}
        return Instance(fields)

    def _read(self, file_path: str)->Iterator[Instance]:
        mlinput_merge=pickle.load(open(file_path,'rb'))
        for entry in mlinput_merge[:2000]:
            sentence_input=[entry['claim']]
            for sent in entry['evidence']:
                full_sent=' '.join(sent)
                sentence_input.append(full_sent)
            yield self.text_to_instance(sentence_input,entry['label'])

class Lstm(Model):
    def __init__(self,
                 word_embeddings:TextFieldEmbedder,
                 encoder:Seq2SeqEncoder,
                 vocab:Vocabulary)->None:
        super().__init__(vocab)
        self.word_embeddings=word_embeddings
        self.encoder=encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence:Dict[str,torch.Tensor],
                labels:torch.Tensor==None)->Dict[str,torch.Tensor]:
        mask=get_text_field_mask(sentence)
#         print(sentence)
        print(labels)
        print(type(labels))
        print(len(sentence),len(labels))
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

reader = VerbDatasetReader()
train_dataset = reader.read('mlinput_merge.txt')

vocab=Vocabulary.from_instances(train_dataset)




0it [00:00, ?it/s][A[A[A
[A
1it [00:00,  1.27it/s][A
100it [00:00,  1.81it/s][A
226it [00:00,  2.59it/s][A
301it [00:01,  3.69it/s][A
413it [00:01,  5.26it/s][A
512it [00:01,  7.50it/s][A
626it [00:01, 10.69it/s][A
742it [00:01, 15.21it/s][A
855it [00:01, 21.60it/s][A
963it [00:01, 30.60it/s][A
1080it [00:01, 43.23it/s][A
1193it [00:01, 60.76it/s][A
1316it [00:01, 84.99it/s][A
1444it [00:02, 118.03it/s][A
1562it [00:02, 161.39it/s][A
1679it [00:02, 217.52it/s][A
1808it [00:02, 288.72it/s][A
1936it [00:02, 376.02it/s][A
100%|██████████| 2000/2000 [00:00<00:00, 24727.58it/s]


In [243]:
vars(vars(train_dataset[0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x1355bc748>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [244]:
vocab.get_vocab_size('tokens')

12927

In [245]:
vars(vocab)

{'_padding_token': '@@PADDING@@',
 '_oov_token': '@@UNKNOWN@@',
 '_non_padded_namespaces': {'*labels', '*tags'},
 '_token_to_index': _TokenToIndexDefaultDict(None,
                          {'tokens': {'@@PADDING@@': 0,
                            '@@UNKNOWN@@': 1,
                            ',': 2,
                            '.': 3,
                            'the': 4,
                            'and': 5,
                            'in': 6,
                            'of': 7,
                            'a': 8,
                            'is': 9,
                            '-RRB-': 10,
                            '-LRB-': 11,
                            'N': 12,
                            'O': 13,
                            'The': 14,
                            'was': 15,
                            'by': 16,
                            'for': 17,
                            'as': 18,
                            'to': 19,
                            'film': 20,
            

In [246]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 100

# token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('sentence'),
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens') + 2,
                            embedding_dim=EMBEDDING_DIM, padding_index=0)

In [247]:
type(token_embedding)

allennlp.modules.token_embedders.embedding.Embedding

In [248]:
vars(token_embedding)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict([('weight', Parameter containing:
               tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                       [-0.0086,  0.0133, -0.0209,  ...,  0.0169,  0.0007, -0.0088],
                       [ 0.0098, -0.0122,  0.0100,  ...,  0.0005,  0.0080, -0.0069],
                       ...,
                       [ 0.0025, -0.0064, -0.0158,  ..., -0.0015, -0.0008, -0.0114],
                       [ 0.0204, -0.0024, -0.0009,  ...,  0.0062,  0.0025, -0.0036],
                       [-0.0214, -0.0035, -0.0040,  ..., -0.0048, -0.0214,  0.0195]],
                      requires_grad=True))]),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 'training': True,
 'num_embeddin

In [249]:
word_embeddings = BasicTextFieldEmbedder({"sentence": token_embedding})

In [250]:
vars(word_embeddings)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('token_embedder_sentence', Embedding())]),
 'training': True,
 '_token_embedders': {'sentence': Embedding()},
 '_embedder_to_indexer_map': None,
 '_allow_unmatched_keys': False}

In [251]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [252]:
vars(lstm)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('_module', LSTM(100, 100, batch_first=True))]),
 'training': True,
 'stateful': False,
 '_states': None,
 '_is_bidirectional': False,
 '_num_directions': 1}

In [253]:
model = Lstm(word_embeddings, lstm, vocab)

In [254]:
vars(model)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1275bc438>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('word_embeddings', BasicTextFieldEmbedder(
                 (token_embedder_sentence): Embedding()
               )), ('encoder', PytorchSeq2SeqWrapper(
                 (_module): LSTM(100, 100, batch_first=True)
               )), ('hidden2tag',
               Linear(in_features=100, out_features=3, bias=True))]),
 'training': True,
 'vocab': Vocabulary with namespaces:  tokens, Size: 12927 || labels, Size: 3 || Non Padded Namespaces: {'*tags', '*labels'},
 '_regularizer': None,
 'accuracy': <allennlp.training.metrics.categorical_accuracy.CategoricalAccuracy at 0x14275cb00>}

In [255]:
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [256]:
vars(optimizer)

{'defaults': {'lr': 0.1,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False},
 'state': defaultdict(dict, {}),
 'param_groups': [{'params': [Parameter containing:
    tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [-0.0086,  0.0133, -0.0209,  ...,  0.0169,  0.0007, -0.0088],
            [ 0.0098, -0.0122,  0.0100,  ...,  0.0005,  0.0080, -0.0069],
            ...,
            [ 0.0025, -0.0064, -0.0158,  ..., -0.0015, -0.0008, -0.0114],
            [ 0.0204, -0.0024, -0.0009,  ...,  0.0062,  0.0025, -0.0036],
            [-0.0214, -0.0035, -0.0040,  ..., -0.0048, -0.0214,  0.0195]],
           requires_grad=True), Parameter containing:
    tensor([[ 0.0975, -0.0424,  0.0068,  ..., -0.0425, -0.0946,  0.0559],
            [ 0.0488,  0.0873,  0.0586,  ..., -0.0147,  0.0143,  0.0446],
            [-0.0161, -0.0360,  0.0964,  ...,  0.0187,  0.0417,  0.0795],
            ...,
            [-0.0229,  0.0704, -0.0854,  ..., -0.0581,  0

In [257]:
iterator = BucketIterator(batch_size=32, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

In [258]:
vars(iterator)

{'vocab': Vocabulary with namespaces:  tokens, Size: 12927 || labels, Size: 3 || Non Padded Namespaces: {'*tags', '*labels'},
 '_batch_size': 32,
 '_max_instances_in_memory': None,
 '_instances_per_epoch': None,
 '_maximum_samples_per_batch': None,
 '_cache_instances': False,
 '_cache': defaultdict(list, {}),
 '_track_epoch': False,
 '_epochs': defaultdict(int, {}),
 '_cursors': {},
 '_sorting_keys': [('sentence', 'num_tokens')],
 '_padding_noise': 0.1,
 '_biggest_batch_first': False}

In [259]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=train_dataset,
                  patience=10,
                  num_epochs=1000)

In [260]:
vars(trainer)['train_data']

[<allennlp.data.instance.Instance at 0x143e44400>,
 <allennlp.data.instance.Instance at 0x143e44e80>,
 <allennlp.data.instance.Instance at 0x143e4c9b0>,
 <allennlp.data.instance.Instance at 0x143e4ce10>,
 <allennlp.data.instance.Instance at 0x143e53048>,
 <allennlp.data.instance.Instance at 0x143e53ef0>,
 <allennlp.data.instance.Instance at 0x143e561d0>,
 <allennlp.data.instance.Instance at 0x143e564a8>,
 <allennlp.data.instance.Instance at 0x143e56c50>,
 <allennlp.data.instance.Instance at 0x143e5b588>,
 <allennlp.data.instance.Instance at 0x143e5bef0>,
 <allennlp.data.instance.Instance at 0x143e621d0>,
 <allennlp.data.instance.Instance at 0x143e68630>,
 <allennlp.data.instance.Instance at 0x143e6c320>,
 <allennlp.data.instance.Instance at 0x143e70080>,
 <allennlp.data.instance.Instance at 0x143e70f98>,
 <allennlp.data.instance.Instance at 0x143e76be0>,
 <allennlp.data.instance.Instance at 0x143e7a898>,
 <allennlp.data.instance.Instance at 0x143e7add8>,
 <allennlp.data.instance.Instan

In [261]:
vars(vars(vars(trainer)['train_data'][0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x1355bc748>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [264]:
trainer.train()



  0%|          | 0/63 [00:00<?, ?it/s][A[A

tensor([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 2, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 0, 1])
<class 'torch.Tensor'>
1 32


RuntimeError: Expected tensor [32, 1], src [768, 3] and index [32, 1] to have the same size apart from dimension 1

In [263]:
labels

NameError: name 'labels' is not defined

In [19]:
vars(vars(vars(trainer)['train_data'][0])['fields']['sentence'])

{'tokens': [Nikolaj,
  Coster-Waldau,
  worked,
  with,
  the,
  Fox,
  Broadcasting,
  Company,
  .,
  The,
  Fox,
  Broadcasting,
  Company,
  -LRB-,
  often,
  shortened,
  to,
  Fox,
  and,
  stylized,
  as,
  FOX,
  -RRB-,
  is,
  an,
  American,
  English,
  language,
  commercial,
  broadcast,
  television,
  network,
  that,
  is,
  owned,
  by,
  the,
  Fox,
  Entertainment,
  Group,
  subsidiary,
  of,
  21st,
  Century,
  Fox,
  .,
  He,
  then,
  played,
  Detective,
  John,
  Amsterdam,
  in,
  the,
  short-lived,
  Fox,
  television,
  series,
  New,
  Amsterdam,
  -LRB-,
  2008,
  -RRB-,
  ,,
  as,
  well,
  as,
  appearing,
  as,
  Frank,
  Pike,
  in,
  the,
  2009,
  Fox,
  television,
  film,
  Virtuality,
  ,,
  originally,
  intended,
  as,
  a,
  pilot,
  .],
 '_token_indexers': {'sentence': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x130a934e0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [None]:
print("start saving the weights")
with open("test.th", 'wb') as f:
    torch.save(model.state_dict(), f)
print("saving finished")
print("start saving vocab")
vocab.save_to_files("vocabulary")
print("saving finished")

In [None]:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

print('start doing prediction')
tag_logits = predictor.predict(['The Khmer Empire was not weak.','The Khmer Empire , officially the Angkor Empire , the predecessor state to modern Cambodia -LRB- `` Kampuchea '' or `` Srok Khmer '' to the Khmer people -RRB- , was a powerful Hindu-Buddhist empire in Southeast Asia .'])['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

print("predicting finished")