In [33]:
import jsonlines

import re
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_sm")

from tqdm import tqdm

In [75]:
doc = nlp('A person on a horse jumps over a broken down airplane $1 ss-ss')
[w.text for w in doc]

['A',
 'person',
 'on',
 'a',
 'horse',
 'jumps',
 'over',
 'a',
 'broken',
 'down',
 'airplane',
 '$',
 '1',
 'ss',
 '-',
 'ss']

'A'

In [68]:
from spacy.lang.en import English
tokenizer = English()
texts = [u"One document.", u"...", u"Lots of documents"]
for doc in tokenizer.pipe(texts, batch_size=50):
    

Lots

In [28]:
import nltk
nltk.word_tokenize('A person on a horse jumps over a broken down airplane.')

['A',
 'person',
 'on',
 'a',
 'horse',
 'jumps',
 'over',
 'a',
 'broken',
 'down',
 'airplane',
 '.']

In [54]:
reader = jsonlines.open('./data/snli/snli_1.0/snli_1.0_train.jsonl')

In [62]:
for obj in reader.iter():
    break

In [63]:
obj

{'annotator_labels': ['neutral'],
 'captionID': '2267923837.jpg#2',
 'gold_label': 'neutral',
 'pairID': '2267923837.jpg#2r1n',
 'sentence1': 'Children smiling and waving at camera',
 'sentence1_binary_parse': '( Children ( ( ( smiling and ) waving ) ( at camera ) ) )',
 'sentence1_parse': '(ROOT (NP (S (NP (NNP Children)) (VP (VBG smiling) (CC and) (VBG waving) (PP (IN at) (NP (NN camera)))))))',
 'sentence2': 'They are smiling at their parents',
 'sentence2_binary_parse': '( They ( are ( smiling ( at ( their parents ) ) ) ) )',
 'sentence2_parse': '(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VBG smiling) (PP (IN at) (NP (PRP$ their) (NNS parents)))))))'}

In [43]:
def prepare_dataset(dataset, tokenizer):
    
    output = []
    stats = {}
    # counts of each class
    count_E = 0
    count_C = 0
    count_N = 0
    # lengths of sentences
    l_E = {'premise':[], 'hypothesis':[]}
    l_C = {'premise':[], 'hypothesis':[]}
    l_N = {'premise':[], 'hypothesis':[]}
    
    with jsonlines.open(dataset) as reader:
        pbar = tqdm(total=len(reader))
        for t in reader:
            pbar.update(1)
            tmp = {}
            premise = t['sentence1']
            hypothesis = t['sentence2']
            premise_tokens = [w for w in tokenizer(premise)]
            hypothesis_tokens = [w for w in tokenizer(hypothesis)]

            tmp['premise'] = premise
            tmp['hypothesis'] = hypothesis
            tmp['premise_tokens'] = premise_tokens
            tmp['hypothesis_tokens'] = hypothesis_tokens
            tmp['label'] = t['gold_label']
            
            if t['gold_label'] == 'neutral':
                count_N += 1
                l_N['premise'].append(len(premise_tokens))
                l_N['hypothesis'].append(len(hypothesis_tokens))
            elif t['gold_label'] == 'contradiction':
                count_C += 1
                l_C['premise'].append(len(premise_tokens))
                l_C['hypothesis'].append(len(hypothesis_tokens))
            elif t['gold_label'] == 'entailment':
                count_E += 1
                l_E['premise'].append(len(premise_tokens))
                l_E['hypothesis'].append(len(hypothesis_tokens))
            else:
                continue

            output.append(tmp)
        
    return count_E, count_C, count_N, l_E, l_C, l_N, output

In [44]:
tr_e, tr_c, tr_n, tr_le, tr_lc, tr_ln, train_data = prepare_dataset('./data/snli/snli_1.0/snli_1.0_train.jsonl', nlp)
dev_e, dev_c, dev_n, dev_le, dev_lc, dev_ln, dev_data = prepare_dataset('./data/snli/snli_1.0/snli_1.0_dev.jsonl', nlp)
test_e, test_c, test_n, test_le, test_lc, test_ln, test_data = prepare_dataset('./data/snli/snli_1.0/snli_1.0_test.jsonl', nlp)

TypeError: object of type 'Reader' has no len()

In [None]:
data = {'train': list(train_data), 'dev': list(dev_data), 'test': list(test_data), 
        'n_entail': {'train': tr_e, 'dev':dev_e, 'test':test_e},
        'n_contradiction': {'train':tr_c, 'dev':dev_c, 'test':test_c}, 
        'n_neutral': {'train':tr_n, 'dev':dev_n, 'test':test_n}, 
        'len_entail': {'train': tr_le, 'dev':dev_le, 'test':test_le},
        'len_contradiction': {'train':tr_lc, 'dev':dev_lc, 'test':test_lc}, 
        'len_neutral': {'train':tr_ln, 'dev':dev_ln, 'test':test_ln}, 
        'split_size': {'train':tr_e + tr_c + tr_n, 'dev':dev_e+dev_c+dev_n, 'test':test_e+test_c+test_n}}