In [1]:
import torch
import torch.optim as optim
import numpy as np

from torchnlp.datasets import snli_dataset

In [2]:
# Load the SNLI training split through torchnlp's snli_dataset helper.
train_data = snli_dataset(train=True)

In [3]:
# Load the development and test splits with the same helper, one split per call.
dev_data = snli_dataset(dev=True)
test_data = snli_dataset(test=True)

In [4]:
import re
import nltk
# Fetch the 'punkt' resource (a no-op when it is already up to date);
# presumably required by nltk.word_tokenize in prepare_dataset — TODO confirm.
nltk.download('punkt')

def prepare_dataset(dataset):
    """Tokenise one dataset split and gather per-class statistics.

    Every example must be a mutable mapping with 'premise', 'hypothesis' and
    'label' entries. Examples are modified in place:

    * 'premise_tokens' / 'hypothesis_tokens' are added to every example
      (including examples that are later skipped);
    * for kept examples, the 'premise_transitions' and
      'hypothesis_transitions' entries are removed when present.

    Only examples labelled 'entailment', 'contradiction' or 'neutral' are
    counted and returned; any other label is skipped.

    Args:
        dataset: iterable of example mappings.

    Returns:
        A 7-tuple ``(count_E, count_C, count_N, l_E, l_C, l_N, output)``:
        the number of entailment / contradiction / neutral examples, then for
        each of those classes a dict ``{'premise': [...], 'hypothesis': [...]}``
        of token counts per kept example, and finally the list of kept
        examples in input order.
    """
    output = []
    # Per-class example counts and sentence lengths, keyed by label so the
    # three classes share a single code path.
    counts = {'entailment': 0, 'contradiction': 0, 'neutral': 0}
    lengths = {label: {'premise': [], 'hypothesis': []} for label in counts}

    for t in dataset:
        premise_tokens = nltk.word_tokenize(t['premise'])
        hypothesis_tokens = nltk.word_tokenize(t['hypothesis'])

        t['premise_tokens'] = premise_tokens
        t['hypothesis_tokens'] = hypothesis_tokens

        label = t['label']
        if label not in counts:
            # Not one of the three classes: keep it out of the statistics
            # and out of the returned examples.
            continue

        counts[label] += 1
        lengths[label]['premise'].append(len(premise_tokens))
        lengths[label]['hypothesis'].append(len(hypothesis_tokens))

        # pop() with a default instead of `del`, so examples that lack the
        # transition entries (e.g. data that was already prepared once) do
        # not raise KeyError.
        t.pop('premise_transitions', None)
        t.pop('hypothesis_transitions', None)

        output.append(t)

    return (counts['entailment'], counts['contradiction'], counts['neutral'],
            lengths['entailment'], lengths['contradiction'], lengths['neutral'],
            output)

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Run prepare_dataset over each split. Every call yields, in order: the
# entailment / contradiction / neutral counts, the matching per-class length
# tables, and the list of retained examples.
# Note: the split names are rebound from the loaded datasets to the prepared
# lists, so re-running this cell processes the prepared lists again.
(tr_e, tr_c, tr_n,
 tr_le, tr_lc, tr_ln,
 train_data) = prepare_dataset(train_data)

(dev_e, dev_c, dev_n,
 dev_le, dev_lc, dev_ln,
 dev_data) = prepare_dataset(dev_data)

(test_e, test_c, test_n,
 test_le, test_lc, test_ln,
 test_data) = prepare_dataset(test_data)

In [6]:
# Bundle everything that is written to disk: the prepared examples of each
# split followed by the per-class counts, per-class length tables and the
# number of retained examples per split.
data = {
    # prepared examples
    'train': list(train_data),
    'dev': list(dev_data),
    'test': list(test_data),
    # number of examples per class and split
    'n_entail': {'train': tr_e, 'dev': dev_e, 'test': test_e},
    'n_contradiction': {'train': tr_c, 'dev': dev_c, 'test': test_c},
    'n_neutral': {'train': tr_n, 'dev': dev_n, 'test': test_n},
    # premise / hypothesis token counts per class and split
    'len_entail': {'train': tr_le, 'dev': dev_le, 'test': test_le},
    'len_contradiction': {'train': tr_lc, 'dev': dev_lc, 'test': test_lc},
    'len_neutral': {'train': tr_ln, 'dev': dev_ln, 'test': test_ln},
    # retained examples per split (sum over the three classes)
    'split_size': {
        'train': sum((tr_e, tr_c, tr_n)),
        'dev': sum((dev_e, dev_c, dev_n)),
        'test': sum((test_e, test_c, test_n)),
    },
}

In [7]:
import os, json

# Serialise the prepared splits and statistics to data/snli_data.json.
# The directory is created first so the cell also works when `data/` does not
# exist yet (open() in 'w' mode does not create missing parent directories).
os.makedirs('data', exist_ok=True)
with open(os.path.join('data', 'snli_data.json'), 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile)

In [8]:
# Number of entailment examples kept in each split.
data['n_entail']

{'train': 183416, 'dev': 3329, 'test': 3368}

In [9]:
# Number of contradiction examples kept in each split.
data['n_contradiction']

{'train': 183187, 'dev': 3278, 'test': 3237}

In [10]:
# Number of neutral examples kept in each split.
data['n_neutral']

{'train': 182764, 'dev': 3235, 'test': 3219}

In [16]:
# Mean premise length (in tokens) over the entailment examples of the training split.
np.mean(data['len_entail']['train']['premise'])

14.02853622366642

In [19]:
# Standard deviation of premise length (in tokens), entailment / training split.
np.std(data['len_entail']['train']['premise'])

5.971090923679299

In [20]:
# Mean hypothesis length (in tokens) over the entailment examples of the training split.
np.mean(data['len_entail']['train']['hypothesis'])

7.452048894316745

In [21]:
# Standard deviation of hypothesis length (in tokens), entailment / training split.
np.std(data['len_entail']['train']['hypothesis'])

2.8489398118916593

In [22]:
# Mean + 2 * std of hypothesis length, using the rounded values displayed for
# the entailment / training split (7.452..., 2.849...). The constants are
# hand-copied, not recomputed from `data`.
# Presumably used to choose a maximum hypothesis length — TODO confirm.
7.45 + 2*2.85

13.15

In [23]:
# Mean + 2 * std of premise length, using the rounded values displayed for
# the entailment / training split (14.028..., 5.971...). The constants are
# hand-copied, not recomputed from `data`.
# Presumably used to choose a maximum premise length — TODO confirm.
14.03 + 2*5.97

25.97

In [12]:
from misc.utilities import timeSince, dump_to_json, create_dir, Preload_embedding, read_json_file

In [13]:
# NOTE(review): `a` is not defined in any visible cell, so this cell raises the
# NameError recorded below. Define the input tensor first or remove the cell.
# Intended use appears to be unpacking torch.max(a, 1) into two results.
aa, bb = torch.max(a, 1)

NameError: name 'a' is not defined

In [None]:
# NOTE(review): relies on `bb` from the torch.max cell, which is recorded as
# failing with a NameError; this cell has no execution count or output.
bb.dtype

In [1]:
import nltk

In [7]:
# Sanity check of the tokenizer on one sentence: punctuation (',' and '.')
# comes out as separate tokens, as the recorded output shows.
nltk.word_tokenize('An old woman in a sunlit room winds rough yarn into balls, the finished yarn balls placed in a pile to her right.')

['An',
 'old',
 'woman',
 'in',
 'a',
 'sunlit',
 'room',
 'winds',
 'rough',
 'yarn',
 'into',
 'balls',
 ',',
 'the',
 'finished',
 'yarn',
 'balls',
 'placed',
 'in',
 'a',
 'pile',
 'to',
 'her',
 'right',
 '.']