In [12]:
import torch
import numpy as np
from torchtext import data

SEED = 1452 # for reproducibility
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize= 'spacy')
LABEL = data.LabelField(dtype=torch.float)


In [3]:
# Load dataset
with open('datasets/winemag-data-130k-v2.csv') as f:
    lines = f.readlines()

print(len(lines))

129976


In [4]:
# Split in train and test
# @Pierre: en fait on utilisera plutot le split de torchtext (voir plus bas)

TEST_SET_SIZE = .3

indices = list(range(len(lines)))
np.random.seed(SEED)
np.random.shuffle(indices)
split_index = int(TEST_SET_SIZE*len(lines))
train_indices = indices[:split_index]
test_indices = indices[split_index:]
train_set = [lines[k] for k in train_indices]
test_set = [lines[k] for k in test_indices]

print(len(train_set))
print(len(test_set))

38992
90984


In [6]:
# preprocess json
# (torchtext needs a file with a new json record per row and not a proper json)
import json
with open('datasets/winemag-data-130k-v2.json') as f:
    jsonfile = json.loads(f.read())
res = ''
print(str(jsonfile[0]))
print(json.dumps(jsonfile[0]))
for record in jsonfile:
    res += json.dumps(record) + '\n'

print(res[:1000])


{'points': '87', 'title': 'Nicosia 2013 Vulkà Bianco  (Etna)', 'description': "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", 'taster_name': 'Kerin O’Keefe', 'taster_twitter_handle': '@kerinokeefe', 'price': None, 'designation': 'Vulkà Bianco', 'variety': 'White Blend', 'region_1': 'Etna', 'region_2': None, 'province': 'Sicily & Sardinia', 'country': 'Italy', 'winery': 'Nicosia'}
{"points": "87", "title": "Nicosia 2013 Vulk\u00e0 Bianco  (Etna)", "description": "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", "taster_name": "Kerin O\u2019Keefe", "taster_twitter_handle": "@kerinokeefe", "price": null, "designation": "Vulk\u00e0 Bianco", "variety": "White Blend", "region_1": "Etna", "region_2": null, "province": "Sicily & Sardinia", "cou

In [7]:
# Write preprocessed json to new file
with open('testfile', 'w') as out:
    out.write(res)

In [8]:
# Setup a dataset from the preprocessed json
train_dataset = data.TabularDataset(
    path='testfile',
    format='json',
    fields={'description': ('description', data.Field(sequential=True)),
            'points': ('points', data.Field(sequential=False))}
)

In [10]:
import random
vars(train_dataset.examples[0])
train_data, valid_data = train_dataset.split(random_state = random.seed(SEED))
print(len(train_data))

90980


In [13]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [14]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 2
