In [1]:
import pandas as pd
import numpy as np

In [2]:
VAL_RATIO = 0.1  # share of rows for validation; the test split gets the same share


def prepare_csv(seed=999):
    """Split the first 1000 rows of ``data.csv`` into val/test/train CSV files.

    The row positions are permuted with a generator seeded by ``seed``. The
    first ``VAL_RATIO`` share of the permuted rows is written to
    ``cache/dataset_val.csv``, the next equally sized share to
    ``cache/dataset_test.csv`` and all remaining rows to
    ``cache/dataset_train.csv``. The files are written without the index
    column, and the ``cache`` directory is created when it does not exist.

    Args:
        seed: Seed for the row permutation; the same seed always produces
            the same three files for the same input.

    Returns:
        None. The result is the three CSV files on disk.
    """
    import os  # local import: keeps this cell runnable without touching the import cell

    frame = pd.read_csv("data.csv")[:1000]

    # Permute a standalone array of row positions with a private generator.
    # Shuffling ``frame.index.values`` in place can scramble the frame's own
    # index, and reseeding the global NumPy RNG would silently affect any
    # other code that draws random numbers.
    order = np.random.RandomState(seed).permutation(len(frame))

    val_size = int(len(order) * VAL_RATIO)

    # ``to_csv`` does not create missing parent directories.
    os.makedirs("cache", exist_ok=True)

    frame.iloc[order[:val_size]].to_csv(
        "cache/dataset_val.csv", index=False)
    frame.iloc[order[val_size:2 * val_size]].to_csv(
        "cache/dataset_test.csv", index=False)
    frame.iloc[order[2 * val_size:]].to_csv(
        "cache/dataset_train.csv", index=False)

In [3]:
import re
import spacy
# spaCy pipeline; `tokenizer` below only uses its `.tokenizer` component.
# NOTE(review): the 'en' shortcut name is, as far as I know, not accepted by
# spaCy v3+ (which expects e.g. 'en_core_web_sm') - confirm the installed version.
NLP = spacy.load('en')
# Cleaned comments longer than this many characters are truncated before tokenizing.
MAX_CHARS = 20000

In [4]:
def tokenizer(comment):
    """Clean a raw comment and return its spaCy tokens as a list of strings.

    The input is converted with ``str()``, passed through a fixed sequence of
    regex substitutions, cut to at most ``MAX_CHARS`` characters and then
    tokenized with ``NLP.tokenizer``. Tokens whose text is a single space are
    left out of the result.
    """
    substitutions = (
        # listed punctuation/symbol characters (newline and "!" included) -> space
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
        # collapse runs of spaces
        (r"[ ]+", " "),
        # every "!" was already replaced by the first pattern, so this finds nothing
        (r"\!+", "!"),
        # collapse repeated commas and question marks
        (r"\,+", ","),
        (r"\?+", "?"),
    )

    text = str(comment)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Slicing is a no-op for strings that are already short enough.
    text = text[:MAX_CHARS]

    tokens = []
    for token in NLP.tokenizer(text):
        if token.text != " ":
            tokens.append(token.text)
    return tokens

In [5]:
import logging
import torch
from torchtext import data
# Logger for the debug-level progress messages emitted while the datasets are built.
LOGGER = logging.getLogger("reviews_dataset")

In [19]:
def get_dataset(fix_length=100, lower=False, vectors=None):
    """Regenerate the CSV splits and load them as torchtext datasets.

    Calls ``prepare_csv()`` to (re)write the files under ``cache/``, reads
    them with three fields (``meta_id``, ``review_text``, ``theme``, applied
    to the CSV columns in that order, header row skipped) and builds a
    vocabulary for each field.

    Args:
        fix_length: Length every ``review_text`` sequence is padded (at the
            front) or truncated to.
        lower: Whether to lowercase ``review_text``. Forced to ``True`` when
            ``vectors`` is given.
        vectors: Optional pretrained vectors handed to
            ``review_text.build_vocab``.

    Returns:
        Tuple ``(train, val, test)`` of ``TabularDataset`` objects.
    """
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True

    LOGGER.debug("Preparing CSV files...")
    prepare_csv()

    # All three fields are numericalized through a vocabulary, so their values
    # are integer indices. They must be integer tensors (torch.long) to be
    # usable for embedding lookups and as class-index targets; float64 would
    # turn the indices into doubles.
    review_text = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        dtype=torch.long,
        lower=lower
    )

    theme = data.Field(
        use_vocab=True,
        sequential=False,
        dtype=torch.long)

    meta_id = data.Field(
        use_vocab=True,
        sequential=False,
        dtype=torch.long)

    fields = [
        ('meta_id', meta_id),
        ('review_text', review_text),
        ('theme', theme)]

    LOGGER.debug("Reading train csv file...")
    train, val = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=fields
    )

    LOGGER.debug("Reading test csv file...")
    test = data.TabularDataset(
        path='cache/dataset_test.csv', format='csv',
        skip_header=True, fields=fields)

    LOGGER.debug("Building vocabulary...")

    # NOTE(review): the text vocabulary is built from val and test as well as
    # train - confirm this is intended and not an accidental leak.
    review_text.build_vocab(
        train, val, test,
        max_size=30000,
        min_freq=5,
        vectors=vectors
    )

    # Ids and themes must all be representable, so they use every split.
    meta_id.build_vocab(
        train, val, test,
        max_size=None,  # no cap on the number of ids
        min_freq=0,
    )

    theme.build_vocab(
        train, val, test,
        max_size=10,
        min_freq=0,
    )

    LOGGER.debug("Done preparing the datasets")
    return train, val, test

In [20]:
# Build the three splits with the defaults: fix_length=100, no lowercasing, no pretrained vectors.
train_dataset, val_dataset, test_dataset = get_dataset()

In [66]:
def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False,
                 device=None):
    """Wrap ``dataset`` in a torchtext ``Iterator`` sorted by review length.

    Args:
        dataset: Dataset whose examples have a ``review_text`` attribute.
        batch_size: Number of examples per batch.
        train: Passed through to ``data.Iterator``.
        shuffle: Passed through to ``data.Iterator``.
            NOTE(review): the iterator is always created with ``sort=True``,
            which in torchtext appears to take precedence over ``shuffle`` -
            confirm whether shuffled batches are ever expected here.
        repeat: Passed through to ``data.Iterator``.
        device: Device the batches are created on. ``None`` (the default)
            selects ``'cuda'`` when CUDA is available and ``'cpu'`` otherwise,
            so the notebook also runs on machines without a GPU.

    Returns:
        The configured ``data.Iterator``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    return data.Iterator(
        dataset, batch_size=batch_size, device=device,
        train=train, shuffle=shuffle, repeat=repeat,
        # order examples by the number of tokens in the review
        sort_key=lambda example: len(example.review_text),
        sort_within_batch=False,
        sort=True,
    )

In [67]:
#ff = val_dataset.fields['review_text']

In [68]:
# Sanity check: number of examples in the validation split
# (VAL_RATIO = 0.1 of the at most 1000 rows kept by prepare_csv).
len(val_dataset)

100

In [69]:
# Walk the validation split once (no shuffling, no repeat) and show the
# shapes of the model input and target for every batch.
val_batches = get_iterator(
    val_dataset, 32, train=False, shuffle=False, repeat=False)

for examples in val_batches:
    x = examples.review_text  # (fix_length, batch_size) Tensor
    y = examples.theme.unsqueeze(1)  # (batch_size,) -> (batch_size, 1)
    print(x.shape, y.shape)

torch.Size([100, 32]) torch.Size([32, 1])
torch.Size([100, 32]) torch.Size([32, 1])
torch.Size([100, 32]) torch.Size([32, 1])
torch.Size([100, 4]) torch.Size([4, 1])
