https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-i-5da6f1c89d84?fbclid=IwAR1zjXlM5w93z47QalvvWeX7OQkIRGL8KK8dAkHepITnk81XFJt_g_FKdVE

The code is heavily inspired from the above blogpost :-) 

In [1]:
import re
import logging

import numpy as np
import pandas as pd
import spacy
import torch
import csv
from torchtext import data
import urllib.request

In [2]:
#The User data 
User_data = pd.read_csv('data/user-info.csv', usecols = ['user.id','doc.id','rating'])
User_data = User_data.rename(columns={'user.id': 'user_id','doc.id': 'doc_id'})


In [3]:
# The article dataset
article_data = pd.read_csv('data/raw-data.csv', usecols = ['doc.id','title','citeulike.id', 'raw.abstract'],encoding = "ISO-8859-1")
article_data = article_data.rename(columns={'raw.abstract': 'abstract','doc.id': 'doc_id','citeulike.id': 'citeulike_id'})


In [4]:
CiteULike_data=pd.merge(User_data,article_data,on="doc_id")

In [5]:
le = max(len(x) for x in CiteULike_data.abstract)
print(le)

122938


In [6]:
NLP = spacy.load('en')
MAX_CHARS = 1229381
VAL_RATIO = 0.2
LOGGER = logging.getLogger("CiteULike_data")

In [7]:
def tokenizer(abs_text):
    abs_text = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", 
        str(abs_text))
    abs_text = re.sub(r"[ ]+", " ", str(abs_text))
    abs_text = re.sub(r"\!+", "!", str(abs_text))
    abs_text = re.sub(r"\,+", ",", str(abs_text))
    abs_text = re.sub(r"\?+", "?", str(abs_text))
    if (len(abs_text) > MAX_CHARS):
        abs_text = abs_text[:MAX_CHARS]
    return [
        x.text for x in NLP.tokenizer(abs_text) if x.text != " "]

In [8]:
VAL_RATIO = 0.2

def prepare_csv(seed=999):
    df_train = CiteULike_data
    df_train["abstract"] = \
        df_train.abstract.str.replace("\n", " ")
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv(
        "cache/dataset_train.csv", index=False)
    df_train.iloc[idx[:val_size], :].to_csv(
        "cache/dataset_val.csv", index=False)

In [9]:
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True
    LOGGER.debug("Preparing CSV files...")
    prepare_csv()
    abs_text = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        tensor_type=torch.LongTensor,
        lower=lower
    )
    LOGGER.debug("Reading train csv file...")
    train, val = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[
            ('abstract', abs_text),
            ('doc_id', data.Field(
                use_vocab=False, sequential=False, tensor_type=torch.ByteTensor)),
            ('rating', data.Field(
                use_vocab=False, sequential=False, tensor_type=torch.ByteTensor)),
            ('title', data.Field(
                use_vocab=False, sequential=False, tensor_type=torch.ByteTensor)),
            ('citeulike_id', data.Field(
                use_vocab=False, sequential=False, tensor_type=torch.ByteTensor)),
            ('user_id', data.Field(
                use_vocab=False, sequential=False, tensor_type=torch.ByteTensor)),
        ])
    LOGGER.debug("Building vocabulary...")
    abs_text.build_vocab(
        train, val,
        max_size=1229381,
        min_freq=20,
        vectors=vectors
    )
    LOGGER.debug("Done preparing the datasets")
    return train, val


In [37]:
train,val=get_dataset(100,lower=False,vectors=None)

In [43]:
def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False, sort=None,sort_key=None):
    dataset_iter = data.Iterator(
        dataset, batch_size=batch_size, device=-1,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=sort,sort_key=sort_key
    )
    return dataset_iter

In [39]:
x = get_iterator(
            train, 100 ,train=True,
            shuffle=True, repeat=False,sort_key=True
        )

In [53]:
next(x)

TypeError: 'Iterator' object is not an iterator

In [51]:
g = get_iterator(
            CiteULike_data, 100 ,train=True,
            shuffle=True, repeat=False,sort_key=True
        )

In [52]:
print(g.dataset.abstract)

0         The Nucleic Acids Research Molecular Biology D...
1         The Nucleic Acids Research Molecular Biology D...
2         The Nucleic Acids Research Molecular Biology D...
3         The Nucleic Acids Research Molecular Biology D...
4         The Nucleic Acids Research Molecular Biology D...
5         The Nucleic Acids Research Molecular Biology D...
6         The Nucleic Acids Research Molecular Biology D...
7         The Nucleic Acids Research Molecular Biology D...
8         The Nucleic Acids Research Molecular Biology D...
9         The Nucleic Acids Research Molecular Biology D...
10        The Nucleic Acids Research Molecular Biology D...
11        The Nucleic Acids Research Molecular Biology D...
12        The Nucleic Acids Research Molecular Biology D...
13        The Nucleic Acids Research Molecular Biology D...
14        The Nucleic Acids Research Molecular Biology D...
15        The Nucleic Acids Research Molecular Biology D...
16        The Nucleic Acids Research Mol

In [44]:
for examples in get_iterator(
            train, 100 ,train=True,
            shuffle=True, repeat=False,sort_key=True
        ):
    x = examples.abstract # (fix_length, batch_size) Tensor
    y = torch.stack([
        examples.doc_id, examples.rating, 
        examples.title,
        examples.citeulike_id, examples.user_id
    ], dim=1)

ValueError: invalid literal for int() with base 10: 'ht tagging paper taxonomy flickr academic article to read'

204986