In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import datasets

In [6]:
os.environ['HTTP_PROXY'] = "http://127.0.0.1:7897"
os.environ['HTTPS_PROXY'] = "http://127.0.0.1:7897"

In [7]:
dataset = datasets.load_dataset("ccdv/arxiv-summarization", split='train', streaming=True)
raw_dataset = list(dataset.take(3500))

In [17]:
segments = 10
segment_length = 512
chunk_size = segments * segment_length
chunk_size

5120

In [19]:
raw_dataset = [x['article'] for x in raw_dataset]

TypeError: string indices must be integers

In [20]:
# filtering out articles that's shorter than chunk_size
raw_dataset = [x for x in raw_dataset if len(x) > chunk_size]
len(raw_dataset)

3401

In [27]:
# tokenizer
all_text = "".join(raw_dataset)
chars = sorted(list(set(all_text)))
chars

['\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~']

In [28]:
stoi = {char: i for i, char in enumerate(chars)}
itos = {i: char for i, char in enumerate(chars)}
del all_text

In [40]:
raw_dataset[0]

'additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . \n it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . \n many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years \n many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xcite , @xcite , @xcite , @xcite and the references therein . of course

In [41]:
encoded = [np.fromstring(doc, dtype='uint8') for doc in raw_dataset]

  encoded = [np.fromstring(doc, dtype='uint8') for doc in raw_dataset]


In [42]:
encoded[0]

array([ 97, 100, 100, ...,  44,  32,  93], dtype=uint8)

In [46]:
all_encoded = np.concatenate(encoded)
c_chars = set(all_encoded)

In [98]:
len(c_chars)

70

In [49]:
def clip_article(doc, chunk_size):
    remainder = len(doc) % chunk_size
    return doc[:-remainder]

clipped = [clip_article(doc, chunk_size) for doc in encoded]

In [55]:
clipped[0].reshape(-1, chunk_size).shape

(5, 5120)

In [60]:
clipped = [doc.reshape(-1, chunk_size) for doc in clipped]

In [58]:
clipped[1].shape

(3, 5120)

In [71]:
processed_data = torch.tensor(np.concatenate(clipped), dtype=torch.long)
processed_data.shape

torch.Size([20853, 5120])

In [77]:
data_length = processed_data.shape[0]
eighty_split = int(data_length * 0.8)
ninety_split = int(data_length * 0.9)

In [83]:
train_loader = iter(DataLoader(processed_data[:eighty_split], batch_size=8, shuffle=True))
test_loader = iter(DataLoader(processed_data[eighty_split:ninety_split], batch_size=8, shuffle=True))
val_loader = iter(DataLoader(processed_data[ninety_split:], batch_size=8, shuffle=True))

In [85]:
example = next(val_loader)

In [87]:
example.shape

torch.Size([8, 5120])

In [88]:
seq, labels = example[:, :-1], example[:, 1:]

In [89]:
seq

tensor([[ 32, 118, 101,  ...,  32, 105, 110],
        [ 97, 115,  32,  ...,  97, 110,  32],
        [ 10,  32, 112,  ...,  32, 112, 108],
        ...,
        [116,  32, 111,  ..., 104,  32, 105],
        [ 32, 116, 114,  ..., 111, 110,  32],
        [108, 101, 102,  ...,  43,  32, 104]])

In [90]:
labels

tensor([[118, 101,  99,  ..., 105, 110,  32],
        [115,  32, 105,  ..., 110,  32,  99],
        [ 32, 112, 102,  ..., 112, 108,  97],
        ...,
        [ 32, 111, 102,  ...,  32, 105, 110],
        [116, 114,  97,  ..., 110,  32, 111],
        [101, 102, 116,  ...,  32, 104,  97]])

In [92]:
seq.shape == labels.shape

True

In [97]:
# returns tuple of chunks
seq.chunk(10, dim=-1)[0].shape

torch.Size([8, 512])

In [100]:
model = nn.Sequential(
    nn.Embedding(70, 16), # vocab_size, embedding_dim
    nn.Linear(16, 150),
    nn.ReLU(),
    nn.Linear(150, 128),
    nn.ReLU()
)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
model.train()  # changing the model to training mode

Sequential(
  (0): Embedding(70, 16)
  (1): Linear(in_features=16, out_features=150, bias=True)
  (2): ReLU()
  (3): Linear(in_features=150, out_features=128, bias=True)
  (4): ReLU()
)

In [None]:
max_iters = 300

for i in range(max_iters):
    data = next(train_loader)