In [2]:
!pip install transformers
!pip install torchmetrics
!pip install evaluate



##Load up the libraries

In [41]:
import numpy as np
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load up the tokenizer and the model

In [32]:
tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-tiny")
model = T5ForConditionalGeneration.from_pretrained("google/t5-efficient-tiny")

## A simple sentence

In [39]:
input_sequence = "Today's class is about Transformers for the Humanities and Social Sciences Department"

## Tokenize the sentence and convert to IDs

In [40]:
encoding = tokenizer(
    [input_sequence],
    padding="longest",
    max_length=512,
    truncation=True,
    return_tensors="pt",
)
print(encoding, "\n\n\n")
print(tokenizer.convert_ids_to_tokens(encoding.input_ids[0]))
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

{'input_ids': tensor([[ 1960,    31,     7,   853,    19,    81, 31220,     7,    21,     8,
          3892,  2197,    11,  2730,  9226,  1775,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])} 



['▁Today', "'", 's', '▁class', '▁is', '▁about', '▁Transformer', 's', '▁for', '▁the', '▁Human', 'ities', '▁and', '▁Social', '▁Sciences', '▁Department', '</s>']


In [8]:
# outputs = model.generate(input_ids, max_length=50)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

spectra of samples from 7 plant species were used to explore the influence of preprocessing and feature extraction on efficiency of machine learning algorithms. Wavelet Tensor Train ( WTT ) and Discrete Wavelet Transform


## Load up the libraries

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import AutoTokenizer, T5Model, T5ForConditionalGeneration
from datasets import load_dataset

## A function to clean text

In [10]:
def preprocess(x):
    x = re.sub('<.*?>', ' ', x)
    x = re.sub('http\S+', ' ', x)
    x = re.sub('\s+', ' ', x)
    return x.lower().strip()

## A pre-processing function

In [11]:
def pipeline(dataframe):
    # Pre-process the sentences
    dataframe['text'] = dataframe['text'].apply(lambda x: preprocess(x))

    # Pre-pend CLS token to each sentence
    sentences = ["[CLS] " + s for s in dataframe.text.values]

    # Extract labels
    labels = dataframe.label.values

    # Tokenize each sentence
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenized = [tokenizer.tokenize(s) for s in tqdm(sentences)]

    # Append the SEP token and also set a threshold for the number of tokens in a sentence
    MAX_LEN_TRAIN, MAX_LEN_TEST = 140, 140
    tokenized = [t[:(MAX_LEN_TRAIN-1)]+['SEP'] for t in tokenized]

    # Generate IDs of each token and add padding to sentences smaller than given threshold
    ids = [tokenizer.convert_tokens_to_ids(t) for t in tqdm(tokenized)]
    ids = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)), mode='constant') for i in ids])

    # Also generate Attention masks. An attention mask is a binary tensor
    # that indicates the position of padded indices so that the model does not attend to them
    amasks = np.asarray([[float(i>0) for i in seq] for seq in tqdm(ids)])

    return torch.tensor(ids), torch.tensor(labels), torch.tensor(amasks)

## Load the IMDB Dataset in pandas form

In [12]:
dataset = load_dataset('imdb')
df_train = dataset['train'].to_pandas()
display(df_train.head())

df_val = dataset['test'].to_pandas()
display(df_val.head())

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


## Create a class that will hold all the basic configuration

In [43]:
class Config:
    def __init__(self):
        super(Config, self).__init__()

        self.SEED = 42
        self.MODEL_PATH = 'google/t5-efficient-tiny'

        # data
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH)
        self.SRC_MAX_LENGTH = 320
        self.TGT_MAX_LENGTH = 20
        self.BATCH_SIZE = 16

        # model
        self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.LR = 3e-5
        self.EPOCHS = 1

config = Config()

## Create a T5 specific dataset

In [15]:
class T5Dataset(Dataset):
    def __init__(self, df, set_type=None):
        super(T5Dataset, self).__init__()

        self.texts = df["text"].values
        self.set_type = set_type
        if self.set_type != 'test':
            self.labels = np.asarray(["positive" if i==1 else "negative" for i in df["label"].values])

        self.tokenizer = config.TOKENIZER
        self.src_max_length = config.SRC_MAX_LENGTH
        self.tgt_max_length = config.TGT_MAX_LENGTH

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        src_tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            max_length=self.src_max_length,
            pad_to_max_length=True,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        src_input_ids = src_tokenized['input_ids'].squeeze()
        src_attention_mask = src_tokenized['attention_mask'].squeeze()

        if self.set_type != 'test':
            tgt_tokenized = self.tokenizer(
                self.labels[index],
                max_length=self.tgt_max_length,
                pad_to_max_length=True,
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=False,
                return_tensors='pt'
            )
            tgt_input_ids = tgt_tokenized['input_ids'].squeeze()
            tgt_attention_mask = tgt_tokenized['attention_mask'].squeeze()

            return {
                'src_input_ids': src_input_ids.long(),
                'src_attention_mask': src_attention_mask.long(),
                'tgt_input_ids': tgt_input_ids.long(),
                'tgt_attention_mask': tgt_attention_mask.long()
            }

        return {
            'src_input_ids': src_input_ids.long(),
            'src_attention_mask': src_attention_mask.long()
        }

## Load the IMDB dataset in terms of the T5 dataset, create a dataloader

In [16]:
train_data = T5Dataset(df_train)
val_data = T5Dataset(df_val)

train_dataloader = DataLoader(train_data, batch_size=config.BATCH_SIZE)
val_dataloader = DataLoader(val_data, batch_size=config.BATCH_SIZE)

b = next(iter(train_dataloader))
for k, v in b.items():
    print(f'{k} shape: {v.shape}')

src_input_ids shape: torch.Size([16, 320])
src_attention_mask shape: torch.Size([16, 320])
tgt_input_ids shape: torch.Size([16, 20])
tgt_attention_mask shape: torch.Size([16, 20])




## Create the model

In [17]:
class T5Model(nn.Module):
    def __init__(self):
        super(T5Model, self).__init__()

        self.t5_model = T5ForConditionalGeneration.from_pretrained(config.MODEL_PATH)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        labels=None
        ):

        return self.t5_model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

## Set the decvice type

In [44]:
device = config.DEVICE
print(device)

cpu


## Load the model

In [26]:
model = T5Model()
model.to(device);

## We want only two neurons in the output for binary classification

In [27]:
model.t5_model.lm_head.out_features = 2

# Finally start training

In [30]:
torch.manual_seed(config.SEED)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=config.LR)
total_loss = 0

for epoch in range(1):
    for step, batch in enumerate(tqdm(train_dataloader)):
        b_src_input_ids = batch['src_input_ids'].to(device)
        b_src_attention_mask = batch['src_attention_mask'].to(device)

        lm_labels = batch['tgt_input_ids'].to(device)
        lm_labels[lm_labels[:, :] == config.TOKENIZER.pad_token_id] = -100

        b_tgt_attention_mask = batch['tgt_attention_mask'].to(device)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=b_src_input_ids, attention_mask=b_src_attention_mask, decoder_attention_mask=b_tgt_attention_mask, labels=lm_labels)
        loss = outputs[0]
        total_loss+= loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()
        print("Iteration: ", step, "\tLoss:", loss.item(),"\n")

    print("\n\nEpoch: ", epoch, "\tAverage Loss:", total_loss/len(train_dataloader), "\n")

  0%|          | 0/1563 [00:00<?, ?it/s]



Seq2SeqLMOutput(loss=tensor(11.1987, grad_fn=<NllLossBackward0>), logits=tensor([[[ -20.9962,  -20.4858,  -22.9923,  ...,  -84.1084,  -83.3525,
           -83.6444],
         [ -45.9004,   -8.2612,  -36.3058,  ..., -103.5070, -102.4597,
          -102.6672],
         [ -45.2228,   -5.3172,  -34.6345,  ..., -103.2310, -102.2147,
          -102.4168],
         ...,
         [ -21.3417,  -20.3774,  -23.3393,  ...,  -84.6165,  -83.8563,
           -84.1425],
         [ -21.3417,  -20.3774,  -23.3393,  ...,  -84.6165,  -83.8563,
           -84.1425],
         [ -21.3479,  -20.3740,  -23.3454,  ...,  -84.6156,  -83.8556,
           -84.1412]],

        [[ -20.4709,  -20.3874,  -22.8477,  ...,  -83.4097,  -82.6592,
           -82.9736],
         [ -42.9222,   -8.2675,  -34.5631,  ...,  -99.3144,  -98.3259,
           -98.5778],
         [ -42.5699,   -5.8535,  -33.2131,  ..., -100.1186,  -99.1231,
           -99.3686],
         ...,
         [ -20.8635,  -20.2808,  -23.2158,  ...,  -84.0317, 