In [None]:
# Install sentencepiece, transformers and rich (with its jupyter extra) into the runtime.
# NOTE(review): versions are not pinned, so a later re-run may pull different releases.
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14

In [None]:
# Mount Google Drive under /content/drive/ so files stored in MyDrive can be read below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd

In [None]:
# Load the prepared question/answer CSV from the mounted Drive.
# The file is decoded as ISO-8859-1 and its first row (header=0) supplies the column names.
# NOTE(review): the "Unnamed: 0" column shown in the preview suggests the CSV was
# written together with its row index — confirm and re-export with index=False if so.
df = pd.read_csv('/content/drive/MyDrive/prepared.csv', sep = ',',encoding = "ISO-8859-1", header= 0)
df.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,Do you ever have a hard time falling asleep?,Sometimes
1,1,If you could have a conversation with any per...,Kungfu Panda
2,2,What is your favorite thing to do in school?,Reading
3,3,!#$#% (swear words),Those are bad words and you should not say them
4,4,Are cats girls?,Cats can be female or male.


In [None]:
# Display the identifier used to name the exported CSV.
# NOTE(review): `key` is not assigned in any visible cell; it only exists as leftover
# kernel state, so this cell fails on a fresh kernel until `key` is defined explicitly.
str(key)


'16fcf882-4e8a-4e5f-9f67-fb731b4842c9'

In [None]:
# Save a copy of the frame as "<key>.csv" in the working directory.
# index=False keeps the row index out of the file; writing it is what makes an
# "Unnamed: 0" column appear when the CSV is read back with pd.read_csv.
# NOTE(review): `key` is not assigned in any visible cell — define it before running this.
df.to_csv(str(key)+'.csv', index=False)

In [None]:
# Build a frame holding only the two text columns, then rebind df to it
# so everything downstream works on Question/Answer alone.
dd = pd.DataFrame({'Question': df['Question'], 'Answer': df['Answer']})
df = dd
df.head()

Unnamed: 0,Question,Answer
0,Do you ever have a hard time falling asleep?,Sometimes
1,If you could have a conversation with any per...,Kungfu Panda
2,What is your favorite thing to do in school?,Reading
3,!#$#% (swear words),Those are bad words and you should not say them
4,Are cats girls?,Cats can be female or male.


In [None]:
# Importing libraries
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

# Shared rich console used for logging throughout the notebook.
# record=True keeps a transcript of what is printed so it can be exported later
# (T5Trainer calls console.save_text to write it to logs.txt).
console=Console(record=True)

def display_df(df):
  """Print the first two columns of a dataframe as an ASCII table.

  Args:
    df: pandas DataFrame; column 0 is shown under "source_text" and
      column 1 under "target_text". Further columns are ignored.

  Every cell is converted with str() before it is added to the table,
  because rich only accepts renderables such as strings; a NaN, None or
  numeric cell would otherwise make add_row fail.
  """

  # A fresh, non-recording console is created here, so the sample table is
  # printed but not captured in the module-level console's transcript.
  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for row in df.values.tolist():
    table.add_row(str(row[0]), str(row[1]))

  console.print(table)

# Table that collects one row (epoch, step, loss) each time train() logs progress.
training_logger = Table(
    *(Column(header, justify="center") for header in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    box=box.ASCII,
    pad_edge=False,
)


In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
from torch import cuda
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
class YourDataSetClass(Dataset):
  """
  Torch dataset that turns the rows of a dataframe into tokenized
  source/target tensors for fine-tuning.

  Args:
    dataframe: table holding the text columns. Rows are looked up by label
      (``column[index]``), so its index must run 0..n-1 (T5Trainer resets
      the index before building the dataset).
    tokenizer: tokenizer exposing ``batch_encode_plus``.
    source_len: length every source sequence is padded/truncated to.
    target_len: length every target sequence is padded/truncated to.
    source_text: name of the input column.
    target_text: name of the output column.

  Each item is a dict of 1-D ``torch.long`` tensors:
    source_ids, source_mask: token ids and attention mask of the source text.
    target_ids: token ids of the target text.
    target_ids_y: the same values as target_ids (kept so existing consumers
      of this key keep working).
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one text, padded/truncated to exactly max_length tokens."""
    # padding="max_length" already requests fixed-length padding; the deprecated
    # pad_to_max_length flag that used to accompany it was redundant.
    return self.tokenizer.batch_encode_plus([text], max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')

  def __getitem__(self, index):
    # str() guards against non-string cells; split/join collapses runs of whitespace
    source_text = ' '.join(str(self.source_text[index]).split())
    target_text = ' '.join(str(self.target_text[index]).split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # squeeze(0) drops only the batch axis added by batch_encode_plus; a bare
    # squeeze() would also collapse a length-1 sequence into a 0-d tensor.
    source_ids = source['input_ids'].squeeze(0)
    source_mask = source['attention_mask'].squeeze(0)
    target_ids = target['input_ids'].squeeze(0)

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Run one training epoch over `loader`, updating `model` in place.

  Args:
    epoch: epoch number, used only for the progress table.
    tokenizer: tokenizer whose pad_token_id marks padding in the targets.
    model: seq2seq model called with input_ids / attention_mask / labels;
      the loss is read from the first element of its output.
    device: device the batch tensors are moved to.
    loader: iterable of batches with 'source_ids', 'source_mask', 'target_ids'.
    optimizer: optimizer stepped once per batch.

  Every 10th step a row is appended to the module-level `training_logger`
  table and the table is printed on the module-level `console`.
  """

  model.train()
  for step, data in enumerate(loader):
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    # The labels are the complete target sequence, with padding replaced by -100
    # so those positions are ignored by the loss. T5ForConditionalGeneration
    # derives decoder_input_ids itself by shifting the labels right behind the
    # decoder start token. Slicing the targets by hand (y[:, :-1] / y[:, 1:])
    # meant the first target token was never predicted and the start token was
    # never seen during training, although generate() begins from it.
    # clone() keeps the batch tensor from being modified in place.
    labels = data['target_ids'].to(device, dtype = torch.long).clone()
    labels[labels == tokenizer.pad_token_id] = -100

    outputs = model(input_ids = ids, attention_mask = mask, labels = labels)
    loss = outputs[0]

    if step % 10 == 0:
      # log the scalar value rather than the tensor repr
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):

  """
  Generate text for every batch in `loader` and decode it alongside the
  reference targets.

  Returns a pair (predictions, actuals): two lists of decoded strings, one
  entry per example, in loader order. `epoch` is accepted but not used.
  """

  def _to_text(token_rows):
    # decode each row of token ids into clean text
    return [tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True) for row in token_rows]

  model.eval()
  predictions, actuals = [], []
  with torch.no_grad():
    for step, batch in enumerate(loader):
      reference_ids = batch['target_ids'].to(device, dtype = torch.long)
      input_ids = batch['source_ids'].to(device, dtype = torch.long)
      attention = batch['source_mask'].to(device, dtype = torch.long)

      generated_ids = model.generate(input_ids = input_ids, attention_mask = attention, max_length=150)

      decoded_preds = _to_text(generated_ids)
      decoded_refs = _to_text(reference_ids)

      # progress message on every 10th batch
      if step % 10 == 0:
        console.print(f'Completed {step}')

      predictions += decoded_preds
      actuals += decoded_refs
  return predictions, actuals

In [None]:
def preprocess(message):
  """
  Tokenize one message (or a list of messages) for inference.

  Args:
    message: a single string or a list/tuple of strings.

  Returns:
    dict with 'source_ids' and 'source_mask' as torch.long tensors. For a
    single message the leading batch axis is removed (1-D tensors); for
    several messages the tensors are (n_messages, seq_len), padded to the
    longest message.

  Relies on a module-level `tokenizer`.
  NOTE(review): no visible cell assigns `tokenizer` at notebook scope before
  this definition — make sure one exists before calling.
  """
  # batch_encode_plus expects a list of texts; a bare string would be
  # iterated character by character and encoded as many one-letter texts.
  texts = [message] if isinstance(message, str) else list(message)
  source = tokenizer.batch_encode_plus(texts,
                                       max_length= 200,
                                       truncation=True,
                                       padding=True,  # needed to stack messages of different lengths
                                       return_tensors='pt')
  # squeeze(0) only removes the batch axis of a single message; a bare
  # squeeze() would also collapse a one-token sequence to a 0-d tensor.
  source_ids = source['input_ids'].squeeze(0)
  source_mask = source['attention_mask'].squeeze(0)
  
  return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long)
    }

In [None]:
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):
  
  """
  Fine-tune a T5 checkpoint on (source_text -> target_text) pairs, then
  generate predictions for a held-out split.

  Args:
    dataframe: pandas DataFrame containing at least the two text columns.
    source_text: name of the column fed to the model.
    target_text: name of the column the model learns to generate.
    model_params: dict with the keys "MODEL", "TRAIN_BATCH_SIZE",
      "VALID_BATCH_SIZE", "TRAIN_EPOCHS", "VAL_EPOCHS", "LEARNING_RATE",
      "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH" and "SEED".
    output_dir: directory that receives model_files/, predictions.csv and logs.txt.

  Returns:
    The fine-tuned model, located on the module-level `device`.

  Files written:
    <output_dir>/model_files/  model and tokenizer (reloadable with from_pretrained)
    <output_dir>/predictions.csv  generated vs. actual text for the held-out rows
    <output_dir>/logs.txt  transcript of the module-level `console`
    ./model.pth and ./final_model.pth/  legacy copies in the working directory
  """

  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # Make sure the output directory exists before anything is written into it
  os.makedirs(output_dir, exist_ok=True)

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenizer for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Load the pretrained checkpoint named in model_params["MODEL"] and move it
  # to the selected device.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)
  
  # logging
  console.log(f"[Data]: Reading data...\n")

  # Keep only the two columns that are used, source first
  dataframe = dataframe[[source_text,target_text]]
  display_df(dataframe.head(2))

  
  # Creation of Dataset and Dataloader
  # 80% of the rows (sampled with the configured seed) are used for training,
  # the remaining rows for validation. Both indexes are reset to 0..n-1
  # because the dataset class looks rows up by label.
  train_size = 0.8
  train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
  val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
  train_dataset = train_dataset.reset_index(drop=True)

  console.print(f"FULL Dataset: {dataframe.shape}")
  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset: {val_dataset.shape}\n")


  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)


  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }

  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

  # Dataloaders for the training and validation stages
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **val_params)


  # Defining the optimizer that will be used to tune the weights of the network in the training session. 
  optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


  # Training loop
  console.log(f'[Initiating Fine Tuning]...\n')

  for epoch in range(model_params["TRAIN_EPOCHS"]):
     train(epoch, tokenizer, model, device, training_loader, optimizer)
      
  console.log(f"[Saving Model]...\n")
  # Save model and tokenizer side by side in <output_dir>/model_files so the
  # folder matches the "[Model] Model saved @" message below; previously only
  # the tokenizer was written there.
  path = os.path.join(output_dir, "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)
  # Legacy copies in the working directory, kept so anything that already
  # reads these locations continues to work.
  torch.save(model.state_dict(), 'model.pth')
  model.save_pretrained('final_model.pth')


  # evaluating test dataset
  console.log(f"[Initiating Validation]...\n")
  predictions_path = os.path.join(output_dir,'predictions.csv')
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    # each validation pass overwrites the file written by the previous one
    final_df.to_csv(predictions_path)
  
  console.log(f"[Validation Completed.]\n")

  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
  console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

  # Export the transcript last so the closing messages above are part of logs.txt
  console.save_text(os.path.join(output_dir,'logs.txt'))

  return model

In [None]:
# Hyper-parameters consumed by T5Trainer.
# The two MAX_*_TEXT_LENGTH values are passed to the tokenizer as max_length, and the
# dataset pads with padding="max_length", so every example is padded to that many tokens.
model_params={
    "MODEL":"t5-small",             # pretrained checkpoint to load (e.g. t5-small, t5-base, t5-large)
    "TRAIN_BATCH_SIZE":8,          # training batch size
    "VALID_BATCH_SIZE":8,          # validation batch size
    "TRAIN_EPOCHS":30,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation passes (each overwrites predictions.csv)
    "LEARNING_RATE":1e-4,          # learning rate given to the Adam optimizer
    "MAX_SOURCE_TEXT_LENGTH":500,  # max length of source text, in tokens
    "MAX_TARGET_TEXT_LENGTH":500,   # max length of target text, in tokens
    "SEED": 42                     # seed for torch, numpy and the train/validation split

}

In [None]:
# Fine-tune on the question/answer frame and keep the returned model as m1.
# NOTE(review): with source_text="Answer" and target_text="Question" the model learns to
# produce the question from the answer, while the chat cells below feed it questions —
# confirm the two columns are not swapped.
m1 =T5Trainer(dataframe=df, source_text="Answer", target_text="Question", model_params=model_params, output_dir="outputs")

In [None]:
#from transformers import DistilBertConfig, DistilBertModel
#model = DistilBertModel.from_pretrained('./outputs')

In [None]:
#m1.load_state_dict('./outputs')

In [None]:
#torch.save(m1.state_dict(), '/outputs')

In [None]:
# Restore weights from a saved state dict into `model`.
# NOTE(review): 'load/from/path/model.pth' is a placeholder path, and `model` is only
# assigned in the following cell, so this fails with NameError when the notebook is run
# top to bottom on a fresh kernel.
model.load_state_dict(torch.load('load/from/path/model.pth'))

In [None]:
# Load the pretrained checkpoint named in model_params["MODEL"] into `model`.
# NOTE(review): the interactive cells below call m1 (the model returned by T5Trainer),
# not `model`.
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])

In [None]:
# Interactive chat with the fine-tuned model: type a line, get the generated text.
# Stop with a kernel interrupt (Ctrl-C) or end-of-input.

# The tokenizer does not change between turns, so it is loaded once up front.
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
m1.eval()  # inference mode (disables dropout)

while True:
  try:
    text = input("")
  except (EOFError, KeyboardInterrupt):
    # leave the loop quietly instead of surfacing a traceback
    break
  input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
  with torch.no_grad():  # no gradients are needed for generation
    here= m1.generate(input_ids)
  output = tokenizer.batch_decode(here, skip_special_tokens=True)
  print(output)

hi


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


['Hi']
what is your name
['What is your name?']
teddy
['teddy bears teddy bears teddy bears']
favorite club
['Favorit club?']
color
['Color: Color: Color: Color: Color: Color: Color: Color: Color: Color']
ood
['ood ood ood ood oo']
mood
['Stimmungsschwierigkeiten – ob – ob –']
are cats girls?
['Sind girls girls?']
what
['Was']
name
['Nom du nom du nom du nom du nom du nom du nom du nom du nom du']
what is your name
['What is your name?']
reading
['Lesen Sie in den folgenden Bereichen z.B. z.B.']


KeyboardInterrupt: ignored

In [None]:
def predict(tokenizer, m1, device, loader):
  """
  Generate text for every batch produced by `loader`.

  Args:
    tokenizer: tokenizer used to decode the generated ids.
    m1: model exposing eval() and generate().
    device: device the batch tensors are moved to.
    loader: iterable of batches, each a mapping with 'source_ids' and 'source_mask'.

  Returns:
    List of decoded strings for all batches, in loader order (empty list
    for an empty loader).
  """
  m1.eval()
  preds = []
  with torch.no_grad():
    # iterate over the batches themselves; enumerate() would yield
    # (index, batch) tuples, which cannot be indexed by key
    for data in loader:
      ids = data['source_ids'].to(device, dtype = torch.long)
      mask = data['source_mask'].to(device, dtype = torch.long)
      generated_ids = m1.generate(
          input_ids = ids,
          attention_mask = mask, 
          max_length=150
          )
      # accumulate across batches instead of keeping only the last one
      preds.extend(tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids)

  return preds

In [None]:
# Interactive chat with the fine-tuned model, using explicit attention masks.
# This replaces a draft that referenced names never defined in the notebook
# (YourDataSetClass2, source_text, target_text, prediction) and kept only the
# first character of each generated string.
# Stop with a kernel interrupt (Ctrl-C) or end-of-input.

tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])  # loaded once, not per turn
m1.eval()

while True:
  try:
    message = input("")
  except (EOFError, KeyboardInterrupt):
    break
  # encode the single message as a batch of one
  data = tokenizer.batch_encode_plus([message], max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], truncation=True, return_tensors='pt')
  with torch.no_grad():
    ids = data['input_ids'].to(device)
    masks = data['attention_mask'].to(device)
    generated_ids = m1.generate(input_ids = ids, attention_mask = masks, max_length=150)
  prediction = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
  print(prediction)


In [None]:
def preprocess1(message, tokenizer=None):
  """
  Tokenize one message (or a list of messages) into fixed-length model inputs.

  Args:
    message: a single string or a list/tuple of strings.
    tokenizer: optional tokenizer to reuse. When omitted, the tokenizer for
      model_params["MODEL"] is loaded, as before.

  Returns:
    Tuple (source_ids, source_mask) of torch.long tensors shaped
    (n_messages, 200). The batch axis is kept because generate() expects
    batched input. An ordered tuple is returned so that
    `ids, mask = preprocess1(...)` always unpacks in this order; the set
    used previously has no defined iteration order.
  """
  if tokenizer is None:
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # batch_encode_plus expects a list of texts; a bare string would be
  # iterated character by character.
  texts = [message] if isinstance(message, str) else list(message)
  # padding="max_length" already pads to max_length; the deprecated
  # pad_to_max_length flag was redundant.
  source = tokenizer.batch_encode_plus(texts,
                                       max_length= 200,
                                       truncation=True,
                                       padding="max_length",
                                       return_tensors='pt')
  source_ids = source['input_ids']
  source_mask = source['attention_mask']
  
  return (
        source_ids.to(dtype=torch.long), 
        source_mask.to(dtype=torch.long)
    )



In [None]:
def predict1(tokenizer, m1, device, ids, mask):
  """
  Generate and decode text for already-tokenized input.

  Args:
    tokenizer: tokenizer used to decode the generated ids.
    m1: model exposing eval() and generate().
    device: device the tensors are moved to.
    ids: input token ids, either (batch, seq_len) or a single 1-D sequence.
    mask: attention mask with the same shape as `ids`.

  Returns:
    List with one decoded string per input sequence.
  """

  m1.eval()  # inference mode (disables dropout)
  with torch.no_grad():
    ids = ids.to(device, dtype = torch.long)
    mask = mask.to(device, dtype = torch.long)
    # generate() works on batches; promote a single un-batched sequence
    # to a batch of one
    if ids.dim() == 1:
      ids = ids.unsqueeze(0)
    if mask.dim() == 1:
      mask = mask.unsqueeze(0)
    generated_ids = m1.generate(
        input_ids = ids,
        attention_mask = mask, 
        max_length=150
        )
    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

  return preds

In [None]:
# Interactive chat built on preprocess1 / predict1.
# Stop with a kernel interrupt (Ctrl-C) or end-of-input.

# The tokenizer does not change between turns, so it is loaded once up front.
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

while True:
  try:
    message = input("")
  except (EOFError, KeyboardInterrupt):
    # leave the loop quietly instead of surfacing a traceback
    break
  response_id, mask = preprocess1(message)
  ints = predict1(tokenizer, m1, device, response_id, mask)
  print(ints)
