In [1]:
!pip install transformers[sentencepiece]

Collecting transformers[sentencepiece]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 10.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 25.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 21.9 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 13.2 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 74.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 75.7 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 63.7 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 74.2 MB/s 
[?25hCollecting async-timeout<5.0,>=4.0.0a3
  Downlo

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [5]:
import numpy as np
import pandas as pd

# DS_DIR = './'
DS_DIR = '/content/gdrive/My Drive/'
label = 'text1'
models = ['google', 'wit', 'deepgram']


def load_data(model):
    df = pd.read_csv(DS_DIR + 'ljs-data.csv')
    train_df, valid_df = np.split(df.sample(frac=1, random_state=42), [int(.8 * len(df))])
    errors_train = train_df['google'].isna().values | train_df['wit'].isna().values | train_df['deepgram'].isna().values
    y_train = train_df[label].values
    X_train = train_df[model].values
    errors_valid = valid_df['google'].isna().values | valid_df['wit'].isna().values | valid_df['deepgram'].isna().values
    y_valid = valid_df[label].values
    X_valid = valid_df[model].values
    return X_train[~errors_train], y_train[~errors_train], X_valid[~errors_valid], y_valid[~errors_valid]

In [6]:
X_train, y_train, X_valid, y_valid = load_data('wit')

In [7]:
from datasets import Dataset

df_train = pd.DataFrame({"text": X_train, "labels": y_train})
dataset_train = Dataset.from_pandas(df_train)

df_valid = pd.DataFrame({"text": X_valid, "labels": y_valid})
dataset_valid = Dataset.from_pandas(df_valid)

In [8]:
prefix = "translate English to English: "

def preprocess_function(examples):
    inputs = [prefix + example for example in examples["text"]]
    model_inputs = tokenizer(inputs, padding="max_length", max_length=160, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['labels'], padding="max_length", max_length=160, truncation=True)

    # model_inputs["input_ids"] = model_inputs["input_ids"].squeeze(0)
    # model_inputs["attention_mask"] = model_inputs["attention_mask"].squeeze(0)
    model_inputs["labels"] = labels["input_ids"] #.squeeze(0)

    return model_inputs

tokenized_datasets_train = dataset_train.map(preprocess_function, batched=True)
tokenized_datasets_valid = dataset_valid.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [9]:
tokenized_datasets_train = tokenized_datasets_train.remove_columns(["text"])
tokenized_datasets_valid = tokenized_datasets_valid.remove_columns(["text"])
# tokenized_datasets.set_format("torch")
tokenized_datasets_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10477
})

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-wit",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    fp16=True,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using amp half precision backend


In [13]:
trainer.train()

***** Running training *****
  Num examples = 10477
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 65500


Epoch,Training Loss,Validation Loss
1,1.3079,0.178698
2,0.1976,0.163671
3,0.1846,0.157567
4,0.1693,0.152894
5,0.1671,0.14935
6,0.1605,0.146862
7,0.1576,0.144547
8,0.1534,0.142358
9,0.15,0.140735
10,0.1472,0.139488


Saving model checkpoint to ./results-wit/checkpoint-500
Configuration saved in ./results-wit/checkpoint-500/config.json
Model weights saved in ./results-wit/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results-wit/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results-wit/checkpoint-500/special_tokens_map.json
Copy vocab file to ./results-wit/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 16
Saving model checkpoint to ./results-wit/checkpoint-1000
Configuration saved in ./results-wit/checkpoint-1000/config.json
Model weights saved in ./results-wit/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results-wit/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results-wit/checkpoint-1000/special_tokens_map.json
Copy vocab file to ./results-wit/checkpoint-1000/spiece.model
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 16
Saving model checkpoin

TrainOutput(global_step=65500, training_loss=0.12470100816333567, metrics={'train_runtime': 16791.2639, 'train_samples_per_second': 62.396, 'train_steps_per_second': 3.901, 'total_flos': 4.4311751688192e+16, 'train_loss': 0.12470100816333567, 'epoch': 100.0})

In [14]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

results = []

for batch in tokenized_datasets_valid:
  y_pred = model.generate(torch.tensor([batch['input_ids']]).to(device))

  for i in range(len(y_pred)):
    results.append({
      'input': tokenizer.decode(batch['input_ids'], skip_special_tokens=True),
      'output': tokenizer.decode(y_pred[i], skip_special_tokens=True),
      'label': tokenizer.decode(batch['labels'], skip_special_tokens=True),
    })

results_df = pd.DataFrame(results)
results_df.to_csv(DS_DIR + 'wit-results.csv')
