In [1]:
!pip install transformers[sentencepiece]

Collecting transformers[sentencepiece]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 20.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.8 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 30.2 MB/s eta 0:00:01[K     |██                              | 20 kB 19.8 MB/s eta 0:00:01[K     |███                             | 30 kB 10.6 MB/s eta 0:00:01[K     |████                            | 40 kB 8.8 MB/s eta 0:00:01[K     |█████                           | 51 kB 6.5 MB/s eta 0:00:01[K     |██████                          | 61 kB 7.7 MB/s eta 0:00:01[K     |███████                         | 71 kB 6.8 MB/s eta 0:00:01[K     |████████                        | 81 kB 7.6 MB/s eta 0:00:01[K     |█████████                       | 92 kB 8.4 MB/s eta 0:00:01[K     |██████████                      | 102 kB 8.0 MB/s eta 0:00:01[K     |███████████                     | 112 kB 8.0 MB/s eta 0:00:01[K     |████████████                    | 122 kB 8.0 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 8.0 MB/s eta 0:00:01

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [5]:
import numpy as np
import pandas as pd

# DS_DIR = './'
DS_DIR = '/content/gdrive/My Drive/'
label = 'text1'
models = ['google', 'wit', 'deepgram']


def load_data(model):
    df = pd.read_csv(DS_DIR + 'ljs-data.csv')
    train_df, valid_df = np.split(df.sample(frac=1, random_state=42), [int(.8 * len(df))])
    errors_train = train_df['google'].isna().values | train_df['wit'].isna().values | train_df['deepgram'].isna().values
    y_train = train_df[label].values
    X_train = train_df[model].values
    errors_valid = valid_df['google'].isna().values | valid_df['wit'].isna().values | valid_df['deepgram'].isna().values
    y_valid = valid_df[label].values
    X_valid = valid_df[model].values
    return X_train[~errors_train], y_train[~errors_train], X_valid[~errors_valid], y_valid[~errors_valid]

In [6]:
X1_train, y1_train, X1_valid, y1_valid = load_data('wit')
X2_train, y2_train, X2_valid, y2_valid = load_data('deepgram')

In [7]:
from datasets import Dataset

df_train = pd.DataFrame({"text1": X1_train, "text2": X2_train, "labels": y1_train})
dataset_train = Dataset.from_pandas(df_train)

df_valid = pd.DataFrame({"text1": X1_valid, "text2": X2_valid, "labels": y1_valid})
dataset_valid = Dataset.from_pandas(df_valid)

In [8]:
prefix = "merge sentence1: "
infix = " sentence2: "

def preprocess_function(examples):
    inputs = [prefix + examples["text1"][i] + infix + examples["text2"][i] for i in range(len(examples["text1"]))]
    model_inputs = tokenizer(inputs, padding="max_length", max_length=320, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['labels'], padding="max_length", max_length=320, truncation=True)

    # model_inputs["input_ids"] = model_inputs["input_ids"].squeeze(0)
    # model_inputs["attention_mask"] = model_inputs["attention_mask"].squeeze(0)
    model_inputs["labels"] = labels["input_ids"] #.squeeze(0)

    return model_inputs

tokenized_datasets_train = dataset_train.map(preprocess_function, batched=True)
tokenized_datasets_valid = dataset_valid.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [9]:
tokenized_datasets_train = tokenized_datasets_train.remove_columns(["text1", "text2"])
tokenized_datasets_valid = tokenized_datasets_valid.remove_columns(["text1", "text2"])
# tokenized_datasets.set_format("torch")
tokenized_datasets_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10477
})

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-wit-deepgram",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    fp16=True,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using amp half precision backend


In [13]:
trainer.train()

***** Running training *****
  Num examples = 10477
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 65500


Epoch,Training Loss,Validation Loss
1,0.6162,0.053364
2,0.0654,0.044188
3,0.0533,0.04054
4,0.0464,0.038401
5,0.0445,0.036611
6,0.0428,0.035173
7,0.04,0.034333
8,0.0384,0.033311
9,0.0372,0.032394
10,0.0356,0.03191


Saving model checkpoint to ./results-wit-deepgram/checkpoint-500
Configuration saved in ./results-wit-deepgram/checkpoint-500/config.json
Model weights saved in ./results-wit-deepgram/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results-wit-deepgram/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results-wit-deepgram/checkpoint-500/special_tokens_map.json
Copy vocab file to ./results-wit-deepgram/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 16
Saving model checkpoint to ./results-wit-deepgram/checkpoint-1000
Configuration saved in ./results-wit-deepgram/checkpoint-1000/config.json
Model weights saved in ./results-wit-deepgram/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results-wit-deepgram/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results-wit-deepgram/checkpoint-1000/special_tokens_map.json
Copy vocab file to ./results-wit-deepgram/checkpoint-100

Epoch,Training Loss,Validation Loss
1,0.6162,0.053364
2,0.0654,0.044188
3,0.0533,0.04054
4,0.0464,0.038401
5,0.0445,0.036611
6,0.0428,0.035173
7,0.04,0.034333
8,0.0384,0.033311
9,0.0372,0.032394
10,0.0356,0.03191


Saving model checkpoint to ./results-wit-deepgram/checkpoint-14000
Configuration saved in ./results-wit-deepgram/checkpoint-14000/config.json
Model weights saved in ./results-wit-deepgram/checkpoint-14000/pytorch_model.bin
tokenizer config file saved in ./results-wit-deepgram/checkpoint-14000/tokenizer_config.json
Special tokens file saved in ./results-wit-deepgram/checkpoint-14000/special_tokens_map.json
Copy vocab file to ./results-wit-deepgram/checkpoint-14000/spiece.model
Deleting older checkpoint [results-wit-deepgram/checkpoint-12500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 16
Saving model checkpoint to ./results-wit-deepgram/checkpoint-14500
Configuration saved in ./results-wit-deepgram/checkpoint-14500/config.json
Model weights saved in ./results-wit-deepgram/checkpoint-14500/pytorch_model.bin
tokenizer config file saved in ./results-wit-deepgram/checkpoint-14500/tokenizer_config.json
Special tokens file saved in ./result

TrainOutput(global_step=65500, training_loss=0.027297321945656348, metrics={'train_runtime': 24357.5038, 'train_samples_per_second': 43.013, 'train_steps_per_second': 2.689, 'total_flos': 8.8623503376384e+16, 'train_loss': 0.027297321945656348, 'epoch': 100.0})

In [14]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

results = []

for batch in tokenized_datasets_valid:
  y_pred = model.generate(torch.tensor([batch['input_ids']]).to(device))

  for i in range(len(y_pred)):
    results.append({
      'input': tokenizer.decode(batch['input_ids'], skip_special_tokens=True),
      'output': tokenizer.decode(y_pred[i], skip_special_tokens=True),
      'label': tokenizer.decode(batch['labels'], skip_special_tokens=True),
    })

results_df = pd.DataFrame(results)
results_df.to_csv(DS_DIR + 'wit-deepgram-results.csv')
