In [1]:
!pip install transformers[sentencepiece]



In [2]:
!pip install datasets



In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [5]:
import numpy as np
import pandas as pd

# DS_DIR = './'
DS_DIR = '/content/gdrive/My Drive/'
label = 'text1'
models = ['google', 'wit', 'deepgram']


def load_data(model):
    df = pd.read_csv(DS_DIR + 'ljs-data.csv')
    train_df, valid_df = np.split(df.sample(frac=1, random_state=42), [int(.8 * len(df))])
    errors_train = train_df['google'].isna().values | train_df['wit'].isna().values | train_df['deepgram'].isna().values
    y_train = train_df[label].values
    X_train = train_df[model].values
    errors_valid = valid_df['google'].isna().values | valid_df['wit'].isna().values | valid_df['deepgram'].isna().values
    y_valid = valid_df[label].values
    X_valid = valid_df[model].values
    return X_train[~errors_train], y_train[~errors_train], X_valid[~errors_valid], y_valid[~errors_valid]

In [6]:
X1_train, y1_train, X1_valid, y1_valid = load_data('google')
X2_train, y2_train, X2_valid, y2_valid = load_data('wit')
X3_train, y3_train, X3_valid, y3_valid = load_data('deepgram')

In [7]:
from datasets import Dataset

df_train = pd.DataFrame({"text1": X1_train, "text2": X2_train, "text3": X3_train, "labels": y1_train})
dataset_train = Dataset.from_pandas(df_train)

df_valid = pd.DataFrame({"text1": X1_valid, "text2": X2_valid, "text3": X3_valid, "labels": y1_valid})
dataset_valid = Dataset.from_pandas(df_valid)

In [8]:
prefix = "merge sentence1: "
infix1 = " sentence2: "
infix2 = " sentence3: "

def preprocess_function(examples):
    inputs = [prefix + examples["text1"][i] + infix1 + examples["text2"][i] + infix2 + examples["text3"][i] for i in range(len(examples["text1"]))]
    model_inputs = tokenizer(inputs, padding="max_length", max_length=480, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['labels'], padding="max_length", max_length=160, truncation=True)

    # model_inputs["input_ids"] = model_inputs["input_ids"].squeeze(0)
    # model_inputs["attention_mask"] = model_inputs["attention_mask"].squeeze(0)
    model_inputs["labels"] = labels["input_ids"] #.squeeze(0)

    return model_inputs

tokenized_datasets_train = dataset_train.map(preprocess_function, batched=True)
tokenized_datasets_valid = dataset_valid.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [9]:
tokenized_datasets_train = tokenized_datasets_train.remove_columns(["text1", "text2", "text3"])
tokenized_datasets_valid = tokenized_datasets_valid.remove_columns(["text1", "text2", "text3"])
# tokenized_datasets.set_format("torch")
tokenized_datasets_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10477
})

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-google-wit-deepgram",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    fp16=True,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using amp half precision backend


In [13]:
trainer.train()

***** Running training *****
  Num examples = 10477
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 65500


Epoch,Training Loss,Validation Loss
1,0.6007,0.078829
2,0.0954,0.067553
3,0.0818,0.062423
4,0.0717,0.059222
5,0.0688,0.056562
6,0.066,0.054218
7,0.0614,0.053087
8,0.0593,0.051301
9,0.0575,0.050225
10,0.0547,0.049545


Saving model checkpoint to ./results-google-wit-deepgram/checkpoint-500
Configuration saved in ./results-google-wit-deepgram/checkpoint-500/config.json
Model weights saved in ./results-google-wit-deepgram/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results-google-wit-deepgram/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results-google-wit-deepgram/checkpoint-500/special_tokens_map.json
Copy vocab file to ./results-google-wit-deepgram/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 16
Saving model checkpoint to ./results-google-wit-deepgram/checkpoint-1000
Configuration saved in ./results-google-wit-deepgram/checkpoint-1000/config.json
Model weights saved in ./results-google-wit-deepgram/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results-google-wit-deepgram/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results-google-wit-deepgram/checkpoint-1000/spe

TrainOutput(global_step=65500, training_loss=0.03787209140981427, metrics={'train_runtime': 24955.0401, 'train_samples_per_second': 41.984, 'train_steps_per_second': 2.625, 'total_flos': 1.32935255064576e+17, 'train_loss': 0.03787209140981427, 'epoch': 100.0})

In [14]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

results = []

for batch in tokenized_datasets_valid:
  y_pred = model.generate(torch.tensor([batch['input_ids']]).to(device))

  for i in range(len(y_pred)):
    results.append({
      'input': tokenizer.decode(batch['input_ids'], skip_special_tokens=True),
      'output': tokenizer.decode(y_pred[i], skip_special_tokens=True),
      'label': tokenizer.decode(batch['labels'], skip_special_tokens=True),
    })

results_df = pd.DataFrame(results)
results_df.to_csv(DS_DIR + 'google-wit-deepgram-results.csv')
