<a href="https://colab.research.google.com/github/Youssef-Rachad/RL-Speech-Disfluency/blob/main/BUTWHYCHATGIUSEPPE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install ml_things transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import io, os, torch, pandas
from datetime import datetime
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

# --- Run configuration ---------------------------------------------------
# Fix the random seed through transformers' `set_seed` helper.
set_seed(359)

# Number of full passes over the training data, and examples per batch.
epochs = 4
batch_size = 32

# Upper bound on tokens per transcript; passed to the collator, which hands it
# to the tokenizer as `max_length` with truncation enabled.
max_length = 60

# Hugging Face checkpoint used for the config, the tokenizer and the model.
model_name_or_path = 'gpt2'

# Zero-based class ids. The dataset stores `rating - 1`, so these presumably
# correspond to ratings 1..7 in the CSV files - TODO confirm against the data.
labels_ids = list(range(7))
n_labels = len(labels_ids)

In [None]:
class TranscriptDataset(Dataset):
    """Dataset of (transcript text, zero-based rating) pairs.

    Every regular file found in ``path`` is read as UTF-8 and passed through
    ``fix_text``. The video id is taken from the file name as
    ``file_name[-15:-4]`` (the 11 characters before the last 4 - presumably an
    11-character id followed by a 4-character extension such as ``.txt``;
    verify against the actual file names). For each row of ``ratings_file``
    whose ``video_id`` equals that id, one example is added with label
    ``Rating - 1``; a transcript with several ratings therefore appears
    several times, and one with no ratings is not included at all.

    Args:
        path: Directory containing the transcript files.
        ratings_file: CSV file with at least ``video_id`` and ``Rating`` columns.
        use_tokenizer: Accepted for interface compatibility; not used here
            (tokenization happens in the collator).

    Raises:
        ValueError: If ``path`` is not an existing directory.
    """

    def __init__(self, path, ratings_file, use_tokenizer):
        if not os.path.isdir(path):
            raise ValueError("Woopeedoopee, "+path+" is poopoo")

        ratings = pandas.read_csv(ratings_file)
        self.texts = []
        self.labels = []

        for file_name in tqdm(os.listdir(path), desc="transcript file"):
            file_path = os.path.join(path, file_name)
            # Skip sub-directories (e.g. `.ipynb_checkpoints`) instead of
            # crashing when trying to open them as text files.
            if not os.path.isfile(file_path):
                continue
            # Context manager guarantees the handle is closed after reading.
            with io.open(file_path, mode='r', encoding='utf-8') as transcript_file:
                content = fix_text(transcript_file.read())
            video_id = file_name[-15: -4]
            for rating in ratings[ratings['video_id'] == video_id]['Rating']:
                self.texts.append(content)
                # Shift ratings down by one so labels start at 0.
                self.labels.append(rating - 1)

        self.n_examples = len(self.labels)

    def __len__(self):
        """Return the number of (transcript, rating) examples."""
        return self.n_examples

    def __getitem__(self, item):
        """Return the example at index ``item`` as a dict with keys
        ``'transcript'`` (text) and ``'rating'`` (zero-based label)."""
        return {
                'transcript': self.texts[item],
                'rating': self.labels[item]
                }

class GPT2ClassificationCollator(object):
    """Collate dataset examples into a tokenized batch with labels.

    Args:
        use_tokenizer: Callable tokenizer; must expose ``model_max_length``
            when ``max_sequence_len`` is not given.
        labels_encoder: Stored on the instance as ``labels_encoder``; it is
            not consulted when building a batch.
        max_sequence_len: Maximum number of tokens per text. Defaults to the
            tokenizer's ``model_max_length``.
    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        """Tokenize a list of examples and attach their labels.

        Args:
            sequences: Iterable of dicts with keys ``'transcript'`` and
                ``'rating'``.

        Returns:
            The tokenizer's output, updated in place with a ``'labels'``
            tensor built from the ratings exactly as received (no shift is
            applied here).
        """
        texts = [sequence['transcript'] for sequence in sequences]
        labels = [sequence['rating'] for sequence in sequences]
        # Pad to the longest text in the batch, truncating at max_sequence_len.
        inputs = self.use_tokenizer(text=texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})
        return inputs

In [None]:
def train(dataloader, optimizer_, scheduler_, device_):
    """Run one training pass over ``dataloader`` using the module-level ``model``.

    For every batch: cast all tensors to ``torch.long``, run a forward pass,
    back-propagate the loss, clip the gradient norm to 1.0, then step the
    optimizer and the scheduler.

    Args:
        dataloader: Iterable of batches (dicts of tensors including ``'labels'``).
        optimizer_: Optimizer stepped once per batch.
        scheduler_: Learning-rate scheduler stepped once per batch.
        device_: Device to move each batch to before the forward pass, or
            ``None`` to leave the tensors where they are. The caller is
            responsible for placing ``model`` on the same device.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)`` where the
        first two are flat lists of ints and the last is the mean batch loss.
    """
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.train()
    for batch in tqdm(dataloader, total=len(dataloader)):
        # Record labels as ints (same as `validation`) so they match the
        # long-cast labels the model is actually trained on.
        true_labels += [int(i) for i in batch['labels'].numpy().flatten().tolist()]
        batch = {k: v.type(torch.long) for k, v in batch.items()}
        if device_ is not None:
            batch = {k: v.to(device_) for k, v in batch.items()}
        model.zero_grad()
        outputs = model(**batch)
        # With labels supplied, the first two outputs are the loss and the logits.
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer_.step()
        scheduler_.step()
        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

def validation(dataloader, device_):
    """Evaluate the module-level ``model`` on ``dataloader`` without updating it.

    Puts the model in eval mode and runs every batch under ``torch.no_grad()``.

    Args:
        dataloader: Iterable of batches (dicts of tensors including ``'labels'``).
        device_: Device to move each batch to before the forward pass, or
            ``None`` to leave the tensors where they are. The caller is
            responsible for placing ``model`` on the same device.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)`` where the
        first two are flat lists of ints and the last is the mean batch loss.
    """
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.eval()
    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += [int(i) for i in batch['labels'].numpy().flatten().tolist()]
        batch = {k: v.type(torch.long) for k, v in batch.items()}
        if device_ is not None:
            batch = {k: v.to(device_) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            # With labels supplied, the first two outputs are the loss and the logits.
            loss, logits = outputs[:2]
            total_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            predictions_labels += logits.argmax(axis=1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

In [None]:
# Build everything the training loop needs: model config, tokenizer, model,
# collator, train/validation dataloaders, optimizer, scheduler and the
# per-epoch metric stores.

# Model configuration with a classification head sized for `n_labels` classes.
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Tokenizer matching the checkpoint.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
# Pad on the left.
tokenizer.padding_side = "left"
# Reuse the EOS token as the PAD token (PAD = EOS = 50256).
tokenizer.pad_token = tokenizer.eos_token


# The model itself, built from the checkpoint and the config above.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

# Resize the embedding matrix to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Tell the model which token id is padding (same id as EOS, as in the tokenizer).
model.config.pad_token_id = model.config.eos_token_id

# Collator that tokenizes each batch of transcripts and attaches the labels.
# (The variable name keeps its original spelling because other cells may use it.)
gpt2_classificaiton_collator = GPT2ClassificationCollator(use_tokenizer=tokenizer,
                                                          labels_encoder=labels_ids,
                                                          max_sequence_len=max_length)


print('Dealing with Train...')
# Training split: transcripts from round one with their ratings.
train_dataset = TranscriptDataset(path='./dataset_ratings_one/',
                                  ratings_file='ratings.csv',
                                  use_tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

# Shuffle the training examples every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

print('Dealing with Validation...')
# Validation split: transcripts from round two with their ratings.
valid_dataset = TranscriptDataset(path='./dataset_ratings_two/',
                                  ratings_file='ratings_round_two.csv',
                                  use_tokenizer=tokenizer)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

# Keep validation order fixed.
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `valid_dataloader` with %d batches!'%len(valid_dataloader))

# AdamW here is the class imported from the Hugging Face library (not the
# PyTorch one); the name refers to Adam with decoupled weight decay.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                  )

# Total number of optimizer steps = batches per epoch * number of epochs.
# `len(train_dataloader)` is the number of batches.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

# Per-epoch averages, appended to by the training loop so they can be plotted.
all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

In [None]:
## NOW we run the thing
# Timestamp ("YYYYMMDDHHMMSS") that gives each run its own results folder.
current_datetime = datetime.now()
date_string = current_datetime.strftime("%Y%m%d%H%M%S")

print('Epoch')
# exist_ok=True replaces the separate exists() check before creating the folder.
os.makedirs(f'./results/{date_string}', exist_ok=True)

# The context manager closes the CSV even if training raises part-way through.
with io.open(f'./results/{date_string}/loss_acc.csv', 'w', encoding="utf-8") as results_csv:
    # Header and each row end with a newline so the file is a valid CSV with
    # one line per epoch.
    results_csv.write("train_loss,val_loss,train_acc,valid_acc\n")
    for epoch in tqdm(range(epochs)):
        print()
        print('Training on batches...')
        # One full pass over the training set.
        train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, None)
        train_acc = accuracy_score(train_labels, train_predict)

        # Predictions and loss on the validation set.
        valid_labels, valid_predict, val_loss = validation(valid_dataloader, None)
        val_acc = accuracy_score(valid_labels, valid_predict)

        # Report this epoch's loss and accuracy.
        print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
        results_csv.write(f"{train_loss},{val_loss},{train_acc},{val_acc}\n")
        # Flush so completed epochs are on disk if the run is interrupted.
        results_csv.flush()

        # Keep the values for plotting the learning curves.
        all_loss['train_loss'].append(train_loss)
        all_loss['val_loss'].append(val_loss)
        all_acc['train_acc'].append(train_acc)
        all_acc['val_acc'].append(val_acc)

In [None]:
# Disabled evaluation/plotting code: the whole cell is a single string literal,
# so none of it runs (the cell only evaluates to that string).
# NOTE(review): before re-enabling, the `labels_ids.values()` / `labels_ids.keys()`
# calls need updating - `labels_ids` is defined as a list in the configuration
# cell, not a dict.
'''
# Plot loss curves.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

# Plot accuracy curves.
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

# Get prediction form model on validation data. This is where you should use
# your test data.
true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, None)

# Create the evaluation report.
evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=list(labels_ids.keys()))
# Show the evaluation report.
print(evaluation_report)

# Plot confusion matrix.
plot_confusion_matrix(y_true=true_labels, y_pred=predictions_labels,
                      classes=list(labels_ids.keys()), normalize=True,
                      magnify=0.1,
                      );
'''

In [None]:
# Save the model's state_dict into this run's results folder.
# Relies on `date_string` and on the `results/<date_string>` directory, both
# created by the training-loop cell, so that cell must have run first.
torch.save(model.state_dict(), f'results/{date_string}/model.pth')