In [None]:
# Copy the combined sarcasm dataset CSV into /content/ so later cells can read it from there.
# NOTE(review): the source path presumably lives on a mounted Google Drive — confirm the mount step has run first.
!cp "/content/drive/MyDrive/CSCI567 Project/clean_combined_sarcasm_dataset.csv" /content/

In [None]:
# Copy chunk 0 of the annotation subset into /content/; the batch-generation cell at the end reads a file with this name.
!cp "/content/drive/MyDrive/CSCI567 Project/100k-data-subset/sarcasm_updated_annotations_chunk_0.csv" /content/

In [None]:
!pip install transformers



In [None]:
from transformers import GPT2LMHeadModel,GPT2TokenizerFast,GPTNeoModel

# Hugging Face Hub id of the checkpoint to fine-tune. The smallest GPT-2 is
# published as 'gpt2' ('gpt2-small' is not a Hub id, so from_pretrained cannot
# resolve it). Other options: 'distilgpt2', 'gpt2-medium'.
MODEL_NAME = 'gpt2'

# NOTE(review): truncation/padding are encode-time options; they are kept here
# as in the original call but likely have no effect on from_pretrained — confirm.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME,truncation=True,padding=True)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
import torch
# uncomment to re-train pre-existing model over more data:
#model=torch.load('/content/drive/MyDrive/CSCI567 Project/Fine_tuning_weights/distil_gpt2_finetunes_3_epochs_data_augmented.pth')

In [None]:
# Display the model's module tree as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50260, bias=False)
)

In [None]:
# Declare the special tokens: a padding token plus two markers.
# Other cells take additional_special_tokens_ids[0] as the marker that opens the
# context and [1] as the separator that opens the generated sarcastic text, and
# the decoding code splits on '<headline>' first and '<nameinfo>' second.
# '<headline>' therefore has to be listed first.
additional_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': ['<headline>', '<nameinfo>'],
}

# Add these special tokens to the vocabulary and resize the model's embeddings to match:
tokenizer.add_special_tokens(additional_tokens)
model.resize_token_embeddings(len(tokenizer))

# Show the full list of special tokens:
print(tokenizer.special_tokens_map)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<headline>', '<nameinfo>']}


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset

class DatasetEncoding(Dataset):
  """Fixed-length encodings of (context, sarcastic question) pairs read from a CSV.

  Every row is turned into three parallel lists of length ``seq_length``:

  * tokens   -- ``[context marker] + context ids + [separator] + response ids + [eos]``,
                right-padded with the pad token id;
  * segments -- the context marker id on the context positions and the
                separator id everywhere else (used as ``token_type_ids``);
  * labels   -- the response ids and the eos id on their own positions, and
                -100 on the context, the separator and the padding.

  Args:
    filename: path of a CSV with 'Context' and 'cleaned_generated_question' columns.
    tokenizer: tokenizer exposing ``encode``, ``additional_special_tokens_ids``
      (index 0 = context marker, index 1 = separator), ``pad_token_id`` and
      ``eos_token_id``.
    seq_length: length of every encoded example.
    max_rows: only the first ``max_rows`` rows of the CSV are encoded
      (``None`` encodes all of them).
  """

  def __init__(self, filename, tokenizer, seq_length=64, max_rows=20000):

    context_token = tokenizer.additional_special_tokens_ids[0]
    sep_token = tokenizer.additional_special_tokens_ids[1]
    pad_token = tokenizer.pad_token_id
    eos_token = tokenizer.eos_token_id # signifies the end of string

    self.examples = []

    df = pd.read_csv(filename)
    if max_rows is not None:
      df = df.head(max_rows)

    for index, row in df.iterrows():

      # Each half gets at most seq_length//2 positions once its marker (and,
      # for the response, the eos token) is counted, so the pair always fits.
      context = [context_token] + tokenizer.encode(
          str(row['Context']), max_length=seq_length//2-1, truncation=True)
      sarc = [sep_token] + tokenizer.encode(
          str(row['cleaned_generated_question']), max_length=seq_length//2-2, truncation=True) + [eos_token]

      n_pad = seq_length - len(context) - len(sarc)

      # Concatenate the two parts and pad on the right:
      tokens = context + sarc + [pad_token] * n_pad

      # Annotate each token with its corresponding segment:
      segments = [context_token] * len(context) + [sep_token] * ( seq_length - len(context) )

      # Only the response tokens (and eos) carry a real label; the context,
      # the separator and the padding are set to -100:
      labels = [-100] * (len(context)+1) + sarc[1:] + [-100] * n_pad

      # Add the preprocessed example to the dataset:
      self.examples.append((tokens, segments, labels))

  def __len__(self):
    """Number of encoded examples."""
    return len(self.examples)

  def __getitem__(self, item):
    """Return example ``item`` as one tensor stacking (tokens, segments, labels)."""
    return torch.tensor(self.examples[item])


# Encode the training CSV into fixed-length (tokens, segments, labels) examples:
slogan_dataset = DatasetEncoding('/content/clean_combined_sarcasm_dataset.csv', tokenizer)

In [None]:
import math, random
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Hold out 10% of the examples for validation. The index list is shuffled with a
# fixed seed, so which examples land in each split is the same on every run.

indices = list(range(len(slogan_dataset)))
random.seed(42)
random.shuffle(indices)

# The first `split` shuffled indices form the validation set, the rest the training set.
split = math.floor(0.1 * len(slogan_dataset))
train_indices, val_indices = indices[split:], indices[:split]

# Each sampler draws only from its own index list, in random order.
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

# Both loaders wrap the same dataset; the samplers decide which examples each one sees.
train_loader = DataLoader(slogan_dataset, batch_size=64, sampler=train_sampler)
val_loader = DataLoader(slogan_dataset, batch_size=64, sampler=val_sampler)

In [None]:
import numpy as np
from tqdm import tqdm


def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu')):
  """Fine-tune ``model`` and print the training and validation cost of every epoch.

  Each batch is indexed as ``batch[:, 0, :]`` (input ids), ``batch[:, 1, :]``
  (token type ids) and ``batch[:, 2, :]`` (labels), i.e. it is expected to be a
  3-D tensor stacking those three sequences per example.

  Args:
    model: module called as ``model(input_ids, token_type_ids=..., labels=...)``
      whose first output is the loss.
    optimizer: optimizer that updates the model's parameters.
    train_dl: iterable of training batches.
    val_dl: iterable of validation batches.
    epochs: number of passes over ``train_dl``.
    device: device every batch is moved to before the forward pass.
  """

  def _weighted_mean(losses, nums):
    # Average the per-batch losses weighted by batch size; an empty loader
    # yields NaN instead of a division by zero.
    total = sum(nums)
    if total == 0:
      return float('nan')
    return np.sum(np.multiply(losses, nums)) / total

  for i in range(epochs):

    print('--- Starting epoch #{} ---\n\n'.format(i))

    model.train()

    # Drop any gradients already sitting on the parameters (e.g. from a run
    # interrupted between backward() and the reset) so they cannot leak into
    # the first update of this epoch.
    model.zero_grad()

    losses = []
    nums = []

    for xb in tqdm(train_dl, desc="Training"):
      inputs = xb.to(device)

      outputs = model(inputs[:,0,:], token_type_ids=inputs[:,1,:], labels=inputs[:,2,:])

      loss = outputs[0]
      losses.append(loss.item())
      nums.append(len(xb))

      loss.backward()

      optimizer.step()
      model.zero_grad()

    # compute average cost over one epoch:
    train_cost = _weighted_mean(losses, nums)


    # validation:
    model.eval()

    with torch.no_grad():
      losses = []
      nums = []

      for xb in tqdm(val_dl, desc="Validation"):
        inputs = xb.to(device)
        outputs = model(inputs[:,0,:], token_type_ids=inputs[:,1,:], labels=inputs[:,2,:])
        losses.append(outputs[0].item())
        nums.append(len(xb))

    val_cost = _weighted_mean(losses, nums)

    print('\n--- Epoch #{} finished --- Training cost: {} \n Validation cost: {}'.format(i, train_cost, val_cost))


In [None]:
from torch.optim import AdamW

# Move the model to the GPU:
device = torch.device('cuda')
model.to(device)

# Create the optimizer over all model parameters with AdamW's default settings
# (the number of epochs is chosen in the fit(...) call, not here):
optimizer = AdamW(model.parameters())


In [None]:
# Run fine-tuning for 4 epochs; batches are moved to `device` inside fit().
fit(model, optimizer, train_loader, val_loader, epochs=4, device=device)

--- Starting epoch #0 ---




Training: 100%|██████████| 282/282 [03:23<00:00,  1.38it/s]
Validation: 100%|██████████| 32/32 [00:07<00:00,  4.24it/s]



--- Epoch #0 finished --- Training cost: 1.4392252931594849 
 Validation cost: 2.8659482707977295
--- Starting epoch #1 ---




Training: 100%|██████████| 282/282 [03:23<00:00,  1.39it/s]
Validation: 100%|██████████| 32/32 [00:07<00:00,  4.23it/s]



--- Epoch #1 finished --- Training cost: 1.154104036755032 
 Validation cost: 3.090859104156494
--- Starting epoch #2 ---




Training: 100%|██████████| 282/282 [03:23<00:00,  1.39it/s]
Validation: 100%|██████████| 32/32 [00:07<00:00,  4.22it/s]



--- Epoch #2 finished --- Training cost: 0.9377226757473416 
 Validation cost: 3.4111530590057373
--- Starting epoch #3 ---




Training: 100%|██████████| 282/282 [03:23<00:00,  1.39it/s]
Validation: 100%|██████████| 32/32 [00:07<00:00,  4.21it/s]


--- Epoch #3 finished --- Training cost: 0.7678716799418132 
 Validation cost: 3.6813233051300047





In [None]:
# Save the fine-tuned model. Passing the model object to torch.save stores the
# whole module, not only a state_dict.
# NOTE(review): the file name says "7_epochs" while the fit(...) call in this notebook uses epochs=4 — confirm the intended name.
torch.save(model,'/content/drive/MyDrive/CSCI567 Project/Fine_tuning_weights/gpt2_finetunes_7_epochs_data_augmented.pth')

In [None]:
# Sampling functions with top k and top p from HuggingFace:

import torch.nn.functional as F
from tqdm import trange
from tqdm import tqdm

def top_k_top_p_ordering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        The filtering runs along the last dimension, so both a single
        distribution (vocabulary size) and a batch (batch size x vocabulary
        size) are accepted. ``logits`` is modified in place and also returned.

        Args:
            logits: logits whose last dimension is the vocabulary
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            filter_value: value written over every filtered-out logit.
        Returns:
            The same ``logits`` tensor with the filtered entries set to ``filter_value``.
        Snippet adapted from: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check

    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability greater than the top_p threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Scatter the mask back to the original token order. dim=-1 (rather
        # than dim=1) keeps this valid for 1-D logits as well as batches.
        indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


# snippet from HuggingFace, adapted to work for contextual separation:
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,):
    """Sample ``length`` new tokens after ``context``, one token per step.

    Args:
        model: language model called as ``model(input_ids=..., token_type_ids=...)``
            whose first output holds the logits for every position.
        length: number of tokens appended to the prompt.
        context: list of prompt token ids.
        segments_tokens: optional list with one segment id per position; its
            prefix of the current sequence length is passed as
            ``token_type_ids``. If it is shorter than the running sequence,
            its last id is repeated.
        num_samples: number of sequences sampled in parallel from the same prompt.
        temperature: logits are divided by it; 0 selects greedy decoding.
        top_k, top_p: forwarded to ``top_k_top_p_ordering``.
        repetition_penalty: values > 1 make tokens already present in a
            sequence less likely (CTRL, https://arxiv.org/abs/1909.05858).

    Returns:
        Tensor with ``num_samples`` rows holding the prompt followed by the
        ``length`` sampled token ids.
    """
    # Follow the model's device instead of assuming CUDA, so the same code
    # runs wherever the model currently lives.
    device = next(model.parameters()).device

    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    with torch.no_grad():
        for _ in tqdm(range(length)):
            cur_len = generated.shape[1]

            inputs = {'input_ids': generated}
            if segments_tokens is not None and len(segments_tokens) > 0:
                step_segments = list(segments_tokens[:cur_len])
                # A segment list shorter than the running sequence would give
                # token_type_ids a different length than input_ids.
                step_segments += [step_segments[-1]] * (cur_len - len(step_segments))
                inputs['token_type_ids'] = torch.tensor(step_segments, dtype=torch.long, device=device).unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # Repetition penalty: positive logits are divided and negative
            # logits multiplied, so the penalty always lowers the score
            # (dividing a negative logit would make the token more likely).
            if repetition_penalty != 1.0:
                seen_scores = torch.gather(next_token_logits, 1, generated)
                seen_scores = torch.where(seen_scores < 0, seen_scores * repetition_penalty, seen_scores / repetition_penalty)
                next_token_logits.scatter_(1, generated, seen_scores)

            filtered_logits = top_k_top_p_ordering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


In [None]:
import torch

# model=torch.load('gpt2_finetunes_7_epochs_data_augmented.pth') #un-comment if you wish to load the weights

context = "Many people would do just fine with a $200 chromebook"
context_tkn = tokenizer.additional_special_tokens_ids[0]
sep_tkn = tokenizer.additional_special_tokens_ids[1]

# Number of tokens sampled after the prompt, and number of samples drawn:
gen_length = 100
num_samples = 5

input_ids = [context_tkn] + tokenizer.encode(context)

# One segment id per position of the longest sequence fed to the model
# (prompt + separator + generated tokens): context positions carry the context
# marker id, everything from the separator on carries the separator id.
# Sizing it from the prompt keeps long contexts from outgrowing a fixed buffer.
segments = [sep_tkn] * (len(input_ids) + 1 + gen_length)
segments[:len(input_ids)] = [context_tkn] * len(input_ids)

input_ids += [sep_tkn]

# Make sure the model is on the GPU for inference:
model.to(torch.device('cuda'))

# Draw `num_samples` continuations of `gen_length` tokens each:
generated = sample_sequence(model, length=gen_length, context=input_ids, segments_tokens=segments, num_samples=num_samples)

print("\nGiven the context:",context)
print('\n--- Generated Sarcastic statements ---\n')

for g in generated:
  sarc = tokenizer.decode(g.squeeze().tolist())
  # Keep the text before the first end-of-text token, then the part that
  # follows the '<nameinfo>' separator (i.e. the generated statement).
  sarc = sarc.split('<|endoftext|>')[0].split('<headline>')[1]
  print(sarc.split('<nameinfo>')[1])


100%|██████████| 100/100 [00:03<00:00, 27.14it/s]


Given the context: Many people would do just fine with a $200 chromebook

--- Generated Sarcastic statements ---

So, do you think the person who came up with that request was secretly a Google search engine, or just really wanted to be a Chromecast
So, what's the over/under on how long before someone starts accusing the Chromebooks of stealing their ideas?
What are some alternative ways to deal with the frustration of finding a $200 Chromebook that doesn't exist?
So, do you think it's fair to assume that these 'DVD' is the real MVP (Most Valuable Pork) when it comes to donating money
What is the maximum amount of effort put into aebook by someone who only speaks sarcasm as a second language?





In [None]:
import torch.nn.functional as F
from tqdm import trange
import torch
from transformers import GPT2TokenizerFast

import torch.nn.functional as F
from tqdm import trange,tqdm

def truncate_text(text, max_length=150):
    """Return ``text`` limited to its first ``max_length`` characters."""
    # Inputs that already fit are handed back untouched.
    if len(text) <= max_length:
        return text
    return text[:max_length]


# Re-create the tokenizer for inference. 'gpt2' is the Hub id of the smallest
# GPT-2 ('gpt2-small' is not a Hub id, so from_pretrained cannot resolve it).
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2',truncation=True,padding=True)
# model=torch.load('gpt2_finetunes_7_epochs_data_augmented.pth')

# Declare the special tokens: a padding token plus two markers.
# get_model_response takes additional_special_tokens_ids[0] as the marker that
# opens the context and [1] as the separator that opens the generated text, and
# it parses the decoded text by splitting on '<headline>' first and
# '<nameinfo>' second. '<headline>' therefore has to be listed first.
additional_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': ['<headline>', '<nameinfo>'],
}

# Add these special tokens to the vocabulary and resize the model's embeddings to match:
tokenizer.add_special_tokens(additional_tokens)

model.resize_token_embeddings(len(tokenizer))


def top_k_top_p_ordering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        The filtering runs along the last dimension, so both a single
        distribution (vocabulary size) and a batch (batch size x vocabulary
        size) are accepted. ``logits`` is modified in place and also returned.

        Args:
            logits: logits whose last dimension is the vocabulary
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            filter_value: value written over every filtered-out logit.
        Returns:
            The same ``logits`` tensor with the filtered entries set to ``filter_value``.
        Snippet adapted from: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check

    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability greater than the top_p threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Scatter the mask back to the original token order. dim=-1 (rather
        # than dim=1) keeps this valid for 1-D logits as well as batches.
        indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


# snippet from HuggingFace, adapted to work for contextual separation:
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,):
    """Sample ``length`` new tokens after ``context``, one token per step.

    Args:
        model: language model called as ``model(input_ids=..., token_type_ids=...)``
            whose first output holds the logits for every position.
        length: number of tokens appended to the prompt.
        context: list of prompt token ids.
        segments_tokens: optional list with one segment id per position; its
            prefix of the current sequence length is passed as
            ``token_type_ids``. If it is shorter than the running sequence,
            its last id is repeated.
        num_samples: number of sequences sampled in parallel from the same prompt.
        temperature: logits are divided by it; 0 selects greedy decoding.
        top_k, top_p: forwarded to ``top_k_top_p_ordering``.
        repetition_penalty: values > 1 make tokens already present in a
            sequence less likely (CTRL, https://arxiv.org/abs/1909.05858).

    Returns:
        Tensor with ``num_samples`` rows holding the prompt followed by the
        ``length`` sampled token ids.
    """
    # Follow the model's device instead of assuming CUDA, so the same code
    # runs wherever the model currently lives.
    device = next(model.parameters()).device

    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    with torch.no_grad():
        for _ in range(length):
            cur_len = generated.shape[1]

            inputs = {'input_ids': generated}
            if segments_tokens is not None and len(segments_tokens) > 0:
                step_segments = list(segments_tokens[:cur_len])
                # A segment list shorter than the running sequence would give
                # token_type_ids a different length than input_ids.
                step_segments += [step_segments[-1]] * (cur_len - len(step_segments))
                inputs['token_type_ids'] = torch.tensor(step_segments, dtype=torch.long, device=device).unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # Repetition penalty: positive logits are divided and negative
            # logits multiplied, so the penalty always lowers the score
            # (dividing a negative logit would make the token more likely).
            if repetition_penalty != 1.0:
                seen_scores = torch.gather(next_token_logits, 1, generated)
                seen_scores = torch.where(seen_scores < 0, seen_scores * repetition_penalty, seen_scores / repetition_penalty)
                next_token_logits.scatter_(1, generated, seen_scores)

            filtered_logits = top_k_top_p_ordering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


import pandas as pd
from tqdm import tqdm

# Generate one sarcastic response for a single context string.
def get_model_response(input_value):
    """Return the text the model generates for ``input_value``.

    The context is truncated with ``truncate_text``, wrapped as
    ``[context marker] + context ids + [separator]`` and continued with
    ``sample_sequence``. The decoded text is cut at the first end-of-text token
    and only the part after the '<nameinfo>' separator is returned.

    Args:
        input_value: context string to respond to.

    Returns:
        The generated text as a string.

    Raises:
        IndexError: if the decoded text does not contain the expected
            '<headline>' / '<nameinfo>' markers.
    """
    max_new_tokens = 90

    input_value = truncate_text(input_value)
    context_tkn = tokenizer.additional_special_tokens_ids[0]
    sarc_tkn = tokenizer.additional_special_tokens_ids[1]

    input_ids = [context_tkn] + tokenizer.encode(input_value)

    # One segment id per position of the longest sequence fed to the model
    # (prompt + separator + generated tokens). A fixed-size buffer would be too
    # short for long prompts and make every generation step after its end fail.
    segments = [sarc_tkn] * (len(input_ids) + 1 + max_new_tokens)
    segments[:len(input_ids)] = [context_tkn] * len(input_ids)

    input_ids += [sarc_tkn]

    # Draw a single sample of `max_new_tokens` tokens:
    generated = sample_sequence(model, length=max_new_tokens, context=input_ids, segments_tokens=segments, num_samples=1)

    sarc = tokenizer.decode(generated[0].squeeze().tolist())
    sarc = sarc.split('<|endoftext|>')[0].split('<headline>')[1]
    return sarc.split('<nameinfo>')[1]

# Make sure the model is on the GPU before generating.
model.to(torch.device('cuda'))
# Load CSV file into a pandas DataFrame
csv_file_path = 'sarcasm_updated_annotations_chunk_0.csv'
df = pd.read_csv(csv_file_path)

# Column holding the contexts to respond to, and the new column that receives the generations
input_column_name = 'Context'
output_column_name = 'gpt2_generated_text'

# Start with an empty string in every row of the output column.
df[output_column_name] = ""

output_txt_file_path = 'generated_sarcasm_gpt-2.txt'
with open(output_txt_file_path, 'w') as txt_file:
  # Generate one response per row; each response goes both into the DataFrame
  # and, as one line, into the text file.
  for index, row in tqdm(df.iterrows()):
      input_value = row[input_column_name]
      try:
          model_response = get_model_response(input_value)

      except Exception as e:
          # Best effort: print the error and store a fixed placeholder for this
          # row instead of aborting the whole run.
          print(e)
          model_response = "that's so weird"

      df.at[index, output_column_name] = model_response
      txt_file.write(model_response + '\n')
      # Flush after every row so the text file is up to date even if the loop is interrupted.
      txt_file.flush()

# Save the updated DataFrame to a new CSV file
output_csv_file_path = 'gpt-2_chunk_0_generations.csv'
df.to_csv(output_csv_file_path, index=False)
