In [13]:
import argparse
import pickle as pk

import torch
# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score
# Transformer library
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
import evaluate

# Folder that holds the pickled per-subreddit comment tables.
data_dir = "./data/"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"

In [6]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
model_name = 'distilgpt2'

# Load the tokenizer and the causal-LM weights for that checkpoint, then move
# the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device_name)

# load data

In [50]:
# Subreddit whose comment dump is loaded below.
subreddit = "Judaism"
# Calendar month (matched against the 'year-month' column) used for fine-tuning.
month = "2016-12"

In [15]:
from datasets import load_dataset, Dataset

In [39]:
# Columns kept from the raw comment table; everything else is dropped.
usecols = ['year-month', 'timestamp', 'text', 'speaker']

# NOTE(security): unpickling can execute arbitrary code, so only load comment
# dumps that come from a trusted source.
# The context manager guarantees the file handle is closed right after loading.
with open(data_dir + f"{subreddit}-comments.pk", "rb") as comments_file:
    comments_df = pk.load(comments_file)
comments_df = comments_df[usecols]

# Keep only the rows of the selected month and wrap them as a `datasets.Dataset`.
monthly_comments_df = comments_df[comments_df['year-month'] == month]
monthly_comments = Dataset.from_pandas(monthly_comments_df)

In [41]:
# Hold out 20% of the month's comments for evaluation. The fixed seed makes the
# split (and every metric computed on it) reproducible across kernel restarts.
monthly_comments = monthly_comments.train_test_split(test_size=0.2, seed=42)

In [42]:
# Show the split names, columns and row counts as a quick sanity check.
monthly_comments

DatasetDict({
    train: Dataset({
        features: ['year-month', 'timestamp', 'text', 'speaker', 'id'],
        num_rows: 7948
    })
    test: Dataset({
        features: ['year-month', 'timestamp', 'text', 'speaker', 'id'],
        num_rows: 1988
    })
})

# process data
Tokenization and block grouping below follow the Hugging Face causal language modeling guide: https://huggingface.co/docs/transformers/tasks/language_modeling

In [44]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The texts are passed to the notebook-level ``tokenizer`` with
    ``truncation=True`` and the tokenizer's output is returned as-is.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [45]:
# Tokenize every split in batches using 4 worker processes. All original
# columns are removed, so only the fields returned by `preprocess_function`
# remain in the result.
tokenized_monthly_comments = monthly_comments.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns = monthly_comments["train"].column_names,
)

#0:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s]
#1:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s][A

#2:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s][A[A


#0:  50%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 1/2 [00:01<00:01,  1.70s/ba][A[A[A
#1:  50%|█████████████████████████████████████████████████████████████████████████████████████▌                                        

In [46]:
# Number of tokens per training example after grouping.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example lists, as handed over by ``map(..., batched=True)``.

    Returns:
        A dict with the same keys, where every value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` entry holding a
        shallow copy of the ``"input_ids"`` chunks. The trailing remainder of
        the batch that does not fill a whole block is dropped.
    """
    from itertools import chain

    # Flatten each column in linear time. `sum(lists, [])` builds a new list on
    # every addition, which is quadratic in the number of tokens in the batch.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns are assumed to have the same flattened length, so the first
    # one is used to measure it.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks; the leftover tail is discarded.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels equal the inputs for causal language modeling; this code does not
    # shift them.
    result["labels"] = result["input_ids"].copy()
    return result

In [47]:
# Pack the tokenized comments of each split into fixed-length blocks (with
# labels) via `group_texts`, again batched over 4 worker processes.
lm_dataset = tokenized_monthly_comments.map(group_texts, batched=True, num_proc=4)

#0:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s]
#1:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s][A

#2:   0%|                                                                                                                                                                                   | 0/2 [00:00<?, ?ba/s][A[A


#0:  50%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 1/2 [00:01<00:01,  1.58s/ba][A[A[A
#1:  50%|█████████████████████████████████████████████████████████████████████████████████████▌                                        

In [48]:
# Inspect the resulting columns and the number of blocks in each split.
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4836
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1251
    })
})

In [49]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for plain (causal) language modeling, not masked-token modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Fine-tune

In [55]:
# Fine-tuning hyperparameters. Checkpoints go to a directory named after the
# model, subreddit and month; evaluation on the held-out split runs once per epoch.
# No save strategy is set here; the recorded training log shows a checkpoint
# being written every 500 steps.
training_args = TrainingArguments(
    output_dir=f"./models/{model_name}_{subreddit}_{month}",
    num_train_epochs=10,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)

# Trainer wiring: train on the "train" blocks, evaluate on the "test" blocks,
# and batch examples with the causal-LM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [56]:
# Run fine-tuning. The returned TrainOutput (displayed below) reports the final
# global step, the average training loss and runtime metrics.
trainer.train()

***** Running training *****
  Num examples = 4836
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6050
  Number of trainable parameters = 81912576


Epoch,Training Loss,Validation Loss
1,3.8048,3.821939
2,3.6915,3.809671
3,3.6908,3.79687
4,3.5245,3.792282
5,3.6011,3.78851
6,3.5989,3.786259
7,3.5789,3.786462
8,3.564,3.787623
9,3.4641,3.789806
10,3.503,3.790579


Saving model checkpoint to ./models/distilgpt2_Judaism_2016-12/checkpoint-500
Configuration saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-500/config.json
Model weights saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1251
  Batch size = 8
Saving model checkpoint to ./models/distilgpt2_Judaism_2016-12/checkpoint-1000
Configuration saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-1000/config.json
Model weights saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1251
  Batch size = 8
Saving model checkpoint to ./models/distilgpt2_Judaism_2016-12/checkpoint-1500
Configuration saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-1500/config.json
Model weights saved in ./models/distilgpt2_Judaism_2016-12/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1251
  Batch size = 8
Saving model che

TrainOutput(global_step=6050, training_loss=3.5959679154325124, metrics={'train_runtime': 583.9894, 'train_samples_per_second': 82.81, 'train_steps_per_second': 10.36, 'total_flos': 1579538857328640.0, 'train_loss': 3.5959679154325124, 'epoch': 10.0})

In [58]:
# NOTE(review): `eval_results` is first assigned by the `trainer.evaluate()`
# cell that appears AFTER this one in the notebook, so on a fresh top-to-bottom
# run this cell raises NameError. Run the evaluation cell first.
eval_results

{'eval_loss': 3.790578842163086,
 'eval_runtime': 3.491,
 'eval_samples_per_second': 358.35,
 'eval_steps_per_second': 44.973,
 'epoch': 10.0}

In [57]:
import math

# Evaluate on the held-out split and report perplexity as exp(eval_loss),
# formatted to two decimals.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1251
  Batch size = 8


Perplexity: 44.28


# Inference / Evaluation

In [79]:
from datasets import load_dataset

# Load the test split of the raw WikiText-2 configuration and tokenize it as a
# single long document, joining the individual rows with blank lines.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

Downloading builder script: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8.48k/8.48k [00:00<00:00, 5.32MB/s]
Downloading metadata: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.84k/6.84k [00:00<00:00, 3.93MB/s]
Downloading readme: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.25k/9.25k [00:00<00:00, 5.97MB/s]


Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /home/aww66/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.72M/4.72M [00:00<00:00, 37.9MB/s]
                                                                                                                                                                                                                  

Dataset wikitext downloaded and prepared to /home/aww66/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


In [81]:
# Peek at the tokenized document: a single row of input ids plus its attention mask.
encodings

{'input_ids': tensor([[ 628,  796, 5199,  ...,  220,  628,  198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [87]:
# https://huggingface.co/docs/transformers/perplexity
import torch
from tqdm import tqdm

def calculate_ppl(encodings, stride=16, max_length=None):
    """Compute sliding-window perplexity of the notebook-level ``model``.

    The token sequence is scored in overlapping windows of at most
    ``max_length`` tokens that advance by ``stride`` tokens. In every window
    only the tokens not already scored by a previous window contribute to the
    loss; the earlier tokens serve purely as context (label ``-100``).

    Args:
        encodings: Tokenizer output exposing ``input_ids`` as a 2-D tensor whose
            second dimension is the sequence length.
        stride: Number of tokens the window advances per step. Must be >= 1.
        max_length: Maximum window length. Defaults to
            ``model.config.n_positions``.

    Returns:
        A scalar tensor holding ``exp(mean negative log-likelihood)`` over all
        scored tokens.

    Raises:
        ValueError: If ``stride`` is smaller than 1, or if the sequence has
            fewer than two tokens (no next-token target exists, so perplexity
            is undefined).
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")
    if max_length is None:
        max_length = model.config.n_positions

    seq_len = encodings.input_ids.size(1)
    if seq_len < 2:
        raise ValueError(
            f"need at least 2 tokens to compute perplexity, got {seq_len}"
        )

    nll_sum = 0.0  # running sum of per-token negative log-likelihoods
    n_tokens = 0  # number of tokens that actually contributed to nll_sum
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device_name)
        target_ids = input_ids.clone()
        # Mask the context tokens so only the last `trg_len` positions are scored.
        target_ids[:, :-trg_len] = -100

        # The causal-LM loss shifts the labels by one position internally, so
        # the label at position 0 of a window is never used. Count exactly the
        # labels that take part in the loss instead of assuming `trg_len`.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum())

        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)

            # `outputs.loss` is the mean over the scored tokens; multiply by
            # their count to get this window's summed negative log-likelihood.
            nll_sum = nll_sum + outputs.loss * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_tokens == 0:
        raise ValueError("no tokens were scored; perplexity is undefined")

    # Average over the tokens that were really scored, then exponentiate.
    ppl = torch.exp(nll_sum / n_tokens)
    return ppl

In [88]:
# Month whose comments are scored with the fine-tuned model.
inference_month = "2017-01"
inference_texts = comments_df[comments_df['year-month'] == inference_month]['text'].tolist()

# Score the first ten comments of the inference month one at a time.
for text in inference_texts[:10]:
    print(text)
    if not text:
        # Nothing to tokenize, so there is nothing to score.
        print("skipped: empty comment")
        continue
    encodings = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    if encodings.input_ids.size(1) < 2:
        # A next-token loss needs at least one (context, target) pair.
        print("skipped: fewer than two tokens, perplexity is undefined")
        continue
    ppl = calculate_ppl(encodings)
    print(ppl)
    

The reason he specifies killed is because moshiach is a political and military leader, not a rabbi. 


  0%|                                                                                                                                                                                       | 0/2 [00:00<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasLtMatmul( ltHandle, computeDesc.descriptor(), &alpha_val, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), &beta_val, result_ptr, Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), &heuristicResult.algo, workspace.data_ptr(), workspaceSize, at::cuda::getCurrentCUDAStream())`

In [65]:

# ## DOES NOT WORK
# perplexity = evaluate.load("perplexity", module_type="metric")
# tokenizer.save_pretrained("./models/distilgpt2_Judaism_2016-12/checkpoint-6000/")
# input_texts = [t for t in 
#                comments_df[comments_df['year-month'] == '2017-01']['text']
#                if t != ""]
# results = perplexity.compute(model_id='./models/distilgpt2_Judaism_2016-12/checkpoint-6000',
#                              predictions=input_texts)