In [4]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# from datasets import load_dataset, load_from_disk
# from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import os

## Read data 

### Load XSUM data

In [3]:
xsum_dirpath = "data/xsum"
if os.path.exists(xsum_dirpath):
    print("XSUM Dataset found! Loading from path...")
    dataset = load_from_disk(xsum_dirpath)
else:
    print(f"XSUM Dataset not found! Downloading and saving to {xsum_dirpath}")
    dataset = load_dataset("EdinburghNLP/xsum")
    dataset.save_to_disk(xsum_dirpath)

XSUM Dataset found! Loading from path...


In [4]:
# Shuffle data
seed = 42
dataset = dataset.shuffle(seed=seed)
len(dataset["train"])

In [139]:
# Select a sample of 5000 articles for now
n_samples = 5000
samples = dataset["train"].select(range(n_samples))

### Load hewlett data

In [18]:
filepath = "/home/dafirebanks/projects/CS567_ML_Project/data/hewlett-foundation-data/training_set_rel3.xlsx"
df = pd.read_excel(filepath)
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,,8.0,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,,9.0,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,,7.0,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,,10.0,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,,8.0,,,,...,,,,,,,,,,


In [19]:
len(df)

12978

In [21]:
df.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6'],
      dtype='object')

In [22]:
# Selecting only the columns that are needed: essay_set, essay_id and essay
df = df[["essay_set", "essay_id", "essay"]]
df.head()

Unnamed: 0,essay_set,essay_id,essay
0,1,1,"Dear local newspaper, I think effects computer..."
1,1,2,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,1,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,1,4,"Dear Local Newspaper, @CAPS1 I have found that..."
4,1,5,"Dear @LOCATION1, I know having computers has a..."


### Filter out essays with illegible words


In [23]:
from collections import defaultdict

defective2idx = defaultdict(list)

for i, essay in enumerate(df["essay"].values):
    lower = essay.lower()
    if "illegible" in lower:
        defective2idx["illegible"].append(i)
    elif "not legible" in lower:
        defective2idx["not legible"].append(i)
    elif "???" in lower:
        defective2idx["???"].append(i)

In [24]:
# How many essays are defective?
{key: len(value) for key, value in defective2idx.items()}

{'not legible': 7, '???': 264, 'illegible': 4}

In [25]:
# Filter out the essays that are defective
defective_idxs = defective2idx["illegible"] + defective2idx["not legible"] + defective2idx["???"]

df = df.drop(defective_idxs)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,essay_set,essay_id,essay
0,1,1,"Dear local newspaper, I think effects computer..."
1,1,2,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,1,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,1,4,"Dear Local Newspaper, @CAPS1 I have found that..."
4,1,5,"Dear @LOCATION1, I know having computers has a..."


In [26]:
len(df)

12703

In [5]:
clean_data_path = "data/hewlett-foundation-data/training_set_rel3_cleaned.csv"
if not os.path.exists(clean_data_path):
    print(f"Saving cleaned data to {clean_data_path}")
    df.to_csv(clean_data_path, index=False)
else:
    print(f"Cleaned data already exists at {clean_data_path}. Loading...")
    df = pd.read_csv(clean_data_path)

Cleaned data already exists at data/hewlett-foundation-data/training_set_rel3_cleaned.csv. Loading...


In [3]:
# Print the distribution of essays across the different essay sets
df["essay_set"].value_counts()

5    1805
6    1800
2    1799
1    1782
4    1763
3    1699
7    1333
8     722
Name: essay_set, dtype: int64

## Setup prompts

### Hewlett

In [4]:
# Map essay sets to the actual prompts:
set2prompt = {
    1: """More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends. Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.""",
    2: """Censorship in the Libraries
"All of us can think of a book that we hope none of our children or any other children have taken off the shelf. But if I have the right to remove that book from the shelf -- that work I abhor -- then you also have exactly the same right and so does everyone else. And then we have no books left on the shelf for any of us." --Katherine Paterson, Author
Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive? Support your position with convincing arguments from your own experience, observations, and/or reading.""",
    7: """Write about patience. Being patient means that you are understanding and tolerant. A patient person experience difficulties without complaining.
Do only one of the following: write a story about a time when you were patient OR write a story about a time when someone you know was patient OR write a story in your own way about patience.""",
    8: """We all understand the benefits of laughter. For example, someone once said, “Laughter is the shortest distance between two people.” Many other people believe that laughter is an important part of any relationship. Tell a true story in which laughter was one element or part.""",
}

In [5]:
# 2 prompting methods: Either use the first set of tokens from the human essays and generate the rest of the essay, or use the prompts for the LLM to generate them from scratch

# For now, we'll only use the essays from essay_set 1, 2, 3, 4
df = df[df["essay_set"].isin(set2prompt.keys())]
df = df.reset_index(drop=True)
df["prompt"] = df["essay_set"].map(set2prompt)
df.head()

Unnamed: 0,essay_set,essay_id,essay,prompt
0,1,1,"Dear local newspaper, I think effects computer...","More and more people use computers, but not ev..."
1,1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...","More and more people use computers, but not ev..."
2,1,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...","More and more people use computers, but not ev..."
3,1,4,"Dear Local Newspaper, @CAPS1 I have found that...","More and more people use computers, but not ev..."
4,1,5,"Dear @LOCATION1, I know having computers has a...","More and more people use computers, but not ev..."


In [6]:
# Length distribution of essays in the dataset
df["essay"].apply(lambda x: len(x.split())).describe()

count    5636.000000
mean      354.253016
std       189.268178
min         4.000000
25%       211.000000
50%       336.500000
75%       459.000000
max      1064.000000
Name: essay, dtype: float64

In [7]:
h_ptemplate = """You will be provided with a prompt for an essay that needs to be written at the level of a student in 7-10th grade. You are an expert writer that knows how to write in different styles convincingly. You will read the prompt, and write an essay that is around 350 words.
Essay prompt: <ESSAY_PROMPT>
Essay:"""


set2fullprompt = {
    essay_set: h_ptemplate.replace("<ESSAY_PROMPT>", instruction)
    for essay_set, instruction in set2prompt.items()
}
set2fullprompt

{1: 'You will be provided with a prompt for an essay that needs to be written at the level of a student in 7-10th grade. You are an expert writer that knows how to write in different styles convincingly. You will read the prompt, and write an essay that is around 350 words.\nEssay prompt: More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends. Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.\nEssay:',
 2: 'You

### XSUM

In [11]:
xsum_prompts = [sample["document"] for sample in samples]
xsum_prompts[0]

"In Wales, councils are responsible for funding and overseeing schools. But in England, Mr Osborne's plan will mean local authorities will cease to have a role in providing education. Academies are directly funded by central government and head teachers have more freedom over admissions and to change the way the"

In [140]:
xsum_prompts[0], len(xsum_prompts)

('In Wales, councils are responsible for funding and overseeing schools.\nBut in England, Mr Osborne\'s plan will mean local authorities will cease to have a role in providing education.\nAcademies are directly funded by central government and head teachers have more freedom over admissions and to change the way the school works.\nIt is a significant development in the continued divergence of schools systems on either side of Offa\'s Dyke.\nAnd although the Welsh Government will get extra cash to match the money for English schools to extend the school day, it can spend it on any devolved policy area.\nMinisters have no plans to follow suit.\nAt the moment, governing bodies are responsible for setting school hours and they need ministerial permission to make significant changes.\nThere are already more than 2,000 secondary academies in England and its extension to all state schools is unlikely to shake the Welsh Government\'s attachment to what they call a "community, comprehensive mod

## Setup model

In [8]:
# "mistralai/Mistral-7B-v0.1"  # "gpt2-medium"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
download_dir = "/home/dafirebanks/projects/dont-stop-prompting/models"
batch_size = 50
DEVICE = "cuda"

In [87]:
# del model
# del tokenizer

In [9]:
set_seed(42)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=download_dir).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=download_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Switch the padding side to the right - doesn't really matter because all texts should have at least 30 tokens, but this is to be consistent with the original code
if tokenizer.padding_side == "left":
    tokenizer.padding_side = "right"
tokenizer.padding_side

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

'right'

In [10]:
# NOTE: Taken from https://github.com/eric-mitchell/detect-gpt/blob/447d2ce8177004203f42d1da87d7f93a2e31ad52/run.py#L208 to ensure consistency


def encode_and_truncate(texts, n_prompt_tokens=None):
    """Encode each text as a list of token ids, and truncate to the first n_prompt_tokens tokens.
    Args:
        texts: list[str]
            List of texts to encode.
        n_prompt_tokens: int
            Number of tokens to keep for each text. If the text has fewer than this many tokens, it will be padded.
    Returns:
        dict[str, torch.Tensor]
            Dictionary of tensors, with keys "input_ids", "attention_mask", "token_type_ids".
    """
    all_encoded = tokenizer(texts, return_tensors="pt", padding=True).to(DEVICE)

    if n_prompt_tokens is None:
        return all_encoded
    return {key: value[:, :n_prompt_tokens] for key, value in all_encoded.items()}


def sample_from_model(
    texts,
    sampling_kwargs,
    min_words=55,
    n_prompt_tokens=None,
):
    """Sample from the model using the provided texts as prompts, until each sample has at least min_words words.
    Args:
        texts: list[str]
            List of texts to use as prompts.
        min_words: int
            Minimum number of words for each sample.
        n_prompt_tokens: int
            Number of tokens to use for each prompt.
    Returns:
        list[str]
            List of samples generated from the model.
    """
    all_encoded = encode_and_truncate(texts, n_prompt_tokens)
    decoded = ["" for _ in range(len(texts))]

    # sample from the model until we get a sample with at least min_words words for each example
    # this is an inefficient way to do this (since we regenerate for all inputs if just one is too short), but it works
    tries = 0
    while (m := min(len(x.split()) for x in decoded)) < min_words:
        if tries != 0:
            print()
            print(f"min words: {m}, needed {min_words}, regenerating (try {tries})")

        outputs = model.generate(**all_encoded, **sampling_kwargs)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        tries += 1

    return decoded


def trim_to_shorter_length(texta, textb):
    # truncate to shorter of o and s
    shorter_length = min(len(texta.split(" ")), len(textb.split(" ")))
    texta = " ".join(texta.split(" ")[:shorter_length])
    textb = " ".join(textb.split(" ")[:shorter_length])
    return texta, textb


def generate_samples(raw_data, batch_size, sampling_kwargs, min_words=55, n_prompt_tokens=None):
    """
    NOTE: Samples are truncated to the length of the shorter text. This is to ensure that the samples are comparable.
    """
    data = {
        "original": [],
        "sampled": [],
    }

    for batch in range(len(raw_data) // batch_size):
        print("Generating samples for batch", batch, "of", len(raw_data) // batch_size)
        original_text = raw_data[batch * batch_size : (batch + 1) * batch_size]
        sampled_text = sample_from_model(
            original_text, sampling_kwargs, min_words=min_words, n_prompt_tokens=n_prompt_tokens
        )

        for o, s in zip(original_text, sampled_text):
            o, s = trim_to_shorter_length(o, s)

            # add to the data
            data["original"].append(o)
            data["sampled"].append(s)

    return data

## Test generations

### Hewlett

In [11]:
sampling_kwargs = {
    "max_length": 650,
    "do_sample": True,
    "top_k": 40,
    "top_p": 0.96,
    "min_length": 350,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}

set2fullprompt.values()


n_essays_per_prompt = 1000
set2fullessays = {}

for set, prompt in set2fullprompt.items():
    print(f"Generating samples for essay set {set}")
    h_prompts = [prompt] * n_essays_per_prompt
    sampled_text = sample_from_model(h_prompts, sampling_kwargs, min_words=55)
    set2fullessays[set] = sampled_text

Generating samples for essay set 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.21 GiB. GPU 0 has a total capacty of 47.54 GiB of which 2.99 GiB is free. Including non-PyTorch memory, this process has 44.54 GiB memory in use. Of the allocated memory 44.22 GiB is allocated by PyTorch, and 12.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [17]:
set2fullessays[8][]

["You will be provided with a prompt for an essay that needs to be written at the level of a student in 7-10th grade. You are an expert writer that knows how to write in different styles convincingly. You will read the prompt, and write an essay that is around 350 words.\nEssay prompt: We all understand the benefits of laughter. For example, someone once said, “Laughter is the shortest distance between two people.” Many other people believe that laughter is an important part of any relationship. Tell a true story in which laughter was one element or part.\nEssay:\nTitle: Laughter as a Bridge: A True Story of Friendship\n\nLaughter is a powerful force. It is the shortest distance between two people, the glue that holds friendships together, and the remedy to even the toughest of situations. In my life, I've experienced the power of laughter in many ways, but none more vividly than in the story I'd like to share with you today.\n\nIt was my sophomore year of high school when I first met 

### XSUM

In [141]:
sampling_kwargs = {
    "max_length": 200,
    "do_sample": True,
    "top_k": 40,
    "top_p": 0.96,
    "min_length": 150,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
test = generate_samples(
    xsum_prompts,
    batch_size=batch_size,
    sampling_kwargs=sampling_kwargs,
    min_words=55,
    n_prompt_tokens=30,
)

Generating samples for batch 0 of 100


Generating samples for batch 1 of 100
Generating samples for batch 2 of 100
Generating samples for batch 3 of 100
Generating samples for batch 4 of 100
Generating samples for batch 5 of 100
Generating samples for batch 6 of 100
Generating samples for batch 7 of 100
Generating samples for batch 8 of 100
Generating samples for batch 9 of 100
Generating samples for batch 10 of 100
Generating samples for batch 11 of 100
Generating samples for batch 12 of 100
Generating samples for batch 13 of 100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating samples for batch 14 of 100
Generating samples for batch 15 of 100
Generating samples for batch 16 of 100
Generating samples for batch 17 of 100
Generating samples for batch 18 of 100
Generating samples for batch 19 of 100
Generating samples for batch 20 of 100
Generating samples for batch 21 of 100
Generating samples for batch 22 of 100
Generating samples for batch 23 of 100
Generating samples for batch 24 of 100
Generating samples for batch 25 of 100
Generating samples for batch 26 of 100
Generating samples for batch 27 of 100
Generating samples for batch 28 of 100
Generating samples for batch 29 of 100
Generating samples for batch 30 of 100
Generating samples for batch 31 of 100
Generating samples for batch 32 of 100
Generating samples for batch 33 of 100
Generating samples for batch 34 of 100
Generating samples for batch 35 of 100
Generating samples for batch 36 of 100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating samples for batch 37 of 100
Generating samples for batch 38 of 100
Generating samples for batch 39 of 100
Generating samples for batch 40 of 100
Generating samples for batch 41 of 100
Generating samples for batch 42 of 100
Generating samples for batch 43 of 100
Generating samples for batch 44 of 100
Generating samples for batch 45 of 100
Generating samples for batch 46 of 100
Generating samples for batch 47 of 100
Generating samples for batch 48 of 100
Generating samples for batch 49 of 100
Generating samples for batch 50 of 100
Generating samples for batch 51 of 100
Generating samples for batch 52 of 100
Generating samples for batch 53 of 100
Generating samples for batch 54 of 100
Generating samples for batch 55 of 100
Generating samples for batch 56 of 100
Generating samples for batch 57 of 100
Generating samples for batch 58 of 100
Generating samples for batch 59 of 100
Generating samples for batch 60 of 100
Generating samples for batch 61 of 100
Generating samples for ba

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating samples for batch 74 of 100
Generating samples for batch 75 of 100
Generating samples for batch 76 of 100
Generating samples for batch 77 of 100
Generating samples for batch 78 of 100
Generating samples for batch 79 of 100
Generating samples for batch 80 of 100
Generating samples for batch 81 of 100
Generating samples for batch 82 of 100
Generating samples for batch 83 of 100
Generating samples for batch 84 of 100
Generating samples for batch 85 of 100
Generating samples for batch 86 of 100
Generating samples for batch 87 of 100
Generating samples for batch 88 of 100
Generating samples for batch 89 of 100
Generating samples for batch 90 of 100
Generating samples for batch 91 of 100
Generating samples for batch 92 of 100
Generating samples for batch 93 of 100
Generating samples for batch 94 of 100
Generating samples for batch 95 of 100
Generating samples for batch 96 of 100
Generating samples for batch 97 of 100
Generating samples for batch 98 of 100
Generating samples for ba

In [None]:
# Batches to look into in case they may have weird padding: 13, 36, 73

In [144]:
len(test["sampled"])

5000

In [153]:
# Look at some generated samples
j = 20
i = 13 * batch_size + j
test["original"][i]

"Jeremy Corbyn unusually had the better of Theresa May in Prime Minister's Questions, brandishing leaked texts across the despatch box, claiming evidence that the Tories had given Surrey a special deal to avoid the chance of a damaging 15% council tax rise in a Conservative safe haven.\nThe council, and ministers, denied there had been any stitch-up.\nBut hours later, the government admitted they had agreed, in theory, that Surrey County Council could, like several others, try out keeping all of the business rates they raise from 2018, which could plug the gaps in funding in future.\nThat change is due to be in force across in England by 2020. Technically therefore, Surrey County Council has not been offered any additional funding. But the prospect of more flexibility over their own income in future could help fill the council's coffers, and seems to have eased some of their concerns.\nBut"

In [154]:
test["sampled"][i]

'Jeremy Corbyn unusually had the better of Theresa May in Prime Minister\'s Questions, brandishing leaked texts across the despatch box to taunt the Prime Minister about her own Brexit plan.\n\nThe PM was forced to defend plans to pay a £40 billion Brexit bill and was under pressure from Labour to reveal the Government\'s plan for future relations with the EU after Brexit.\n\nHowever, she was caught off guard by the release of documents that suggested that she believed a "soft" Brexit option was the only way to save the Union and "avert" a second Scottish independence referendum.\n\nThe documents were released by Labour as they questioned why the Government would choose an outcome which will "damage economic prospects, endanger our industrial base and cut living standards for the many to line the pockets of the few".\n\nOne document, a Downing Street briefing note for Mrs May, claimed a "soft"'

## Store it

In [155]:
# Turn into a jsonl format
all_data = []
for i in range(len(test["original"])):
    nongen = {"text": test["original"][i], "label": 0}
    gen = {"text": test["sampled"][i], "label": 1}

    all_data.append(nongen)
    all_data.append(gen)

len(all_data)

10000

In [161]:
# Visualize as a df
store_dirpath = "data"
df = pd.DataFrame.from_records(all_data)
df.head()

Unnamed: 0,text,label
0,"In Wales, councils are responsible for funding...",0
1,"In Wales, councils are responsible for funding...",1
2,"Up to 100,000 youngsters will be eligible for ...",0
3,"Up to 100,000 youngsters will be eligible for ...",1
4,Middlesbrough and Brighton face each other on ...,0


In [158]:
# Optionally split into train/test
train, test = train_test_split(df, test_size=0.2, random_state=42)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Store
train.to_json(f"{store_dirpath}/hewlett-gen-train.jsonl", orient="records", lines=True)
test.to_json(f"{store_dirpath}/hewlett-gen-test.jsonl", orient="records", lines=True)

In [None]:
# Store
train.to_json(f"{store_dirpath}/xsum-gen-train.jsonl", orient="records", lines=True)
test.to_json(f"{store_dirpath}/xsum-gen-test.jsonl", orient="records", lines=True)