In [None]:
!pip install pandas
!pip install numpy
!pip install tensorflow==2.11.0
!pip install transformers==4.29.2
!pip install scikit-learn
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

In [None]:
# Load the reference dataset; rows that pandas cannot parse are dropped
# (on_bad_lines='skip') instead of aborting the read.
DATASET_PATH = 'daigt_full_dataset.csv'
data = pd.read_csv(DATASET_PATH, on_bad_lines='skip')

# Values of the 'text' column as a plain Python list.
examples = data['text'].to_list()

In [None]:
# The tokenizer and the language-model weights are loaded from the same checkpoint
# so that token ids agree between the two.
MODEL_NAME = 'gpt2-medium'
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(MODEL_NAME),
    GPT2LMHeadModel.from_pretrained(MODEL_NAME),
)

In [None]:
# The tokenizer may not define a padding token; fall back to the end-of-sequence
# token so that `tokenizer.pad_token_id` (passed to `model.generate` in
# `generate_essay`) is not None.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_essay(seed_text, min_length=200, max_length=400, temperature=0.7, top_p=0.9, top_k=50, do_sample=True):
    """Generate a continuation of ``seed_text`` with the notebook-level GPT-2 model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        seed_text: Prompt the model continues from.
        min_length: Minimum number of tokens to generate beyond the prompt.
        max_length: Maximum number of tokens to generate beyond the prompt.
        temperature: Sampling temperature (only used when ``do_sample`` is true).
        top_p: Nucleus-sampling probability mass (only used when ``do_sample`` is true).
        top_k: Number of highest-probability tokens kept for sampling
            (only used when ``do_sample`` is true).
        do_sample: Sample from the distribution instead of decoding greedily.
            Without sampling, ``generate`` is deterministic and returns the same
            text for the same prompt, and the sampling parameters have no effect.

    Returns:
        The generated continuation only (prompt removed), with surrounding
        whitespace stripped.
    """
    # Calling the tokenizer (instead of `.encode`) also yields the attention mask,
    # which `generate` needs because the pad token is the same as the EOS token.
    encoded = tokenizer(seed_text, return_tensors='pt').to(model.device)
    input_ids = encoded['input_ids']
    prompt_len = input_ids.shape[1]

    # Only forward the sampling knobs when sampling is actually enabled.
    sampling_kwargs = {}
    if do_sample:
        sampling_kwargs = {'temperature': temperature, 'top_p': top_p, 'top_k': top_k}

    output = model.generate(
        input_ids=input_ids,
        attention_mask=encoded['attention_mask'],
        # `min_length` / `max_length` count the prompt tokens too, hence the offset.
        min_length=min_length + prompt_len,
        max_length=max_length + prompt_len,
        num_return_sequences=1,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        no_repeat_ngram_size=2,
        **sampling_kwargs
    )

    # Drop the prompt by token position rather than by character count: decoding is
    # not guaranteed to reproduce the prompt text character-for-character.
    new_tokens = output[0][prompt_len:]
    essay_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return essay_text

In [None]:
# Empty frame with the output schema: one essay per row plus its label.
ESSAY_COLUMNS = ['text', 'label']
df = pd.DataFrame(columns=ESSAY_COLUMNS)


In [None]:
# Essay topics. Each key is a short topic title; each value is the instruction text
# that is passed to `generate_essay` as the seed for that topic.
prompts = {
        'Phones and driving': 'Discuss the impact of mobile phones on driving safety, include evidence and recent studies on driver distractions.',
        'Car-free cities': 'Explore the benefits and challenges of car-free cities, citing examples from around the world where urban areas have successfully implemented this policy.',
        'Summer projects': 'Describe ideal summer projects for college students that combine both learning and practical experience. Provide examples across various fields of study.',
        '"A Cowboy Who Rode the Waves"': 'Write a book report discussing the book "A Cowboy Who Rode the Waves", talk about the story and its impact.',
        'Mandatory extracurricular activities': 'Argue why or why not mandatory extracurricular activities should be a thing, include the pros and cons to this.',
        'Exploring Venus': 'Talk all things space! Explore the facts about Venus and why it\'s so cool or if there are any problems about it, this should include facts about the planet.',
        'Facial action coding system': 'Discuss the facial action coding system, what is it? what does this teach us about people? is it important? add some facts about this as well.',
        'The Face on Mars': 'Discuss the Face on Mars, was it real? is it an optical illusion? pretend you are a high school student talking about if the Face on Mars was real or an optical illusion and back up those claims with facts.',
        'Community service': 'Talk about the benefits of community service, who should do community service? should it be required by everyone? discuss the importance of community service as well and provide examples of how community service is a good or bad thing.',
        'Grades for extracurricular activities': 'Argue whether or not students should receive grades for extracurricular activities. Discuss if they should be given as bonus grades or required, provide facts about the pros and cons about this topic.',
        'Driverless cars': 'Talk about the future! Is Driverless cars the next big thing? provide facts about the pros and cons of driverless cars.',
        'Does the electoral college work?': 'Discuss the electoral college and why or why doesn\'t it work. provide examples on how the electoral college has done.',
        'Cell phones at school': 'Discuss if cell phones should be allowed at school. include examples of why a cell phone could be beneficial or harmful to students to use in school.',
        'Distance learning': 'Talk about the impact of distance learning due to COVID, provide examples of how students are doing across the world and how distance learning has impacted the rest of the world.',
        'Seeking multiple opinions': 'Discuss why we should or should not get the opinion of multiple people. Are there any cases where we should get multiple opinions on a topic? provide examples of when we should or should not seek multiple opinions.'
    }

In [None]:
# Number of essays to generate in this run.
num_essays = 1000

# The set of topics does not change while generating, so build the list once.
prompt_titles = list(prompts.keys())

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# Growing a DataFrame with `pd.concat` inside the loop copies every existing row on
# each iteration, and re-running the cell would keep appending to the old frame.
essay_rows = []

with tqdm(total=num_essays, unit='essay', ncols=100) as pbar:
    for i in range(num_essays):
        # Describe the essay currently in progress before the (slow) generation call.
        pbar.set_description(f"Generating essay {i+1}/{num_essays}")
        prompt = np.random.choice(prompt_titles)
        seed_text = prompts[prompt]
        generated_essay = generate_essay(seed_text)
        # Every generated essay gets label 1 (presumably "machine-generated" -- confirm
        # against the labelling convention of the dataset these rows are merged with).
        essay_rows.append({'text': generated_essay, 'label': 1})
        pbar.update(1)

# Passing `columns` fixes the schema even when no rows were generated.
df = pd.DataFrame(essay_rows, columns=['text', 'label'])
df.to_csv('generated_essays.csv', index=False)