In [1]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm
import os
import time

In [2]:
# Read the training essays (id, prompt_id, text, generated) into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data/train_essays/train_essays.csv')

# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [3]:
# Read the prompt table (prompt_id, prompt_name, instructions, source_text)
df_prompts = pd.read_csv(filepath_or_buffer='Data/train_prompts.csv')

# Leave the frame as the last expression so the notebook renders it
df_prompts

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [4]:
# Report which CUDA device PyTorch uses by default, or say that none is usable
if not torch.cuda.is_available():
    print("CUDA is not available. Please check your GPU and CUDA installation.")
else:
    # Index of the currently selected CUDA device, then its human-readable name
    cuda_device_id = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(cuda_device_id)

    print(f"The default CUDA device ID is: {cuda_device_id}")
    print(f"The name of the default CUDA device is: {gpu_name}")

The default CUDA device ID is: 0
The name of the default CUDA device is: NVIDIA GeForce MX450


In [5]:
# Load the pretrained "gpt2-medium" tokenizer and language-model weights
print("Initializing GPT-2 model and tokenizer...")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Move the model onto the selected device
model = model.to(device)

def generate_text(prompt, max_length=1000, temperature=0.7):
    """Sample one continuation of ``prompt`` from the model and return it as text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text the model is conditioned on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (the bound covers the prompt tokens plus the sampled tokens).
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded continuation only. ``model.generate`` returns the prompt
        tokens followed by the newly sampled tokens; the prompt part is
        dropped so the instructions are not copied verbatim into every
        synthetic essay.
    """
    print(f"Generating text for prompt: {prompt}")
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    # Single un-padded sequence: every position is a real token.
    attention_mask = torch.ones_like(inputs)

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the echoed prompt tokens and decode just what the model produced.
    prompt_length = inputs.shape[-1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"Generated text: {generated_text}")
    return generated_text

# Number of synthetic responses sampled for EACH prompt. With the two prompts
# in df_prompts this yields 2 x 689 = 1378 responses in total.
SAMPLES_PER_PROMPT = 689
SYNTHETIC_CSV_PATH = 'Data/synthetic_data.csv'

# Create a list to store the synthetic data
print("Creating list to store synthetic data...")
synthetic_data = []

# Loop over the unique prompts
print("Starting to generate synthetic responses...")
generation_completed = False
try:
    for prompt_id, prompt in tqdm(df_prompts[['prompt_id', 'instructions']].values, desc="Generating synthetic responses"):
        for i in tqdm(range(SAMPLES_PER_PROMPT), desc=f"Generating for prompt {prompt_id}"):
            # Generate a synthetic response
            text = generate_text(prompt)

            # Append the generated response to the list
            synthetic_data.append({
                'id': f'synthetic{prompt_id}_{i}',
                'prompt_id': prompt_id,
                'text': text,
                'generated': 1,
            })
    generation_completed = True
    print("Finished generating synthetic responses.")
finally:
    # Generation is slow, and an interrupted run (e.g. KeyboardInterrupt) would
    # otherwise throw away every response held in memory. Persist whatever has
    # been produced; any exception still propagates after the save.
    if generation_completed or synthetic_data:
        # Convert the list to a DataFrame
        print("Converting list to DataFrame...")
        df_synthetic = pd.DataFrame(synthetic_data)

        # Save the synthetic data to a CSV file
        print("Saving synthetic data to CSV file...")
        df_synthetic.to_csv(SYNTHETIC_CSV_PATH, index=False)
        if not generation_completed:
            print(f"Run did not finish; saved {len(synthetic_data)} partial responses.")
print("Done.")

Initializing GPT-2 model and tokenizer...


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda
Creating list to store synthetic data...
Starting to generate synthetic responses...


Generating synthetic responses:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for prompt: Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.


Generating for prompt 0:   0%|          | 0/689 [06:57<?, ?it/s]
Generating synthetic responses:   0%|          | 0/2 [06:57<?, ?it/s]


KeyboardInterrupt: 