In [None]:
pip install transformers pandas openpyxl



In [None]:
from google.colab import drive
import pandas as pd
from transformers import pipeline

# Mount Google Drive at /content/drive; the input/output file paths used
# later in this notebook live under that mount point.
drive.mount('/content/drive')

# Build the text-generation pipeline for a given model name
def setup_generator(model_name="gpt2"):
    """Create a text-generation pipeline.

    Args:
        model_name: Model identifier passed to ``pipeline`` as ``model``.
            Defaults to "gpt2" (used here for demonstration).

    Returns:
        The object returned by ``pipeline('text-generation', model=model_name)``.
    """
    return pipeline('text-generation', model=model_name)

# Set up the text generation model using the default model name ("gpt2").
# NOTE: main() reads this module-level `generator` directly, so this cell
# must be run before main() is called.
generator = setup_generator()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Final code

# Function to generate text for a list of prompts, processed in fixed-size batches
def generate_articles(generator, prompts, max_length=1000, batch_size=20):
    """Generate one article per prompt and pair it with a short prompt label.

    Args:
        generator: Callable invoked as
            ``generator(batch_prompts, max_length=..., num_return_sequences=1)``.
            Each element of its result must be either a dict with a
            'generated_text' key or a list of such dicts.
        prompts: List of prompt strings.
        max_length: Forwarded unchanged to ``generator``.
        batch_size: Number of prompts handed to ``generator`` per call
            (adjust based on your testing and system capabilities).

    Returns:
        A list of ``(prompt_prefix, generated_text)`` tuples, where
        ``prompt_prefix`` is the first 50 characters of the prompt.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    articles = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start:start + batch_size]
        print(f"Generating articles for batch starting with prompt: {batch_prompts[0][:50]}...")
        print(f"Total prompts in this batch: {len(batch_prompts)}")
        generated_texts = generator(batch_prompts, max_length=max_length, num_return_sequences=1)

        for prompt, generated in zip(batch_prompts, generated_texts):
            prompt_text = prompt[:50]  # Only the first 50 characters of the prompt are stored
            # Normalise both output shapes (a single dict, or a list of dicts)
            # to a list so one code path handles them.
            items = [generated] if isinstance(generated, dict) else generated
            for item in items:
                articles.append((prompt_text, item['generated_text']))
                print(f"Article generated from prompt starting: {item['generated_text'][:50]}")
    return articles

# Build prompts by truncating every text to its first 100 tokens
def get_first_100_tokens(texts, generator):
    """Return, for each text, the string rebuilt from its first 100 tokens.

    Args:
        texts: Iterable of input texts.
        generator: Object exposing a ``tokenizer`` attribute; its
            ``tokenize`` and ``convert_tokens_to_string`` methods are used
            to split each text and to join the kept tokens back together.

    Returns:
        A list of prompt strings, one per input text, in the same order.
    """
    tok = generator.tokenizer
    return [
        tok.convert_tokens_to_string(tok.tokenize(text)[:100])
        for text in texts
    ]

# Main function to load data, generate articles, and save them
def main(input_file, output_file, sample_size=600, random_state=42):
    """Load source texts, generate an article for each, and save them to Excel.

    Reads ``input_file`` with ``pd.read_excel``, takes its 'text' column,
    truncates each text to a 100-token prompt, generates articles with the
    module-level ``generator``, and writes a sheet with 'Prompt' and
    'Generated Article' columns to ``output_file``.

    Args:
        input_file: Path of the Excel workbook to read; must have a 'text' column.
        output_file: Path of the Excel workbook to write.
        sample_size: Maximum number of texts to process. When more usable
            texts are available, a random sample of this size is taken.
            ``None`` disables sampling.
        random_state: Seed for the row sampling so that re-runs pick the same
            rows. This function does not seed the text generation itself.

    Raises:
        KeyError: If the workbook has no 'text' column.
    """
    # Load data
    df = pd.read_excel(input_file)
    if 'text' not in df.columns:
        raise KeyError(f"'text' column not found in {input_file}")

    # Blank Excel cells are loaded as NaN, which the tokenizer cannot handle;
    # drop them (and whitespace-only cells) before sampling so the sample is
    # drawn from usable rows only.
    text_column = df['text'].dropna().astype(str)
    text_column = text_column[text_column.str.strip() != '']

    if sample_size is not None and len(text_column) > sample_size:
        # Seeded so the same subset is selected on every run
        text_column = text_column.sample(n=sample_size, random_state=random_state)
    texts = text_column.tolist()

    # Extract the first 100 tokens as prompts
    prompts = get_first_100_tokens(texts, generator)

    # Generate articles
    articles = generate_articles(generator, prompts)

    # Save generated articles to a new DataFrame and then to Excel
    articles_df = pd.DataFrame(articles, columns=['Prompt', 'Generated Article'])
    articles_df.to_excel(output_file, index=False)
    print(f"Generated articles saved to {output_file}")

# File paths under the mounted Google Drive (adjust as necessary).
# input_file is the Excel workbook main() reads (its 'text' column is used);
# output_file is the Excel workbook main() writes the generated articles to.
input_file = '/content/drive/My Drive/ML Project data/test_CNN_Article - Cleaned - Copy2.xlsx'
output_file = '/content/drive/My Drive/ML Project data/output_file_GPT-2(2).xlsx'

# Run the full load -> generate -> save pipeline when executed as the main module.
if __name__ == "__main__":
    main(input_file, output_file)
