In [None]:
pip install transformers pandas openpyxl



In [None]:
from google.colab import drive
import pandas as pd
from transformers import pipeline
import numpy as np

# Mount Google Drive at /content/drive; the Excel input/output paths used
# later in this notebook are located under that mount point.
drive.mount('/content/drive')

# Function to set up the generator pipeline
def setup_generator(model_name="EleutherAI/gpt-j-6B"):
    """Build a text-generation pipeline for the given model.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as its
            ``model`` keyword argument.

    Returns:
        Whatever ``pipeline('text-generation', model=model_name)`` returns.
    """
    return pipeline('text-generation', model=model_name)

# Set up the text generation model once, using setup_generator's default
# model name. The resulting module-level `generator` is the object that
# main() reads further down in this notebook.
generator = setup_generator()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Final code being used

# Function to generate text based on prompts
def generate_articles(generator, prompts, max_length=1200):
    """Run the generator once per prompt and collect the outputs.

    Args:
        generator: Callable invoked as
            ``generator(prompt, max_length=..., num_return_sequences=1)``.
            Element 0 of its result must be indexable by ``'generated_text'``.
        prompts: Iterable of prompt strings.
        max_length: Forwarded unchanged to ``generator`` on every call.

    Returns:
        A list of ``(prompt, generated_text)`` tuples in input order.
    """
    results = []
    for position, prompt in enumerate(prompts, start=1):
        # Progress line: 1-based counter plus the first 50 characters of the prompt.
        print(position, f"Generating article for prompt: {prompt[:50]}...")
        outputs = generator(prompt, max_length=max_length, num_return_sequences=1)
        first_output = outputs[0]
        # Keep the prompt next to the text generated from it.
        results.append((prompt, first_output['generated_text']))
    return results

# Function to extract the first 30 tokens
def get_first_30_tokens(texts, generator):
    """Truncate each text to its leading 30 tokenizer tokens.

    Args:
        texts: Iterable of strings to shorten.
        generator: Object exposing a ``tokenizer`` attribute that provides
            ``tokenize`` and ``convert_tokens_to_string``.

    Returns:
        A list with one string per input text, built by converting the first
        30 tokens of that text back to a string. Texts with fewer than 30
        tokens are converted in full.
    """
    tok = generator.tokenizer

    def _head(text):
        # Tokenize, keep at most 30 tokens, and turn them back into text.
        return tok.convert_tokens_to_string(tok.tokenize(text)[:30])

    return [_head(text) for text in texts]

# Main function to load data, generate articles, and save them
def main(input_file, output_file, sample_size=130, random_state=None):
    """Load source articles, generate one article per prompt, and save to Excel.

    Reads ``input_file`` with ``pd.read_excel``, takes its ``'text'`` column,
    optionally down-samples the rows, turns each text into a 30-token prompt
    via ``get_first_30_tokens``, runs ``generate_articles`` over the prompts
    and writes the ``(Prompt, Generated Article)`` pairs to ``output_file``.

    Note: this function uses the module-level ``generator`` object, so the
    cell that creates it must have been run first.

    Args:
        input_file: Path of the Excel workbook to read; must contain a
            ``'text'`` column.
        output_file: Path of the Excel workbook to write.
        sample_size: Maximum number of articles to process. When more usable
            rows are available, this many are drawn at random.
        random_state: Seed forwarded to ``DataFrame.sample``. The default
            ``None`` draws a different sample on each run; pass an integer
            for a reproducible sample.

    Raises:
        KeyError: If the workbook has no ``'text'`` column.
    """
    # Load data
    source_df = pd.read_excel(input_file)

    # Blank cells are loaded as missing values (NaN), which are not strings
    # and cannot serve as prompts. Drop them before sampling so that the
    # sample consists of usable articles only.
    source_df = source_df.dropna(subset=['text'])

    if len(source_df) > sample_size:
        # Randomly sample `sample_size` articles
        source_df = source_df.sample(n=sample_size, random_state=random_state)
    texts = source_df['text'].astype(str).tolist()

    # Extract the first 30 tokens as prompts
    prompts = get_first_30_tokens(texts, generator)

    # Generate articles
    articles = generate_articles(generator, prompts)

    # Save generated articles to a new DataFrame and then to Excel
    articles_df = pd.DataFrame(articles, columns=['Prompt', 'Generated Article'])
    articles_df.to_excel(output_file, index=False)
    print(f"Generated articles saved to {output_file}")

# Source workbook (main reads its 'text' column) and destination workbook for
# the generated articles; both paths are under the /content/drive mount.
input_file = '/content/drive/My Drive/ML Project data/test_CNN_Article - Cleaned - Copy2.xlsx'
output_file = '/content/drive/My Drive/ML Project data/output_file_GPT-J.xlsx'

# Run the full load -> generate -> save pipeline when this code is executed
# as the main module.
if __name__ == "__main__":
    main(input_file, output_file)
