In [None]:
pip install transformers pandas openpyxl



In [None]:
from google.colab import drive
import pandas as pd
from transformers import pipeline
import numpy as np

# Mount Google Drive at /content/drive so files stored there are reachable
# through the regular filesystem paths used later in this notebook.
drive.mount('/content/drive')

# Function to set up the generator pipeline
def setup_generator(model_name="EleutherAI/gpt-j-6B"):
    """Create a ``text-generation`` pipeline for the given model.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.
            Defaults to ``"EleutherAI/gpt-j-6B"``.

    Returns:
        The object returned by ``pipeline('text-generation', ...)``.
    """
    return pipeline('text-generation', model=model_name)

# Set up text generation model (default model of setup_generator).
# NOTE: this module-level `generator` is read directly by main() below,
# so this cell must run before main() is called.
generator = setup_generator()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Final code being used

# Function to generate text based on prompts
def generate_articles(generator, prompts, max_length=600):
    """Generate one text continuation for each prompt.

    Args:
        generator: Callable text-generation pipeline. It is invoked as
            ``generator(prompt, max_length=..., num_return_sequences=1)``
            and must return a sequence whose first element is a dict with
            a ``'generated_text'`` key.
        prompts: Iterable of prompt strings.
        max_length: Value forwarded unchanged as the ``max_length`` keyword
            of every generator call.

    Returns:
        A list of ``(prompt, generated_text)`` tuples in the same order as
        ``prompts``; empty when ``prompts`` is empty.
    """
    articles = []
    # enumerate(start=1) supplies the 1-based progress counter.
    for count, prompt in enumerate(prompts, start=1):
        # Echo only the first 50 characters of the prompt in the progress line.
        print(count, f"Generating article for prompt: {prompt[:50]}...")
        generated = generator(prompt, max_length=max_length, num_return_sequences=1)
        # Pair the prompt with the text of the single returned sequence.
        articles.append((prompt, generated[0]['generated_text']))
    return articles

# Function to extract the first tokens of each text (30 by default)
def get_first_30_tokens(texts, generator, num_tokens=30):
    """Truncate each text to its leading tokens and return them as strings.

    Args:
        texts: Iterable of strings to truncate.
        generator: Object exposing a ``tokenizer`` attribute that provides
            ``tokenize(text)`` and ``convert_tokens_to_string(tokens)``.
        num_tokens: Number of leading tokens to keep from each text.
            Defaults to 30, matching the function name.

    Returns:
        A list with one string per input text, built from at most
        ``num_tokens`` tokens. Texts with fewer tokens are kept whole.
    """
    tokenizer = generator.tokenizer
    return [
        # Slice the token list first, then convert the kept tokens back to text.
        tokenizer.convert_tokens_to_string(tokenizer.tokenize(text)[:num_tokens])
        for text in texts
    ]

# Main function to load data, generate articles, and save them
def main(input_file, output_file, sample_size=130):
    """Read source articles, generate continuations, and write them to Excel.

    Relies on the module-level ``generator`` pipeline, which must already
    exist when this function is called.

    Args:
        input_file: Path of the Excel workbook to read. It must contain a
            ``text`` column.
        output_file: Path of the Excel workbook to write, with columns
            ``Prompt`` and ``Generated Article``.
        sample_size: Maximum number of articles to process. When the input
            has more usable rows, this many are sampled at random.
            Defaults to 130.
    """
    # Load data
    df = pd.read_excel(input_file)
    # Empty cells are read as NaN, which is not a usable string prompt;
    # drop those rows before sampling so the tokenizer only sees text.
    df = df.dropna(subset=['text'])
    if len(df) > sample_size:
        df = df.sample(n=sample_size)  # Randomly sample `sample_size` articles
    texts = df['text'].tolist()

    # Extract the first 30 tokens as prompts
    prompts = get_first_30_tokens(texts, generator)

    # Generate articles
    articles = generate_articles(generator, prompts)

    # Save generated articles to a new DataFrame and then to Excel
    articles_df = pd.DataFrame(articles, columns=['Prompt', 'Generated Article'])
    articles_df.to_excel(output_file, index=False)
    print(f"Generated articles saved to {output_file}")

# File paths (adjust as necessary).
# input_file is the Excel workbook read by main(); output_file is the
# workbook main() writes the generated articles to.
input_file = '/content/drive/My Drive/ML Project data/test_CNN_Article - Cleaned - Copy.xlsx'
output_file = '/content/drive/My Drive/ML Project data/output_file_GPT-J.xlsx'

# Run the full load -> generate -> save flow when executed directly.
if __name__ == "__main__":
    main(input_file, output_file)


Token indices sequence length is longer than the specified maximum sequence length for this model (7292 > 2048). Running this sequence through the model will result in indexing errors
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1 Generating article for prompt: Story highlightsThe siege of the U.S. Embassy in I...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2 Generating article for prompt:  (CNN)A second defendant has signed a guilty plea ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


3 Generating article for prompt: Story highlightsRonaldo to leave Real Madrid for J...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


4 Generating article for prompt:  (CNN)Azerbaijan, Turkey and Armenia are the worst...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


5 Generating article for prompt: Story highlightsAsylum seekers in England's North ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


6 Generating article for prompt:  (CNN)Starter Ian Anderson threw five no-hit, scor...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


7 Generating article for prompt:  (CNN)The British government is ready to intervene...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


8 Generating article for prompt: Story highlightsConstruction has been suspended on...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


9 Generating article for prompt: Story highlightsJean-Francois Jalkh stands aside f...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


10 Generating article for prompt:  (CNN)Neymar netted his 68th goal for Brazil in it...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


11 Generating article for prompt:  (CNN)Sean Payton, who led the New Orleans Saints ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


12 Generating article for prompt: Story highlightsThe Tigers scored the game-winner ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


13 Generating article for prompt:  (CNN)More than 100 members of the entertainment i...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


14 Generating article for prompt: Story highlightsRoger Federer beats Tomas Berdych ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


15 Generating article for prompt:  (CNN)State investigators in North Carolina are lo...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


16 Generating article for prompt:  (CNN)When the Covid-19 pandemic struck Indonesia'...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


17 Generating article for prompt: Story highlightsManchester United beat QPR 4-0 at ...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


18 Generating article for prompt: Story highlightsA UK boy, 11, dressed as "Fifty Sh...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


19 Generating article for prompt: Story highlightsReal Madrid held to 0-0 draw by Va...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


20 Generating article for prompt: Story highlights British Prime Minister David Came...
Generated articles saved to /content/drive/My Drive/ML Project data/output_file_GPT-J.xlsx
