In [1]:
import pandas as pd
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import torch

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [3]:
df = pd.read_csv("geodata.csv")
if "location" not in df.columns:
    raise ValueError("CSV file must contain a 'location' column.")
locations = df["location"].dropna().unique()
output_data = []

In [4]:
len(locations)

154

In [5]:
max_retries = 10

for loc in locations:
    prompt = f"Here are five interesting facts about {loc}. Each fact is a complete sentence and return the fact only:"
    success = False

    for attempt in range(max_retries):
        results = generator(
            prompt,
            max_length=150,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95
        )
        text = results[0]["generated_text"]

        # Clean and split sentences
        generated_part = text.replace(prompt, "").strip()
        sentences = generated_part.split(". ")
        sentences = [
            s.strip().rstrip(".") + "."
            for s in sentences
            if s.strip() and not s.strip().startswith(("-", "•", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))
        ]

        # Check if we got exactly 5 clean sentences
        if len(sentences) >= 5:
            output_data.append([loc] + sentences[:5])
            success = True
            break

    if not success:
        print(f"⚠️ Failed to generate 5 facts for location: {loc}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [6]:
output_df = pd.DataFrame(output_data, columns=["location", "sentence 1", "sentence 2", "sentence 3", "sentence 4", "sentence 5"])
output_df.to_csv("geo_facts.csv", index=False)

In [7]:
output_df

Unnamed: 0,location,sentence 1,sentence 2,sentence 3,sentence 4,sentence 5
0,Afghanistan,The Taliban is the most advanced of the Taliba...,The Taliban's security forces are strong and c...,The majority of those killed in the US-led inv...,It is estimated that the Taliban has killed ar...,troops.\n\n4.
1,Albania,"In August 1996, the Albanian government was in...",The country remained in power.\n\nIn August 19...,The country remained in power.,"In July 1996, Albanian authorities arrested ab...",Some of them were convicted and sentenced to p...
2,Algeria,The government of Algeria is the only country ...,The country has been a key player in the histo...,The country is the only country to have succes...,The country has been a key player in the histo...,The country is the only country to have succes...
3,Amsterdam,The city is still a large city.\n\n2.,The city is still a small city.\n\n3.,The city is still a large city.\n\n4.,The city is still a small city.\n\n5.,The city is still a small city.\n\nHere's a sh...
4,Antwerp,"In Antwerp, a man's name was chosen.","In some countries, such as the Netherlands, th...",The bishop said he was a man of God.,"If a man is a man of God, he must be a bishop....","According to the UNESCO World Heritage List, A..."
...,...,...,...,...,...,...
149,Vienna,That the population of Vienna was approximatel...,That the population of Vienna was approximatel...,That the population of Vienna was approximatel...,That the population of Vienna was approximatel...,That the population of Vienna was approximatel...
150,Vietnam,The American public has never been trained to ...,The American public has never been trained to ...,The American public has never been trained to ...,The American public has never been trained to ...,The American public has never been trained to ...
151,Wales,Wales was a nation of great wealth.,"For the British, it was all about the British....",All Wales has been settled by the Crown.\n\n2.,All Welsh people have been settled by the Comm...,The Crown ruled Wales for over three millennia...
152,Warsaw,We're all in Warsaw.,And Warsaw is so beautiful that we're pretty s...,Warsaw is the capital of the Czech Republic.\n...,The city has a population of 1.8 billion peopl...,"Poland's population is 1.4 billion people, the..."


In [8]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    154 non-null    object
 1   sentence 1  154 non-null    object
 2   sentence 2  154 non-null    object
 3   sentence 3  154 non-null    object
 4   sentence 4  154 non-null    object
 5   sentence 5  154 non-null    object
dtypes: object(6)
memory usage: 7.3+ KB


In [9]:
output_df = output_df[output_df['location'].duplicated(keep='first') == False]

In [10]:
len(output_df['location'])

154