# Generate the dataset

In [1]:
import pandas as pd
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import torch

In [2]:
# Load the tokenizer and the language-model head from the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device_index = 0
else:
    device_index = -1

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device_index,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [3]:
# Read the input table; everything downstream needs its 'location' column.
df = pd.read_csv("geodata.csv")
if "location" not in df.columns:
    raise ValueError("CSV file must contain a 'location' column.")

# Distinct location values with missing entries removed.
location_values = df["location"].dropna()
locations = location_values.unique()

# Accumulator for the generated rows.
output_data = []

In [4]:
# Number of distinct, non-null location values taken from the CSV.
len(locations)

154

In [5]:
max_retries = 10
SEED = 42

# Fragments that begin with one of these prefixes are treated as list markers
# or numbering residue and dropped ("1" already covers "10").
LIST_MARKER_PREFIXES = ("-", "•", "1", "2", "3", "4", "5", "6", "7", "8", "9")

# Seed torch's global RNG so the sampled generations are repeatable when the
# cell is re-run on the same software/hardware stack.
torch.manual_seed(SEED)

# Start from an empty list so re-running this cell does not append a second
# copy of every location to rows collected by an earlier run.
output_data = []

for loc in locations:
    prompt = f"Here are five interesting facts about {loc}. Each fact is a complete sentence and return the fact only:"
    success = False

    for _ in range(max_retries):
        results = generator(
            prompt,
            max_length=150,  # total length budget: prompt tokens + generated tokens
            num_return_sequences=1,
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            # Return only the continuation instead of stripping the prompt
            # from the full text with str.replace afterwards.
            return_full_text=False,
            # GPT-2 has no pad token; passing it explicitly avoids the
            # per-call "Setting `pad_token_id`" warning.
            pad_token_id=tokenizer.eos_token_id,
        )
        generated_part = results[0]["generated_text"].strip()

        # Split on ". " and normalize each kept fragment to end with one period.
        sentences = []
        for raw_fragment in generated_part.split(". "):
            fragment = raw_fragment.strip()
            if not fragment or fragment.startswith(LIST_MARKER_PREFIXES):
                continue
            sentences.append(fragment.rstrip(".") + ".")

        # Accept the attempt once at least five sentences survive the filter;
        # only the first five are stored.
        if len(sentences) >= 5:
            output_data.append([loc] + sentences[:5])
            success = True
            break

    if not success:
        print(f"⚠️ Failed to generate 5 facts for location: {loc}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [6]:
# One row per location: the location name followed by its five sentences.
fact_columns = [f"sentence {i}" for i in range(1, 6)]
output_df = pd.DataFrame(output_data, columns=["location", *fact_columns])

In [7]:
# Bare expression on the cell's last line, so the notebook displays the frame.
output_df

Unnamed: 0,location,sentence 1,sentence 2,sentence 3,sentence 4,sentence 5
0,Afghanistan,Afghanistan is one of the world's most dangero...,"In 2014, the Taliban were the most powerful in...",The US has been involved in a number of human ...,The US has repeatedly denied responsibility fo...,"In 2015,."
1,Albania,The following is a list of all of Albania's fa...,Albania has a very large and highly developed ...,"Albania has a huge, extensive and well-functio...",Albania has the largest and most advanced mode...,Albania has a great deal of strategic and cult...
2,Algeria,The first two are very different.,The third is a different one.,This can be summarized in three broad categori...,The second one is the second one.,This is a very complicated situation and can b...
3,Amsterdam,"""The Amsterdam airport was bombed.""\n\n1.",The Netherlands has a population of about 1.3 ...,The Dutch are a majority-Muslim country with a...,The Netherlands is one of the top 10 most expe...,The Netherlands has been one of the richest co...
4,Antwerp,Antwerp is a city in central Belgium.,Antwerp is only 5km from the main city of Brus...,There are 14 distinct cantons.,"The city's streets are lined with shops, cafés...","The Antwerp border is a natural, narrow river ..."
...,...,...,...,...,...,...
149,Vienna,"This is an island, it is an island in a sea, i...","It is a country, it is a country in the south\...",The same island in the same region.\n\n4.,The same island.\n\n5.,The same island.\n\n6.
150,Vietnam,We have been in an extremely difficult situati...,The South Vietnamese and South Vietnamese gove...,We have been in a very difficult situation wit...,We have been in a very difficult situation wit...,We are in a very difficult situation with the ...
151,Wales,Wales has a population of 9 million.\n\n2.,Wales has a population of 3.9 billion\n\n3.,Wales has an income of $12.2 billion (at the t...,Wales has a population of 2.6 billion (at the ...,Wales has an income of $25.6 billion (at the t...
152,Warsaw,"The Polish government was founded in 1844, whi...",The nation of Poland was founded in 1844.,"The first Polish president, the late King of P...",The Polish government was founded in 1844.,"The first Polish dictator, the late King of Po..."


In [8]:
# Print the column dtypes and non-null counts of the generated frame.
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    154 non-null    object
 1   sentence 1  154 non-null    object
 2   sentence 2  154 non-null    object
 3   sentence 3  154 non-null    object
 4   sentence 4  154 non-null    object
 5   sentence 5  154 non-null    object
dtypes: object(6)
memory usage: 7.3+ KB


In [9]:
# Keep only the first row for each location. `~` negates the duplicate mask
# (instead of comparing it against False), and .copy() yields an independent
# frame so later column assignments do not write into a slice of the
# pre-deduplication frame.
output_df = output_df.loc[~output_df["location"].duplicated(keep="first")].copy()

In [10]:
# Row count of the frame after duplicate locations were dropped.
len(output_df['location'])

154

# Clean the dataset

In [11]:
import re

In [12]:
def clean_sentence(text):
    """Return the first real sentence found in ``text``.

    The text is split at a period followed by whitespace and at line breaks.
    Fragments that are list markers are skipped: a bare number such as "1"
    (left behind when the split consumes the period of "1. "), a fragment
    starting with digits and a period, or one starting with "-" or "•".

    Args:
        text: Raw generated text for one cell. Anything that is not a string
            (for example a NaN from a missing value) is treated as empty.

    Returns:
        The first non-marker fragment. A period is appended unless the
        fragment already ends with ".", "!" or "?", optionally followed by
        closing quotes or brackets. Returns "" when no such fragment exists.
    """
    if not isinstance(text, str):
        return ""

    # A line break also ends a sentence, so a trailing "\n\n3." marker is
    # separated from the sentence even when no period precedes the newline.
    fragments = re.split(r'\.\s+|\s*\n\s*', text.strip())

    for sent in fragments:
        sent = sent.strip()
        if not sent:
            continue
        # `\d+$` catches numbering whose period was eaten by the split.
        if re.match(r'^(\d+\.|\d+$|\-|\•)', sent):
            continue
        # Do not stack a period after existing terminal punctuation
        # (e.g. `"... was bombed."` or `Really?`).
        if not re.search(r'[.!?]["\')\]]*$', sent):
            sent += '.'
        return sent
    return ""  # No valid sentence found

In [13]:
# Every column except the location key holds generated text to clean.
columns_to_clean = [name for name in output_df.columns if name != "location"]

for col in columns_to_clean:
    cleaned = output_df[col].apply(clean_sentence)
    output_df[col] = cleaned

In [14]:
# Write the cleaned frame to geo_facts.csv; index=False leaves the row index out of the file.
output_df.to_csv("geo_facts.csv", index=False)