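This notebook generates a synthetic named-entity-recognition dataset: sentences that mention mountains are produced with ChatGPT through the OpenAI API, then tokenized and labeled with B-MOUNT / I-MOUNT / O tags.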

In [1]:
import json
import os
import random
import re

from openai import OpenAI
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer

In [2]:
# Read the API key from the environment so the secret is never stored in the
# notebook (or committed with it). When the variable is unset the key falls
# back to an empty string, which is what the notebook used before.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Load one mountain name per line. Blank lines (e.g. a trailing newline at the
# end of the file) are dropped so an empty string can never be sampled into a
# prompt.
with open('mountain_names.txt', 'r', encoding='utf-8') as file:
    mountain_names = [name for name in (line.strip() for line in file) if name]

In [4]:
# Tokenizer whose sub-word tokens the generated labels are aligned to.
# (BertTokenizer with "bert-base-uncased" was the alternative considered.)
NER_TOKENIZER_CHECKPOINT = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(NER_TOKENIZER_CHECKPOINT)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
def _strip_entity_markers(sentence):
    """
    Remove every square bracket from `sentence` and locate the bracketed names.

    Returns a tuple `(cleaned_sentence, spans)` where `spans` is a list of
    `(start, end)` character offsets into `cleaned_sentence`, one per
    non-empty `[...]` group, in order of appearance.
    """
    pieces = []
    spans = []
    cursor = 0      # position in `sentence` up to which text has been consumed
    clean_len = 0   # length of the bracket-free text emitted so far

    for match in re.finditer(r'\[(.*?)\]', sentence):
        # Stray (unmatched) brackets are stripped from every piece as well, so
        # offsets are measured against the text that is actually kept.
        before = re.sub(r'\[|\]', '', sentence[cursor:match.start()])
        name = re.sub(r'\[|\]', '', match.group(1))

        pieces.append(before)
        clean_len += len(before)

        # An empty or whitespace-only group such as "[]" is not an entity.
        if name.strip():
            spans.append((clean_len, clean_len + len(name)))

        pieces.append(name)
        clean_len += len(name)
        cursor = match.end()

    pieces.append(re.sub(r'\[|\]', '', sentence[cursor:]))
    return ''.join(pieces), spans


def _bio_labels(tokens, entity_token_lists):
    """
    Build one label per token: "B-MOUNT" / "I-MOUNT" for entity tokens, else "O".

    Each entity's token sequence is searched for in `tokens`, starting right
    after the previously labeled entity. An entity whose tokens are not found
    is left unlabeled.
    """
    labels = ["O"] * len(tokens)
    search_from = 0

    for entity_tokens in entity_token_lists:
        width = len(entity_tokens)
        if width == 0:
            # Nothing to match; an empty list would "match" at any position.
            continue

        for pos in range(search_from, len(tokens) - width + 1):
            if tokens[pos:pos + width] == entity_tokens:
                labels[pos] = "B-MOUNT"
                labels[pos + 1:pos + width] = ["I-MOUNT"] * (width - 1)
                search_from = pos + width
                break

    return labels


def generate_sentences(prompt, n, max_tokens):
    """
    Generate sentences with named entities in square brackets through ChatGPT
    and convert them into tokenized, labeled samples.

    Args:
        prompt: User prompt sent to the model.
        n: Number of completions requested for the prompt.
        max_tokens: Completion length limit passed to the API.

    Returns:
        A list of dicts `{"tokens": [...], "labels": [...]}`, one per
        completion that contains text. `tokens` is the tokenizer output for
        the sentence with brackets removed; `labels` has the same length and
        holds "B-MOUNT", "I-MOUNT" or "O". Completions with no text are
        skipped.
    """
    # Send a request to ChatGPT
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": "You are a dataset generator for named entity recognition. Do not communicate with user in your answers."},
            {"role": "user",
             "content": prompt},
        ],
        max_tokens=max_tokens,
        n=n,
        temperature=1,
    )

    # A choice may carry no text content; skip those instead of crashing.
    sentences = [
        choice.message.content
        for choice in response.choices
        if choice.message.content
    ]

    tokenized_data = []
    for sentence in sentences:
        cleaned_sentence, entities = _strip_entity_markers(sentence)
        tokenized_sentence = tokenizer.tokenize(cleaned_sentence)

        # Tokenize each entity on its own so it can be located as a token
        # sub-sequence of the full sentence.
        entity_token_lists = [
            tokenizer.tokenize(cleaned_sentence[start:end])
            for start, end in entities
        ]
        labels = _bio_labels(tokenized_sentence, entity_token_lists)

        tokenized_data.append({"tokens": tokenized_sentence, "labels": labels})

    return tokenized_data

In [6]:
def generate_random_prompt(mountain_names):
    """
    Generate a random prompt with a few mountain names from the list.

    Between 1 and 4 distinct names are drawn (fewer when the list itself has
    fewer than 4 entries) and embedded in an instruction asking for sentences
    with the names enclosed in square brackets.

    Args:
        mountain_names: Sequence of mountain names to sample from.

    Returns:
        The prompt string.

    Raises:
        ValueError: If `mountain_names` is empty.
    """
    if not mountain_names:
        raise ValueError("mountain_names must contain at least one name")

    # Never ask for more names than are available; random.sample would raise.
    num_mountains = random.randint(1, min(4, len(mountain_names)))
    selected_mountains = random.sample(mountain_names, num_mountains)
    prompt = f"Generate one or few sentences mentioning these mountains: {', '.join(selected_mountains)}. Enclose mountain names in square brackets."
    return prompt

In [7]:
# Smoke test: request a single completion and show its tokens with the
# matching labels on the line below.
prompt = generate_random_prompt(mountain_names)
tokenized_data = generate_sentences(prompt, n=1, max_tokens=200)
for field in ('tokens', 'labels'):
    print(" ".join(tokenized_data[0][field]))

The breath ##taking views from Mount Mitchell attract hike ##rs from all over , while the ma ##je ##stic Gross ##g ##lock ##ner stands as the highest peak in Austria . In Ukraine , the stunning scenery of Hu ##tsu ##ls ##ka Mountain offers a glimpse into the region ' s rich cultural heritage , and adventure ##rs often seek the challenge of climbing Cho ##gol ##isa , known for its striking beauty and difficult ascent .
O O O O O B-MOUNT I-MOUNT O O O O O O O O O O O O B-MOUNT I-MOUNT I-MOUNT I-MOUNT O O O O O O O O O O O O O O O B-MOUNT I-MOUNT I-MOUNT I-MOUNT I-MOUNT O O O O O O O O O O O O O O O O O O O O O B-MOUNT I-MOUNT I-MOUNT O O O O O O O O O O


Now generate the training and validation datasets.

In [8]:
# Generation settings for the training split.
num_samples = 1000
max_tokens = 200
sentences_per_prompt = 1

# One API request per iteration: build a fresh random prompt, generate the
# labeled sentences for it and collect them.
dataset = []
for _ in tqdm(range(num_samples)):
    dataset.extend(
        generate_sentences(
            generate_random_prompt(mountain_names),
            n=sentences_per_prompt,
            max_tokens=max_tokens,
        )
    )

100%|██████████| 1000/1000 [20:15<00:00,  1.22s/it]


In [9]:
# Persist the training samples as indented JSON, keeping non-ASCII characters
# as they are.
with open('train_dataset.json', mode='w', encoding='utf-8') as train_file:
    json.dump(dataset, train_file, indent=4, ensure_ascii=False)

In [10]:
# Validation split: same procedure as for the training data, with fewer
# samples. `sentences_per_prompt` and `max_tokens` keep the values set for the
# training run.
num_samples = 100

val_dataset = []
for _ in tqdm(range(num_samples)):
    val_dataset.extend(
        generate_sentences(
            generate_random_prompt(mountain_names),
            n=sentences_per_prompt,
            max_tokens=max_tokens,
        )
    )

# Persist the validation samples as indented JSON.
with open('val_dataset.json', mode='w', encoding='utf-8') as val_file:
    json.dump(val_dataset, val_file, indent=4, ensure_ascii=False)

100%|██████████| 100/100 [02:07<00:00,  1.28s/it]
