In [1]:
import os
import pandas as pd
import base64
import mimetypes
from openai import OpenAI
from dotenv import load_dotenv
import json
from datasets import Dataset, ClassLabel, Sequence, DatasetDict
from tqdm import tqdm
from pathlib import Path
from huggingface_hub import notebook_login
import pickle

# Provide necessary API keys and tokens

Create a `.env` file and add your OpenAI API key and Hugging Face token like so:
```plaintext
OPENAI_API_KEY=your_openai_api_key_here
HF_TOKEN=your_hugging_face_token_here
```
You will need a Hugging Face token with write privileges to push the created dataset to the Hugging Face Hub.

In [2]:
# Load the variables defined in the local `.env` file into the process environment.
load_dotenv()

# Shared OpenAI client; the labeling helper defined below sends its requests through it.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Auxiliary functions

In [3]:
def text_only_labeling(text, tags):
    """Ask the OpenAI chat model to NER-tag ``text`` with the given tag set.

    Args:
        text: The text chunk to label.
        tags: Non-empty sequence of allowed tag names. ``tags[0]`` is presented
            to the model as the fallback tag for entities matching no other tag.

    Returns:
        The raw message content of the first completion choice. The model is
        asked for a JSON object of the form
        ``{"entities": [{"entity": ..., "tag": ...}, ...]}``.

    Raises:
        ValueError: If ``tags`` is empty.
    """
    if len(tags) == 0:
        raise ValueError('at least one tag is required')

    # The request below uses JSON mode (`response_format={'type': 'json_object'}`),
    # which makes the model emit a JSON *object*. Asking for a bare array would
    # force the model to invent a wrapper key, so the expected object shape
    # (with an "entities" key, which `parse_response` understands) is spelled out.
    prompt = f"""
    You are an NER tagging assistant. Your task is to label ALL entities in the text based on the tags provided.
    Here are the tags: {', '.join(tags)}
    Use {tags[0]} whenever an entity does not match any other tag.

    For each entity, return the entity and its tag.

    Input text: "{text}"

    Respond with a JSON object in this format:
    {{
        "entities": [
            {{ "entity": str, "tag": str }},
            ...
        ]
    }}
    """

    response = client.chat.completions.create(
        messages=[{'role': 'user', 'content': prompt}],
        model='gpt-4-1106-preview',
        temperature=0,  # keep the labeling as deterministic as the API allows
        response_format={'type': 'json_object'},
    )

    return response.choices[0].message.content

def parse_response(response):
    """Parse the model's JSON reply into parallel entity and tag lists.

    Accepted JSON shapes:
      * a list of ``{"entity": ..., "tag": ...}`` objects,
      * an object with an ``"entities"`` key holding such a list,
      * a single ``{"entity": ..., "tag": ...}`` object.

    Args:
        response: JSON string returned by the model.

    Returns:
        A ``(tokens, tags)`` tuple of equally long lists, where ``tokens[i]``
        is the entity text and ``tags[i]`` its tag.

    Raises:
        ValueError: If ``response`` is not valid JSON, or if its structure does
            not match one of the accepted shapes.
    """
    try:
        parsed_data = json.loads(response)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a missing (None) response body.
        raise ValueError(f'Error parsing JSON: {e}, in response: {response}') from e

    if isinstance(parsed_data, list):
        entities = parsed_data
    elif isinstance(parsed_data, dict):
        # Either a wrapper object or a single bare entity.
        entities = parsed_data['entities'] if 'entities' in parsed_data else [parsed_data]
    else:
        # Scalars (strings, numbers, null) carry no entities at all.
        raise ValueError(f'Unexpected response format: {response}')

    if not isinstance(entities, list):
        raise ValueError(f'Unexpected response format: {response}')

    tokens = []
    tags = []

    for entity in entities:
        if not isinstance(entity, dict):
            raise ValueError(f'Unexpected response format: {response}')

        token = entity.get('entity')
        tag = entity.get('tag')
        if token is None or tag is None:
            raise ValueError(f'Unexpected response format: {response}')

        tokens.append(token)
        tags.append(tag)

    return tokens, tags

def chunk_string_by_words(text, chunk_size):
    """Split ``text`` into pieces of at most ``chunk_size`` whitespace-separated words.

    Words are obtained with ``str.split()`` and re-joined with single spaces,
    so the original spacing is not preserved. Returns the list of pieces.
    """
    words = text.split()
    chunked_text = []

    for start in range(0, len(words), chunk_size):
        piece = words[start:start + chunk_size]
        chunked_text.append(' '.join(piece))

    return chunked_text

def generate_data_from_csv(path, tags, chunk_size=64):
    """Label the text of one CSV file with the model and collect the results.

    Column names are normalised (spaces -> underscores, lower-cased). Rows that
    share the same ``application_name`` and ``seen_timestamp`` have their
    non-missing ``text`` values joined with spaces; each joined text is split
    into chunks of at most ``chunk_size`` words and every chunk is labeled.

    Args:
        path: Path of the CSV file to read.
        tags: Allowed tag names, passed unchanged to ``text_only_labeling``
            for every chunk.
        chunk_size: Maximum number of words per chunk sent to the model.

    Returns:
        ``(tokens_list, tags_list)``: one entry per successfully labeled chunk,
        each entry being the list of entities / tags found in that chunk.

    Raises:
        ValueError: If the ``text`` column or one of the grouping columns is
            missing from the CSV.
    """
    print(f'generating data from {path}')

    df = pd.read_csv(path)
    df.columns = df.columns.str.replace(' ', '_').str.lower()

    if 'text' not in df.columns:
        raise ValueError('no text column in csv')

    grouping_columns = ['application_name', 'seen_timestamp']

    for col_name in grouping_columns:
        if col_name not in df.columns:
            raise ValueError(f'no {col_name} column in csv')

    view_texts_chunked = []

    # NOTE: with pandas' default `dropna=True`, rows whose grouping key is
    # missing are not part of any group and are therefore not labeled.
    for _, group in df.groupby(grouping_columns):
        # `astype(str)` keeps the join from failing on non-string cells.
        view_text = ' '.join(group['text'].dropna().astype(str))
        if view_text != '':
            view_texts_chunked += chunk_string_by_words(view_text, chunk_size)

    tokens_list = []
    tags_list = []

    for text in tqdm(view_texts_chunked):
        try:
            response = text_only_labeling(text, tags)
            # Use distinct names here: rebinding `tags` would replace the
            # allowed tag set with the previous chunk's predictions.
            chunk_tokens, chunk_tags = parse_response(response)
        except Exception as e:
            # Best effort: report the failed chunk and carry on with the rest.
            print(e)
            continue

        tokens_list.append(chunk_tokens)
        tags_list.append(chunk_tags)

    return tokens_list, tags_list

def create_dataset(tokens_list, tags_list, ner_tags, seed=None):
    """Build a train/test/valid ``DatasetDict`` from labeled chunks.

    Args:
        tokens_list: List of per-chunk entity lists (the ``tokens`` column).
        tags_list: List of per-chunk tag lists, parallel to ``tokens_list``
            (the ``ner_tags`` column).
        ner_tags: Feature used as the element type of the ``ner_tags``
            sequence column (the notebook passes a ``ClassLabel``).
        seed: Optional seed forwarded to both ``train_test_split`` calls so the
            split can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        A ``DatasetDict`` with ``train``, ``test`` and ``valid`` splits. 20% of
        the rows go to ``test``; the remaining 80% are split again 80/20 into
        ``train`` and ``valid`` (roughly 64% / 16% of all rows).
    """
    dataset = Dataset.from_dict({'tokens': tokens_list, 'ner_tags': tags_list})
    dataset = dataset.cast_column('ner_tags', Sequence(feature=ner_tags))
    dataset = dataset.add_column('id', list(range(len(dataset))))

    # First carve out the test split, then split the rest into train / valid.
    trainvalid_test = dataset.train_test_split(test_size=0.2, seed=seed)
    train_valid = trainvalid_test['train'].train_test_split(test_size=0.2, seed=seed)

    split_dataset = DatasetDict({
        'train': train_valid['train'],
        'test': trainvalid_test['test'],
        'valid': train_valid['test'],
    })

    return split_dataset

def checkpoint_name(file_name):
    """Return the checkpoint file name for ``file_name``: its text form plus ``_checkpoint.pkl``."""
    return '{}_checkpoint.pkl'.format(file_name)

# Generate data

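This part of the notebook reads every CSV file in the `data` directory and labels its text with the model. The results for each file are saved as a pickle checkpoint next to it, so files that already have a checkpoint are skipped on later runs.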

In [None]:
# Directory scanned for input CSVs, and the label set offered to the model
# (the first name is the fallback tag used in the prompt).
data_path = Path('data')
label_names = ['N/A', 'PRICE', 'PRODUCT', 'DISCOUNT_PERCENTAGE', 'QUANTITY', 'DATE']
ner_tags = ClassLabel(names=label_names)

for csv_file in data_path.glob('*.csv'):
    checkpoint_path = checkpoint_name(csv_file)

    # An existing checkpoint means this CSV was already processed.
    if os.path.exists(checkpoint_path):
        print(f'skipping {csv_file}')
        continue

    try:
        tokens_list, tags_list = generate_data_from_csv(csv_file, ner_tags.names)
        with open(checkpoint_path, 'wb') as f:
            print(f'saving checkpoint {checkpoint_path}')
            pickle.dump((tokens_list, tags_list), f)
    except ValueError as e:
        # E.g. a CSV without the required columns: report it and move on.
        print(e)

generating data from data\18789327023.csv


 14%|████████████████████████▋                                                                                                                                                    | 1/7 [00:11<01:11, 11.95s/it]

In [None]:
tokens_list = []
tags_list = []

# Checkpoints are written next to their source CSVs (the name returned by
# `checkpoint_name` keeps the CSV's directory), i.e. inside `data_path` rather
# than the current working directory. Sorting makes the row order reproducible.
checkpoint_files = sorted(data_path.glob('*_checkpoint.pkl'))
if not checkpoint_files:
    raise FileNotFoundError(f'no checkpoint files found in {data_path}')

for pkl_file in checkpoint_files:
    # NOTE: pickle.load can execute arbitrary code; only load checkpoints
    # that were produced by this notebook.
    with open(pkl_file, 'rb') as f:
        file_tokens, file_tags = pickle.load(f)

    tokens_list += file_tokens
    tags_list += file_tags

dataset = create_dataset(tokens_list, tags_list, ner_tags)

# Save dataset

In [88]:
# Upload the split dataset built above to the Hugging Face Hub as a private repository.
# NOTE(review): the saved cell output refers to a different repo id than the name
# used here, so it is presumably from an earlier run — re-run to refresh it.
dataset.push_to_hub("murmuras_labeled_data_from_csv", private=True)










ploading the dataset shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.46it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kdybek/test_dataset/commit/c87c208bb09661ad3b13c7c352f3373e66571aa9', commit_message='Upload dataset', commit_description='', oid='c87c208bb09661ad3b13c7c352f3373e66571aa9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/kdybek/test_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='kdybek/test_dataset'), pr_revision=None, pr_num=None)