In [1]:
import os
import pandas as pd
import base64
import mimetypes
from openai import OpenAI
from dotenv import load_dotenv
import json
from datasets import Dataset, ClassLabel, Sequence, DatasetDict
from tqdm import tqdm
from pathlib import Path
from huggingface_hub import notebook_login

# Provide necessary API keys and tokens

Create a `.env` file and add your OpenAI API key and Hugging Face token like so:
```plaintext
OPENAI_API_KEY=your_openai_api_key_here
HF_TOKEN=your_hugging_face_token_here
```
You will need a Hugging Face token with write privileges to push the created dataset to the Hugging Face Hub.

In [2]:
# Load the variables from the local .env file into the process environment.
load_dotenv()

# Shared OpenAI client; text_only_labeling sends its requests through it.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Authenticate against the Hugging Face Hub (renders a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Global variables

In [3]:
# Directory containing the input CSV files.
data_path = Path('data')

# NER label set; create_dataset casts the 'ner_tags' column to these classes.
# The first entry is the fallback tag used in the labeling prompt.
labels = ClassLabel(
    names=[
        'N/A',
        'PRICE',
        'PRODUCT',
        'DISCOUNT_PERCENTAGE',
        'WEIGHT',
        'DATE',
    ]
)

# Auxiliary functions

In [6]:
def text_only_labeling(text, tags):
    """Ask the OpenAI chat model to NER-tag ``text`` using ``tags``.

    The request uses JSON mode (``response_format={'type': 'json_object'}``),
    which makes the model answer with a JSON *object*. The prompt therefore
    asks explicitly for ``{"entities": [...]}``, the shape that
    ``parse_response`` reads, instead of a bare top-level array.

    Args:
        text: The text whose entities should be labeled.
        tags: Sequence of tag names. ``tags[0]`` is presented to the model as
            the fallback tag for entities that match nothing else.

    Returns:
        The raw message content of the first completion choice (expected to
        be a JSON string).
    """
    prompt = f"""
    You are an NER tagging assistant. Your task is to label ALL entities in the text based on the tags provided.
    Here are the tags: {', '.join(tags)}
    Use {tags[0]} whenever an entity does not match any other tag.

    For each entity, return the entity and its tag.

    Input text: "{text}"

    Respond with a JSON object in exactly this format:
    {{
        "entities": [
            {{ "entity": str, "tag": str }},
            ...
        ]
    }}
    """

    response = client.chat.completions.create(
        messages=[{'role': 'user', 'content': prompt}],
        model='gpt-4-1106-preview',
        temperature=0,  # keep the labeling as repeatable as the API allows
        response_format={'type': 'json_object'},
    )

    return response.choices[0].message.content

def parse_response(response):
    """Extract parallel entity/tag lists from the model's JSON reply.

    Two payload shapes are accepted: an object with an ``"entities"`` list
    (``{"entities": [{"entity": ..., "tag": ...}, ...]}``) or a bare
    top-level list of such entries. Entries that are not objects, or that
    lack ``"entity"`` or ``"tag"``, are reported and skipped.

    Args:
        response: JSON string returned by the model.

    Returns:
        A ``(tokens, tags)`` tuple of equally long lists. ``(None, None)`` is
        returned when ``response`` is not valid JSON or holds no entity list,
        so callers can always unpack the result.
    """
    try:
        parsed_data = json.loads(response)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a missing (None) message content.
        print(f'Error parsing JSON: {e}')
        print(response)
        return None, None

    if isinstance(parsed_data, dict):
        entities = parsed_data.get('entities')
    else:
        # Tolerate a bare top-level array.
        entities = parsed_data

    if not isinstance(entities, list):
        print('Unexpected response format')
        print(response)
        return None, None

    tokens = []
    tags = []
    for entry in entities:
        is_mapping = isinstance(entry, dict)
        entity = entry.get('entity') if is_mapping else None
        tag = entry.get('tag') if is_mapping else None
        if entity is not None and tag is not None:
            tokens.append(entity)
            tags.append(tag)
        else:
            print('Unexpected response format')
            print(response)

    return tokens, tags

def generate_data_from_csv(path, tags):
    """Label the text of every view found in one CSV file.

    Column names are lower-cased. Rows are grouped by whichever of the
    'application name' and 'seen timestamp' columns exist, and the 'text'
    cells of each group are joined with spaces into one string. Each
    non-empty string is sent to ``text_only_labeling`` and the reply is
    parsed with ``parse_response``.

    Args:
        path: Path of the CSV file to read.
        tags: Sequence of tag names passed unchanged to every labeling call.

    Returns:
        A ``(tokens_list, tags_list)`` tuple with one inner list per
        successfully labeled view. Both lists are empty when the file has no
        'text' column.
    """
    print(f'generating data from {path}')

    tokens_list = []
    tags_list = []

    df = pd.read_csv(path)
    df.columns = df.columns.str.lower()

    if 'text' not in df.columns:
        print('no text column in csv')
        # Return empty lists (not None) so callers can always unpack.
        return tokens_list, tags_list

    grouping_columns = []
    for col_name in ('application name', 'seen timestamp'):
        if col_name in df.columns:
            print(f'grouping by {col_name}')
            grouping_columns.append(col_name)

    if grouping_columns:
        grouped_dfs = [group for _, group in df.groupby(grouping_columns)]
    else:
        # groupby() raises on an empty key list; treat the file as one view.
        grouped_dfs = [df]

    # astype(str) keeps the join from failing on numeric cells.
    view_texts_coalesced = [
        " ".join(group['text'].dropna().astype(str)) for group in grouped_dfs
    ]
    view_texts_filtered = [s for s in view_texts_coalesced if s != ""]

    for text in tqdm(view_texts_filtered):
        # Label the current view, and keep `tags` (the tag set) untouched by
        # binding the parsed result to separate names.
        response = text_only_labeling(text, tags)
        parsed = parse_response(response)
        if parsed is None:
            continue
        view_tokens, view_tags = parsed

        if view_tokens is not None and view_tags is not None:
            tokens_list.append(view_tokens)
            tags_list.append(view_tags)

    return tokens_list, tags_list

def create_dataset(tokens_list, tags_list, seed=None):
    """Build a train/test/valid ``DatasetDict`` from labeled samples.

    The 'ner_tags' column is cast to a sequence of the module-level ``labels``
    class label, and an integer 'id' column (0..n-1) is added. 20% of the
    samples go to 'test'; 20% of the remainder goes to 'valid' (roughly a
    64/20/16 train/test/valid split).

    Args:
        tokens_list: One list of entity strings per sample.
        tags_list: One list of tag names per sample, parallel to
            ``tokens_list``.
        seed: Optional seed forwarded to both ``train_test_split`` calls so
            the split can be reproduced. ``None`` (the default) keeps the
            previous unseeded behavior.

    Returns:
        A ``DatasetDict`` with the keys 'train', 'test' and 'valid'.
    """
    dataset = Dataset.from_dict({'tokens': tokens_list, 'ner_tags': tags_list})
    dataset = dataset.cast_column('ner_tags', Sequence(feature=labels))
    dataset = dataset.add_column('id', list(range(len(dataset))))

    trainvalid_test = dataset.train_test_split(test_size=0.2, seed=seed)
    train_valid = trainvalid_test['train'].train_test_split(test_size=0.2, seed=seed)
    split_dataset = DatasetDict({
        'train': train_valid['train'],
        'test': trainvalid_test['test'],
        'valid': train_valid['test']}
    )

    return split_dataset

# Data generation

This part of the notebook reads every CSV file in the `data` directory and attempts to generate a dataset from them.

In [None]:
# Reuse `data_path` and `labels` from the "Global variables" cell instead of
# redefining them here, so the tags sent to the model are always the ones
# create_dataset casts the 'ner_tags' column to.
ner_tags = labels  # name kept for any later cell that refers to it

tokens_list = []
tags_list = []

# sorted() makes the processing order independent of the filesystem.
for csv_file in sorted(data_path.glob('*.csv')):
    result = generate_data_from_csv(csv_file, ner_tags.names)
    if result is None:
        # Nothing usable came back for this file; skip it instead of
        # failing on the unpack below.
        continue
    tokens_list_aux, tags_list_aux = result
    tokens_list += tokens_list_aux
    tags_list += tags_list_aux

dataset = create_dataset(tokens_list, tags_list)

generating data from data\18789327023.csv
grouping by application name
grouping by seen timestamp


 14%|████████████████████████▋                                                                                                                                                    | 1/7 [00:08<00:52,  8.80s/it]

# Dataset saving

In [88]:
# Push the generated dataset to a private repository on the Hugging Face Hub.
dataset.push_to_hub(repo_id="murmuras_csv_", private=True)










ploading the dataset shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.46it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kdybek/test_dataset/commit/c87c208bb09661ad3b13c7c352f3373e66571aa9', commit_message='Upload dataset', commit_description='', oid='c87c208bb09661ad3b13c7c352f3373e66571aa9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/kdybek/test_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='kdybek/test_dataset'), pr_revision=None, pr_num=None)