In [1]:
import os
import pandas as pd
import base64
import mimetypes
from openai import OpenAI
from dotenv import load_dotenv
import json
from datasets import Dataset, ClassLabel, Sequence, DatasetDict
from tqdm import tqdm
from pathlib import Path
from huggingface_hub import login
import pickle
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import random

# Provide necessary API keys and tokens

Create a `.env` file and add your OpenAI API key and Hugging Face token like so:
```plaintext
OPENAI_API_KEY=your_openai_api_key_here
HF_TOKEN=your_hugging_face_token_here
```
You will need a Hugging Face token with write privileges to push the created dataset to the Hugging Face Hub.

In [2]:
# Load the variables from the .env file into the process environment.
load_dotenv()

# OpenAI client used by the labeling helpers further down.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Authenticate with the Hugging Face Hub so the dataset can be pushed later.
login(token=os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Create directories

In [14]:
# Working directories: input CSVs, raw scraped text, labeled checkpoints.
csv_path = 'csv'
scrape_path = 'scrape'
checkpoints_path = 'checkpoints'

# Create each directory (and any missing parents); no error if it already exists.
for directory in (csv_path, scrape_path, checkpoints_path):
    Path(directory).mkdir(parents=True, exist_ok=True)

# Auxiliary scraping functions

In [4]:
def scrape_all_text(url, timeout=10):
    """Download a page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely and a single unresponsive
            host would stall the whole scraping loop.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed and
        fragments joined by single spaces, or ``''`` if anything goes wrong
        (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Scripts and styles are not human-readable content; drop them first.
        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()

        return soup.get_text(separator=' ', strip=True)

    except Exception as e:
        # Deliberately best-effort: one failing page must not abort the run.
        print(f'Error scraping {url}: {e}')
        return ''

def get_random_links(base_url, count=10, timeout=10):
    """Return a random sample of links found on a page.

    Args:
        base_url: Page whose ``<a href=...>`` links are collected.
        count: Maximum number of links to return.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        Up to ``count`` distinct links, chosen at random. Links that do not
        start with ``http`` are resolved against ``base_url``. Returns ``[]``
        if the page cannot be fetched or parsed (the error is printed).
    """
    try:
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        links = [a['href'] for a in soup.find_all('a', href=True)]

        full_links = [
            link if link.startswith('http') else requests.compat.urljoin(base_url, link)
            for link in links
        ]

        # De-duplicate before sampling so the same link cannot be drawn twice.
        unique_links = list(set(full_links))

        return random.sample(unique_links, min(count, len(unique_links)))

    except Exception as e:
        # Deliberately best-effort: report the problem and return nothing.
        print(f'Error fetching links from {base_url}: {e}')
        return []

def scrape_from_search(query, num, stop, pause):
    """Scrape the text of every URL returned by a web search.

    ``num``, ``stop`` and ``pause`` are forwarded unchanged to ``search``.
    Returns one string per result URL, in search order; the entry is
    whatever ``scrape_all_text`` returned for that URL.
    """
    result_urls = search(query, num=num, stop=stop, pause=pause)
    return [scrape_all_text(url) for url in tqdm(result_urls)]

# Scrape

In [5]:
# Search queries to scrape; each one is saved as '<query>.pkl' under scrape_path.
queries = [
    'shop italy',
    'sklep warszawa',
    'bookstore london',
    'shop france',
    'shop germany',
]

for query in queries:
    print(f'querying {query}')
    results = scrape_from_search(query, num=10, stop=10, pause=2)
    output_file = os.path.join(scrape_path, query) + '.pkl'
    with open(output_file, 'wb') as f:
        print(f'saving data scraped with query {query}')
        pickle.dump(results, f)

querying shop italy



0it [00:10,  1.04s/it]

saving data scraped with query shop italy
querying sklep warszawa


1it [00:05,  5.47s/it]

Error scraping https://warszawa.naszemiasto.pl/ulubione-sklepy-warszawiakow-tam-kochaja-robic-zakupy-oto/ar/c1-8899471: 403 Client Error: Forbidden for url: https://warszawa.naszemiasto.pl/ulubione-sklepy-warszawiakow-tam-kochaja-robic-zakupy-oto/ar/c1-8899471



0it [00:08,  1.19it/s]

Error scraping https://warsawsneakerstore.com/: 403 Client Error: Forbidden for url: https://warsawsneakerstore.com/
Error scraping https://warsawsneakerstore.com/menu/obuwie/meskie: 403 Client Error: Forbidden for url: https://warsawsneakerstore.com/menu/obuwie/meskie
Error scraping https://warsawsneakerstore.com/menu/nowosci: 403 Client Error: Forbidden for url: https://warsawsneakerstore.com/menu/nowosci
saving data scraped with query sklep warszawa
querying bookstore london


1it [00:05,  5.59s/it]

Error scraping https://www.visitlondon.com/things-to-do/shopping/books/top-bookshops-in-london: 403 Client Error: Forbidden for url: https://www.visitlondon.com/things-to-do/shopping/books/top-bookshops-in-london


3it [00:06,  1.42s/it]

Error scraping https://www.foyles.co.uk/: 403 Client Error: Forbidden for url: https://www.foyles.co.uk/
Error scraping https://www.waterstones.com/bookshops/piccadilly: 403 Client Error: Forbidden for url: https://www.waterstones.com/bookshops/piccadilly



0it [00:10,  1.07s/it]

saving data scraped with query bookstore london
querying shop france


3it [00:07,  1.87s/it]

Error scraping https://www.tripadvisor.com/Attractions-g187070-Activities-c26-France.html: 403 Client Error: Forbidden for url: https://www.tripadvisor.com/Attractions-g187070-Activities-c26-France.html



0it [00:15,  1.56s/it]

saving data scraped with query shop france
querying shop germany


4it [00:06,  1.07s/it]

Error scraping https://www.tripadvisor.com/Attractions-g187275-Activities-c26-Germany.html: 403 Client Error: Forbidden for url: https://www.tripadvisor.com/Attractions-g187275-Activities-c26-Germany.html


10it [00:10,  1.04s/it]

Error scraping https://www.etsy.com/market/shops_from_germany: 403 Client Error: Forbidden for url: https://www.etsy.com/market/shops_from_germany
saving data scraped with query shop germany





# Auxiliary data generation functions

In [15]:
def text_only_labeling(text, tags):
    """Ask the chat model to label every entity in ``text`` with one of ``tags``.

    Args:
        text: The text chunk to label.
        tags: Tag names; ``tags[0]`` is presented to the model as the
            fallback tag for entities that match nothing else.

    Returns:
        The raw message content returned by the model (expected to be a JSON
        string; it is not parsed or validated here).
    """
    # The request below uses JSON mode ('json_object'), which yields a JSON
    # *object*, so the prompt asks for an object with an "entities" list
    # instead of a bare array whose wrapper key the model would have to invent.
    prompt = f"""
    You are an NER tagging assistant. Your task is to label all entities in the text based on the tags provided.
    Here are the tags: {', '.join(tags)}
    Use {tags[0]} whenever an entity does not match any other tag. It is imperative that you label everything in the text. Don't make the entities too short.

    For each entity, return the entity and its tag.

    Input text: "{text}"

    Respond with a JSON object in this format:
    {{
        "entities": [
            {{ "entity": str, "tag": str }},
            ...
        ]
    }}
    """
    
    response = client.chat.completions.create(
        messages=[{'role': 'user', 'content': prompt}],
        model='gpt-4o',
        temperature=0,
        response_format={'type': 'json_object'},
    )

    return response.choices[0].message.content

def parse_response(response):
    """Parse the model's JSON reply into parallel entity and tag lists.

    Accepted shapes:
      * a list of ``{"entity": ..., "tag": ...}`` objects,
      * an object with an ``"entities"`` key holding such a list,
      * a single ``{"entity": ..., "tag": ...}`` object.

    Args:
        response: JSON text returned by the model.

    Returns:
        ``(tokens, tags)``: two lists of equal length, where ``tokens[i]`` is
        an entity and ``tags[i]`` is its tag.

    Raises:
        ValueError: If the text is not valid JSON or does not have one of the
            accepted shapes.
    """
    try:
        parsed_data = json.loads(response)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a non-string response such as None.
        raise ValueError(f'Error parsing JSON: {e}, in response: {response}') from e

    if isinstance(parsed_data, list):
        entities = parsed_data
    elif isinstance(parsed_data, dict):
        if 'entities' in parsed_data:
            entities = parsed_data['entities']
        else:
            entities = [parsed_data]
    else:
        # Bare strings, numbers, booleans or null carry no entities at all.
        raise ValueError(f'Unexpected response format: {response}')

    if not isinstance(entities, list):
        raise ValueError(f'Unexpected response format: {response}')

    tokens = []
    tags = []

    for entity in entities:
        if not isinstance(entity, dict):
            raise ValueError(f'Unexpected response format: {response}')

        token = entity.get('entity')
        tag = entity.get('tag')
        if token is None or tag is None:
            raise ValueError(f'Unexpected response format: {response}')

        tokens.append(token)
        tags.append(tag)

    return tokens, tags

def chunk_string_by_words(text, chunk_size=64):
    """Split ``text`` into pieces of at most ``chunk_size`` whitespace-separated words.

    Words inside each piece are re-joined with single spaces, so the original
    whitespace is not preserved. Returns the list of pieces in order.
    """
    words = text.split()
    pieces = []

    for start in range(0, len(words), chunk_size):
        window = words[start:start + chunk_size]
        pieces.append(' '.join(window))

    return pieces

def prepare_string_list(texts):
    """Drop empty strings from ``texts`` and split the rest into word chunks.

    Returns one flat list containing the chunks of every non-empty text, in
    input order.
    """
    chunks = []

    for text in texts:
        if text != '':
            chunks.extend(chunk_string_by_words(text))

    return chunks

def prepare_csv(path):
    """Load a CSV file and turn its ``text`` column into word chunks.

    Column names are normalised (spaces become underscores, lower-cased).
    Rows are grouped by ``application_name`` and ``seen_timestamp``; the
    non-missing ``text`` values of each group are joined with spaces and the
    result is passed through ``prepare_string_list``.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of text chunks.

    Raises:
        ValueError: If the ``text`` column or a grouping column is missing.
    """
    df = pd.read_csv(path)
    df.columns = df.columns.str.replace(' ', '_').str.lower()

    if 'text' not in df.columns:
        raise ValueError('no text column in csv')
    
    grouping_columns = ['application_name', 'seen_timestamp']

    for col_name in grouping_columns:
        if col_name not in df.columns:
            raise ValueError(f'no {col_name} column in csv')
    
    # Cast to str before joining: a text column that pandas parsed as numeric
    # would otherwise make str.join raise TypeError.
    view_texts_coalesced = [
        ' '.join(group['text'].dropna().astype(str))
        for _, group in df.groupby(grouping_columns)
    ]

    return prepare_string_list(view_texts_coalesced)

def generate_data(texts, ner_tags):
    """Label every text chunk and collect the parsed entities and tags.

    A chunk whose labeling or parsing raises is skipped: the exception is
    printed and processing continues with the next chunk.

    Returns:
        ``(tokens_list, tags_list)`` with one entry per successfully
        processed chunk.
    """
    all_tokens, all_tags = [], []

    for chunk in tqdm(texts):
        try:
            raw_reply = text_only_labeling(chunk, ner_tags)
            chunk_tokens, chunk_tags = parse_response(raw_reply)
        except Exception as e:
            print(e)
        else:
            all_tokens.append(chunk_tokens)
            all_tags.append(chunk_tags)

    return all_tokens, all_tags

def create_dataset(tokens_list, tags_list, ner_tags, test_size=0.2, validation_size=0.2, seed=None):
    """Build a train/validation/test ``DatasetDict`` from labeled examples.

    Args:
        tokens_list: One list of entity strings per example.
        tags_list: One list of tag names per example, parallel to
            ``tokens_list``.
        ner_tags: Feature the ``ner_tags`` column is cast to (wrapped in a
            ``Sequence``).
        test_size: Fraction of all examples held out as the test split.
        validation_size: Fraction of the remaining (non-test) examples held
            out as the validation split.
        seed: Optional seed forwarded to both ``train_test_split`` calls so
            the splits can be reproduced. ``None`` keeps the splits
            unseeded, as before.

    Returns:
        A ``DatasetDict`` with ``train``, ``test`` and ``validation`` splits.
        Each example has ``tokens``, ``ner_tags`` and a sequential ``id``.
    """
    dataset = Dataset.from_dict({'tokens': tokens_list, 'ner_tags': tags_list})
    dataset = dataset.cast_column('ner_tags', Sequence(feature=ner_tags))
    dataset = dataset.add_column('id', list(range(len(dataset))))
    
    # First carve out the test split, then take validation from what is left.
    trainvalid_test = dataset.train_test_split(test_size=test_size, seed=seed)
    train_valid = trainvalid_test['train'].train_test_split(test_size=validation_size, seed=seed)
    split_dataset = DatasetDict({
        'train': train_valid['train'],
        'test': trainvalid_test['test'],
        'validation': train_valid['test']}
    )
    
    return split_dataset

def checkpoint_name(path):
    """Return the checkpoint file path for an input file.

    The result is the input's file name without its extension, placed under
    ``checkpoints_path`` with a ``.pkl`` suffix.
    """
    stem = Path(path).stem
    return os.path.join(checkpoints_path, stem) + '.pkl'

def remove_irrelevant_data(tokens_list, tags_list, drop_prob=0.8):
    cleaned_tokens_list = []
    cleaned_tags_list = []

    for tokens, tags in zip(tokens_list, tags_list, strict=True):
        if not tags:
            continue
            
        if all(item == 'N/A' for item in tags) and random.random() < drop_prob:
            continue

        cleaned_tokens_list.append(tokens)
        cleaned_tags_list.append(tags)

    return cleaned_tokens_list, cleaned_tags_list

# Generate data

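This part of the notebook reads every CSV file in the `csv` directory and every pickle file in the `scrape` directory and attempts to generate labeled data from them. Inputs that already have a checkpoint in the `checkpoints` directory are skipped.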

In [None]:
# Label set for the dataset. 'N/A' is listed first because the labeling prompt
# uses the first tag as the fallback for entities that match nothing else.
ner_tags = ClassLabel(names=['N/A', 'PRICE', 'PRODUCT', 'DISCOUNT_PERCENTAGE', 'QUANTITY', 'DATE'])

# Label every CSV file that does not have a checkpoint yet.
for csv_file in Path(csv_path).glob('*.csv'):
    checkpoint_file = checkpoint_name(csv_file)

    if os.path.exists(checkpoint_file):
        print(f'skipping {csv_file}')
        continue

    try:
        print(f'generating data using {csv_file}')
        texts = prepare_csv(csv_file)
        tokens_list, tags_list = generate_data(texts, ner_tags.names)
        print(f'generated {len(tokens_list)} examples')
        with open(checkpoint_file, 'wb') as f:
            print(f'saving checkpoint {checkpoint_file}')
            pickle.dump((tokens_list, tags_list), f)

    except ValueError as e:
        # e.g. a required column is missing from the CSV; report and move on.
        print(e)

# Label every scraped pickle file that does not have a checkpoint yet.
for scrape_file in Path(scrape_path).glob('*.pkl'):
    checkpoint_file = checkpoint_name(scrape_file)

    if os.path.exists(checkpoint_file):
        print(f'skipping {scrape_file}')
        continue

    try:
        print(f'generating data using {scrape_file}')
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by the scraping cell of this notebook.
        with open(scrape_file, 'rb') as f:
            raw_texts = pickle.load(f)

        texts = prepare_string_list(raw_texts)
        tokens_list, tags_list = generate_data(texts, ner_tags.names)
        # Thin out examples that are empty or tagged entirely as 'N/A'.
        tokens_list, tags_list = remove_irrelevant_data(tokens_list, tags_list)
        print(f'generated {len(tokens_list)} examples')
        with open(checkpoint_file, 'wb') as f:
            print(f'saving checkpoint {checkpoint_file}')
            pickle.dump((tokens_list, tags_list), f)

    except ValueError as e:
        print(e)

# Create dataset

In [16]:
# Merge the labeled examples from every checkpoint into two flat lists.
tokens_list = []
tags_list = []

for pkl_file in Path(checkpoints_path).glob('*.pkl'):
    with pkl_file.open('rb') as f:
        tokens, tags = pickle.load(f)

    tokens_list.extend(tokens)
    tags_list.extend(tags)

# Build the train/validation/test splits from the merged examples.
dataset = create_dataset(tokens_list, tags_list, ner_tags)

Casting the dataset:   0%|          | 0/104 [00:00<?, ? examples/s]

# Save dataset

In [None]:
# Upload all splits to the Hugging Face Hub as a private dataset repository.
repo_id = 'murmuras_labeled_data'
dataset.push_to_hub(repo_id, private=True)