In [None]:
import os
from urllib.request import urlretrieve
import pandas as pd
from datasets import load_dataset

table_url = f'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata-large.parquet'

pbar = None

if not os.path.exists('metadata.parquet'):
    print("retrieving metadata file")
    urlretrieve(table_url, 'metadata.parquet')
# Read the table using Pandas\n",
metadata_df = pd.read_parquet('metadata.parquet')

In [None]:
import importlib.util
spec = importlib.util.find_spec("en_core_web_trf")
if spec is None:
    print("Installing en_core_web_trf")
    ! pip install https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl

In [None]:
import en_core_web_trf
import spacy
nlp = en_core_web_trf.load()

In [39]:
metadata_df["prompt"].head(10)

[('a teddy bear family having breakfast on a plaid blanket on the meadow, picknick basket, morning mood, feng zhu and loish and laurie greasley, victo ngai, andreas rocha, john harris ',
  {'subject': ['family a teddy bear family'],
   'descriptor': ['breakfast a plaid blanket the meadow picknick basket morning mood feng zhu loish laurie greasley victo ngai andreas rocha john harris']}),
 ('trippy trip aliens in GTA V',
  {'subject': ['aliens V GTA V'], 'descriptor': ['']}),
 ('stunning goddess of beers portrait, clear eyes and dark skin. realistic, symmetrical face. art by bowater charlie, mark brooks, julie bell, arian mark, tony sandoval ',
  {'subject': ['beers'],
   'descriptor': ['clear eyes dark skin realistic, symmetrical face art bowater charlie mark brooks julie bell arian mark tony sandoval']}),
 ('Terminator, Profile, portrait, intricate, detailed, volumetric lighting, scenery, digital painting, highly detailed, artstation, sharp focus, illustration, concept art, ruan jia, 

0    beautiful porcelain ivory fair face woman biom...
1    complex 3 d render hyper detailed ultra sharp ...
2    complex 3 d render hyper detailed ultra sharp ...
3    complex 3 d render hyper detailed ultra sharp ...
4    complex 3 d render hyper detailed ultra sharp ...
5    complex 3 d render hyper detailed ultra sharp ...
6    complex 3 d render hyper detailed ultra sharp ...
7    complex 3 d render hyper detailed ultra sharp ...
8    complex 3 d render hyper detailed ultra sharp ...
9    complex 3 d render hyper detailed ultra sharp ...
Name: prompt, dtype: object

In [63]:

n_samples = 100_000
first_n_unique_prompts = metadata_df["prompt"].sample(n=n_samples, random_state=42).drop_duplicates().head(n_samples)
display(first_n_unique_prompts.head(5).tolist())
display(first_n_unique_prompts.shape)

['a teddy bear family having breakfast on a plaid blanket on the meadow, picknick basket, morning mood, feng zhu and loish and laurie greasley, victo ngai, andreas rocha, john harris ',
 'trippy trip aliens in GTA V',
 'stunning goddess of beers portrait, clear eyes and dark skin. realistic, symmetrical face. art by bowater charlie, mark brooks, julie bell, arian mark, tony sandoval ',
 'Terminator, Profile, portrait, intricate, detailed, volumetric lighting, scenery, digital painting, highly detailed, artstation, sharp focus, illustration, concept art, ruan jia, steve mccurry',
 'a still frame from comic strip, sleeping bird, 1 9 5 0, herluf bidstrup, new yorker illustration, monochrome bw, lineart, manga, tadanori yokoo, simplified, ']

(93834,)

In [60]:
from datasets import Dataset, Features
from spacy import displacy
from spacy.symbols import nsubj, VERB


def process(batch):
    out = {
        "subject": [],
        "descriptor": [],
    }
    for prompt in batch["prompt"]:
        doc = nlp(prompt)
        # displacy.render(doc, style="dep")
        subject_tokens, descriptor_tokens = [], []
        # find the first chunk with either an entity or a proper noun.
        subject_found = False
        for chunk in doc.noun_chunks:
            if subject_found:
                descriptor_tokens.append(chunk.text)
            else:
                proper_nouns = [token for token in chunk if token.pos_ == "PROPN"]
                proper_ents, non_proper_ents = [], []
                for ent in chunk.ents:
                    if ent.label_ == "PERSON" or ent.label_ == "ORG":
                        proper_ents.append(ent)
                    else:
                        non_proper_ents.append(ent)
                subject_tokens.append(chunk.root.text)
                if len(non_proper_ents) > 0:
                    subject_tokens.append(chunk.text)
                    subject_found = True
                elif len(proper_nouns) > 0 and len(proper_ents) == 0:
                    subject_tokens.append(chunk.text)
                    subject_found = True

        # print("token deps")
        subject_tokens = [
            tok for i, tok in enumerate(subject_tokens) if tok not in subject_tokens[:i]
        ]
        out["subject"].append(" ".join(subject_tokens))
        out["descriptor"].append(" ".join(descriptor_tokens))
    return out


# display([(p, process(p)) for p in [
#     "stunning goddess of beers portrait, clear eyes and dark skin. realistic, symmetrical face. art by bowater charlie, mark brooks, julie bell, arian mark, tony sandoval "
# ]])
display([(p, process({"prompt": [p]})) for p in first_n_unique_prompts[:10]])

[('a teddy bear family having breakfast on a plaid blanket on the meadow, picknick basket, morning mood, feng zhu and loish and laurie greasley, victo ngai, andreas rocha, john harris ',
  {'subject': ['family a teddy bear family'],
   'descriptor': ['breakfast a plaid blanket the meadow picknick basket morning mood feng zhu loish laurie greasley victo ngai andreas rocha john harris']}),
 ('trippy trip aliens in GTA V',
  {'subject': ['aliens V GTA V'], 'descriptor': ['']}),
 ('stunning goddess of beers portrait, clear eyes and dark skin. realistic, symmetrical face. art by bowater charlie, mark brooks, julie bell, arian mark, tony sandoval ',
  {'subject': ['beers'],
   'descriptor': ['clear eyes dark skin realistic, symmetrical face art bowater charlie mark brooks julie bell arian mark tony sandoval']}),
 ('Terminator, Profile, portrait, intricate, detailed, volumetric lighting, scenery, digital painting, highly detailed, artstation, sharp focus, illustration, concept art, ruan jia, 

In [64]:
prompt_only_df = first_n_unique_prompts.to_frame()
dataset = Dataset.from_pandas(prompt_only_df, preserve_index=False)
dataset

Dataset({
    features: ['prompt'],
    num_rows: 93834
})

In [66]:
dataset = dataset.map(process, batched=True, batch_size=512, remove_columns=["prompt"])

Map:   0%|          | 0/93834 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors


In [67]:
from huggingface_hub import login

display(dataset)
login("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", add_to_git_credential=True)
dataset.push_to_hub("roborovski/diffusiondb-seq2seq")


Dataset({
    features: ['subject', 'descriptor'],
    num_rows: 93834
})

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/94 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]