In [None]:
%pip install spacy

In [4]:
import os
from urllib.request import urlretrieve
import pandas as pd

# Download the DiffusionDB metadata table once and cache it next to the notebook.
table_url = 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata-large.parquet'
metadata_path = 'metadata.parquet'
if not os.path.exists(metadata_path):
    # Fetch into a temporary name and rename only after the transfer finished,
    # so an interrupted download is never mistaken for a complete cached file
    # on the next run.
    partial_path = metadata_path + '.part'
    urlretrieve(table_url, partial_path)
    os.replace(partial_path, metadata_path)

# Read the table using Pandas
metadata_df = pd.read_parquet(metadata_path)

In [5]:
# Work with the prompt text column only.
prompt_only = metadata_df["prompt"]

# Show the prompt stored under index label 0 as a quick sanity check.
display(prompt_only.loc[0])

'beautiful porcelain ivory fair face woman biomechanical cyborg, close - up, sharp focus, studio light, iris van herpen haute couture headdress made of rhizomorphs, daisies, brackets, colorful corals, fractal mushrooms, puffballs, octane render, ultra sharp, 8 k '

In [6]:
import spacy

# Load the English language model. The model package is downloaded only when
# it is missing, so re-running this cell does not trigger a fresh pip download
# every time (spacy.load raises OSError when the package cannot be found).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")



Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 53.9 MB/s eta 0:00:00



[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
from datasets import Dataset, Features, Value

# (Re)load the English spaCy pipeline used by mask_non_nouns below.
nlp = spacy.load("en_core_web_sm")

# Schema of the processed dataset. `Features` maps each column name to a
# feature type; a nested dict would be interpreted as a struct with sub-fields,
# not as type/description metadata, so both columns are declared as string
# values and the descriptions are kept as comments.
features = Features(
    {
        "prompt": Value("string"),  # The full input prompt.
        "masked": Value("string"),  # Nouns only.
    }
)

def mask_non_nouns(prompt):
    """Return the noun-like words of ``prompt`` joined by single spaces.

    The prompt is parsed with the module-level ``nlp`` pipeline. A token is
    kept when its part-of-speech tag is ``NOUN`` or ``PROPN``, or when it is
    tagged ``VERB`` while carrying the ``nsubj`` dependency label. All other
    tokens are dropped. The surviving token texts keep their original order.
    """
    noun_tags = frozenset(("NOUN", "PROPN"))
    kept_words = [
        tok.text
        for tok in nlp(prompt)
        if tok.pos_ in noun_tags or (tok.pos_ == "VERB" and tok.dep_ == "nsubj")
    ]
    return " ".join(kept_words)

def process_batch(batch):
    """Add a ``"masked"`` column to a batch of prompts.

    ``batch["prompt"]`` is iterated in order and each prompt is passed through
    ``mask_non_nouns``; the results are stored as a list under
    ``batch["masked"]``. The same (mutated) batch object is returned.
    """
    batch["masked"] = list(map(mask_non_nouns, batch["prompt"]))
    return batch

# Collapse repeated prompts so each distinct prompt appears once.
prompt_only = prompt_only.drop_duplicates()

# Spot-check the masking on the first 110 remaining prompts, shown as
# (masked, original) pairs.
display(
    [
        (mask_non_nouns(original), original)
        for original in prompt_only.iloc[:110]
    ]
)

[('porcelain ivory face woman cyborg focus studio light iris van herpen haute couture headdress rhizomorphs daisies brackets corals fractal mushrooms puffballs octane render k',
  'beautiful porcelain ivory fair face woman biomechanical cyborg, close - up, sharp focus, studio light, iris van herpen haute couture headdress made of rhizomorphs, daisies, brackets, colorful corals, fractal mushrooms, puffballs, octane render, ultra sharp, 8 k '),
 ('d hyper woman porcelain ivory face shot portrait up lace iris van herpen cyberpunk daisies corals couture headdress rhizomorph finials spires brackets fractal puffballs octane render k',
  'complex 3 d render hyper detailed ultra sharp futuristic beautiful biomechanical humanoid woman with porcelain ivory face, medium shot portrait, close - up, filigree lace, iris van herpen cyberpunk daisies corals haute couture headdress with rhizomorph finials spires, brackets, fractal embroidered puffballs, octane render, 8 k '),
 ('d hyper scifi woman porc

In [10]:
display(prompt_only.head())
prompt_only_df = prompt_only.to_frame()
# After drop_duplicates() the index has gaps (0, 1, 15, ...). Without
# preserve_index=False, from_pandas would keep it as an extra
# `__index_level_0__` column that would then be pushed to the Hub.
dataset = Dataset.from_pandas(prompt_only_df, preserve_index=False)
# Mask every prompt in batches of 1000, using at most 16 worker processes and
# never more than the number of CPUs on this machine.
dataset = dataset.map(
    process_batch,
    batched=True,
    batch_size=1000,
    num_proc=min(16, os.cpu_count() or 1),
)

0     beautiful porcelain ivory fair face woman biom...
1     complex 3 d render hyper detailed ultra sharp ...
15    complex 3 d render hyper detailed ultra sharp ...
16    complex 3 d render hyper detailed ultra sharp ...
33    complex 3 d render hyper detailed ultra sharp ...
Name: prompt, dtype: object

                                                                                     

In [11]:
import os

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook; anyone who can read
# the file can use it. The token that used to be embedded here must be treated
# as leaked and revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable instead. When it is
# unset, login() receives None and falls back to prompting for the token.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

# Publish the processed dataset to the Hub.
dataset.push_to_hub("roborovski/diffusiondb-masked-no-descriptors")


Token is valid.
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/brian/.cache/huggingface/token
Login successful


Creating parquet from Arrow format: 100%|██████████| 1820/1820 [00:01<00:00, 1292.14ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [01:41<00:00, 101.17s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [01:43<00:00, 103.19s/it]
