In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from DPF.formatters.t2i_formatter import T2IFormatter

In [2]:
# Folder with the dataset in shards format (relative to this notebook).
shards_path = '../data/example_shards_format/'

# Build the processor from the shards; the column arguments name the fields
# that hold the image file name and the caption.
formatter = T2IFormatter()
processor = formatter.from_shards(
    shards_path,
    caption_column='caption',
    imagename_column='image_name',
    processes=8,
    progress_bar=True,
)

  0%|          | 0/1 [00:00<?, ?it/s]

### Validator

_ShardsValidator_ - checks for errors in shards

In [5]:
from DPF.validators.text2image.shards_validator import ShardsValidator

# Alternative: construct the filesystem by hand instead of taking it from the
# processor. S3FileSystem is not imported in the cells shown here, so import
# it first if you enable this snippet.
# fs = S3FileSystem(
#     key='your_access_key',
#     secret='your_secret_key',
#     endpoint_url='your_endpoint'
# )

# Reuse the filesystem object the processor already works with.
fs = processor.get_filesystem()

validator = ShardsValidator(
    filesystem=fs,
    csv_columns=[
        'image_name', 'caption', 'width', 'height'
    ]
)
res, err2count, all_ok = validator.validate(shards_path, processes=16)

# Fail fast: the rest of the notebook expects valid shards, so stop here with
# the error summary instead of silently continuing when validation fails.
assert all_ok, f"Shards validation failed, inspect `res` for details: {err2count}"


  0%|          | 0/1 [00:00<?, ?it/s]

`all_ok` must be `True`. Otherwise, inspect the errors in `res` and fix them before continuing.

### CLIPFilter

CLIP similarity between image and caption

In [6]:
from DPF.filters.text2image.clip_filter import CLIPFilter

# CLIP (ViT-L/14) similarity between each image and its caption.
# 'path_to_weights_folder' is a placeholder: point it at your weights folder.
clip_filter = CLIPFilter(
    clip_version='ViT-L/14',
    templates=['{}'],
    weights_folder='path_to_weights_folder',
    batch_size=64,
    workers=16,
)
processor.apply_filter(clip_filter)

100%|██████████| 3/3 [00:24<00:00,  8.04s/it]


In [7]:
# Hand the new similarity column to update_data
# (presumably persists it to the shards; confirm in the DPF docs).
processor.update_data(['clip_ViT-L/14_similarity'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

ruCLIP similarity between image and caption

In [8]:
from DPF.filters.text2image.ruclip_filter import RuCLIPFilter

# ruCLIP similarity between each image and its caption, computed on the first GPU.
# 'path_to_weights_folder' is a placeholder: point it at your weights folder.
ruclip_filter = RuCLIPFilter(
    ruclip_version='ruclip-vit-large-patch14-336',
    templates=['{}'],
    weights_folder='path_to_weights_folder',
    device='cuda:0',
    batch_size=64,
    workers=8,
)
processor.apply_filter(ruclip_filter)

100%|██████████| 3/3 [00:16<00:00,  5.44s/it]


In [9]:
# Hand the ruCLIP similarity column to update_data
# (presumably persists it to the shards; confirm in the DPF docs).
processor.update_data(['ruclip-vit-large-patch14-336_similarity'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### WatermarksFilter with CLIPLabelsFilter

_WatermarksFilter_ - watermarks detection

In [10]:
from DPF.filters.images.watermarks_filter import WatermarksFilter
from DPF.filters import ComplexFilter

# Settings shared by both watermark detectors.
# 'path_to_weights_folder' is a placeholder: point it at your weights folder.
watermark_filter_kwargs = dict(
    weights_folder='path_to_weights_folder',
    workers=8,
    batch_size=256,
    device='cuda:0',
)

watermarks_filter_small = WatermarksFilter(
    'resnext50_32x4d-small', **watermark_filter_kwargs
)
watermarks_filter_large = WatermarksFilter(
    'resnext101_32x8d-large', **watermark_filter_kwargs
)

# Apply both detectors through one ComplexFilter.
cfilter = ComplexFilter(
    [watermarks_filter_small, watermarks_filter_large],
    workers=16,
    use_same_preprocess=True,
)
processor.apply_filter(cfilter)



100%|██████████| 1/1 [00:25<00:00, 25.43s/it]


In [11]:
# Hand both watermark score columns to update_data
# (presumably persists them to the shards; confirm in the DPF docs).
watermark_columns = [
    'watermark_resnext50_32x4d-small',
    'watermark_resnext101_32x8d-large',
]
processor.update_data(watermark_columns, processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

Labeling with CLIP

In [12]:
from DPF.filters.images.cliplabels_filter import CLIPLabelsFilter

# Text labels every image is scored against.
labels_for_clip = [
    "picture has watermark",
    "slide of presentation with text",
    "document with text",
    "web site with text",
]

# NOTE: this rebinds `clip_filter`, which earlier in the notebook held a
# CLIPFilter; from here on the name refers to the CLIPLabelsFilter.
clip_filter = CLIPLabelsFilter(
    clip_model='ViT-L/14',
    labels=labels_for_clip,
    templates=['{}'],
    weights_folder='path_to_weights_folder',
    device='cuda:0',
    batch_size=512,
    workers=16,
)

# Show the filter's schema (its column names) before running it.
print(clip_filter.schema)

processor.apply_filter(clip_filter)

['image_path', 'ViT-L/14 clip score "picture has watermark"', 'ViT-L/14 clip score "slide of presentation with text"', 'ViT-L/14 clip score "document with text"', 'ViT-L/14 clip score "web site with text"']


100%|██████████| 1/1 [00:22<00:00, 22.61s/it]


In [13]:
# Derive the score column names from `labels_for_clip` instead of repeating
# every label by hand, so this list cannot drift from the labels the filter
# was built with. The pattern matches the schema printed by the filter:
# 'ViT-L/14 clip score "<label>"'.
clip_label_columns = [
    f'ViT-L/14 clip score "{label}"' for label in labels_for_clip
]
processor.update_data(clip_label_columns, processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

Combine WatermarksFilter and CLIPLabelsFilter

In [14]:
# Columns are added to the processor's frame so the following update_data call
# can pick them up (assumes processor.df returns the live frame, not a copy).
df = processor.df

# CLIP scores are rescaled linearly: min_clip_score maps to 0, max_clip_score to 1.
min_clip_score = 0.14
max_clip_score = 0.2
clip_score_diff = max_clip_score - min_clip_score

clip_prob_watermark = (
    df['ViT-L/14 clip score "picture has watermark"'] - min_clip_score
) / clip_score_diff

# Weighted blend of the two watermark detectors and the rescaled CLIP score.
# The rescaled CLIP term is not clamped on its own; only the final blend is
# clamped to [0, 1], exactly as before, now via Series.clip.
df['watermark_prob'] = (
    df['watermark_resnext50_32x4d-small'] * 0.3
    + df['watermark_resnext101_32x8d-large'] * 0.4
    + clip_prob_watermark * 0.3
).clip(lower=0, upper=1)

# Take the strongest of the three "text-like" labels for every image.
max_text_score = df[
    ['ViT-L/14 clip score "slide of presentation with text"',
     'ViT-L/14 clip score "document with text"',
     'ViT-L/14 clip score "web site with text"']
].max(axis=1)

# Same linear rescaling as above, clamped to [0, 1].
clip_prob_text = (
    (max_text_score - min_clip_score) / clip_score_diff
).clip(lower=0, upper=1)

df['webdocument_prob'] = clip_prob_text

In [15]:
# Hand the two combined probability columns to update_data
# (presumably persists them to the shards; confirm in the DPF docs).
processor.update_data(['watermark_prob', 'webdocument_prob'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### Duplicate

_PHashFilter_ - computes a perceptual hash (pHash) for every image; equal hashes are used below to detect duplicates.

In [17]:
from DPF.filters.images.hash_filters import PHashFilter

phash_filter = PHashFilter(workers=16, sim_hash_size=8)
processor.apply_filter(phash_filter)

# Re-read the frame after the filter ran so `df` includes the new hash column.
df = processor.df

100%|██████████| 192/192 [00:16<00:00, 11.85it/s]


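Flag duplicates: every image whose pHash repeats an earlier one is marked `is_duplicate = True`; the first occurrence stays `False`.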

In [18]:
# duplicated() is True for every repeat of an already seen hash and False for
# the first occurrence, which is exactly the flag we want.
df['is_duplicate'] = df['image_phash_8'].duplicated()

In [19]:
# Hand the hash and the duplicate flag to update_data
# (presumably persists them to the shards; confirm in the DPF docs).
processor.update_data(['image_phash_8', 'is_duplicate'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### TextFilter

_LangFilter_ - adds information about the caption: the detected language (`lang`) and the detection score (`lang_score`).

In [23]:
from DPF.filters.texts.lang_filter import LangFilter

# Run the language filter over the 'caption' column.
lang_filter = LangFilter(workers=16, text_column_name='caption')
processor.apply_filter(lang_filter)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Hand the language columns to update_data
# (presumably persists them to the shards; confirm in the DPF docs).
processor.update_data(['lang', 'lang_score'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

_RegexFilter_ - filter that cleans the text from unnecessary words, using regular expressions

In [25]:
from DPF.filters.texts.regex_filter import RegexFilter
from DPF.filters.texts.regexs import *

# Chain the three regex collections pulled in by the star import above into
# one replacement list for the caption column.
caption_regexs = ru_regexs + special_regexs + emoji_regexs

regex_filter = RegexFilter(
    regex_replacement_list=caption_regexs,
    text_column_name='caption',
)
processor.apply_filter(regex_filter)

Can't import package re2, using re package. It is recommended to use more efficient re2 package.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [26]:
# Hand the cleaned caption column to update_data
# (presumably persists it to the shards; confirm in the DPF docs).
processor.update_data(['clean_caption'], processes=16)

  0%|          | 0/1 [00:00<?, ?it/s]

[]