In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from DPF.formatters.t2i_formatter import T2IFormatter
from DPF.filesystems.s3filesystem import S3FileSystem

from DPF.filters.images.base_images_info_filter import ImageInfoGatherer
from DPF.filters.images.hash_filters import PHashFilter
from DPF.filters.images.watermarks_filter import WatermarksFilter
from DPF.filters.images.cliplabels_filter import CLIPLabelsFilter

from DPF.validators.text2image.shards_validator import ShardsValidator

from DPF.filters.text2image.clip_filter import CLIPFilter
from DPF.filters.text2image.ruclip_filter import RuCLIPFilter

from DPF.filters import ComplexFilter

from DPF.filters.texts.lang_filter import LangFilter
from DPF.filters.texts.regex_filter import RegexFilter
from DPF.filters.texts.regexs import *


Can't import package re2, using re package. It is recommended to use more efficient re2 package.


### Read dataset

In [2]:
# Directory with the example dataset stored in the shards format.
shards_path = '../data/example_shards_format/'

# Build a processor over the shards; the two column arguments name the
# columns that hold the image file names and the captions.
formatter = T2IFormatter()
processor = formatter.from_shards(
    shards_path,
    imagename_column='image_name',
    caption_column='caption',
    progress_bar=True,
    processes=8,
)

  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Restrict the processor's dataframe to a fixed set of columns.
processor.df = processor.df[[
    'image_name', 'image_path', 'table_path', 'archive_path',
    'data_format', 'caption', 'character', 'cartoon_name',
]]

### Validator

_ShardsValidator_ - checks for errors in shards

In [28]:
# Alternative: construct an S3 filesystem explicitly instead of taking it
# from the processor.
# fs = S3FileSystem(
#     key='your_access_key',
#     secret='your_secret_key',
#     endpoint_url='your_endpoint'
# )

# Take the filesystem from the processor.
fs = processor.get_filesystem()

# NOTE(review): 'width' and 'height' are only saved to the dataset by a later
# update_data call, and this cell's execution count (In [28]) shows it was run
# after the rest of the notebook -- confirm it still passes on a fresh
# top-to-bottom run.
validator = ShardsValidator(
    filesystem=fs,
    csv_columns=['image_name', 'caption', 'width', 'height'],
)

# `all_ok` is printed in the next cell; `res` and `err2count` carry the details.
res, err2count, all_ok = validator.validate(shards_path, processes=16)


  0%|          | 0/1 [00:00<?, ?it/s]

`all_ok` must be `True`. Otherwise, inspect the errors in `res` and fix them.

In [29]:
# Show the overall verdict returned by validator.validate.
print(all_ok)

True


### Create and run filter

_ImageInfoGatherer_ - filter to get information about images: width, height, channels and image correctness

In [5]:
# Gather per-image information; the produced columns are listed in
# imgfilter.schema.
imgfilter = ImageInfoGatherer(workers=16)
processor.apply_filter(imgfilter)

100%|██████████| 192/192 [00:12<00:00, 15.33it/s]


The filter adds new columns to the dataframe.
You can get the names of the new columns by printing _imgfilter.schema_

In [6]:
# List the column names declared by the filter's schema.
imgfilter.schema

['image_path', 'is_correct', 'width', 'height', 'channels', 'error']

Let's check the dataframe

In [7]:
# Preview the first rows, limited to the caption and the freshly added columns.
processor.df[[
    'image_name', 'caption',
    'is_correct', 'width', 'height', 'channels', 'error',
]].head()

Unnamed: 0,image_name,caption,is_correct,width,height,channels,error
0,ProstokvashinoNew-01_Vozvraschenie-01_eee0013.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
1,ProstokvashinoNew-02_Vozvraschenie-02_eee0019.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
2,ProstokvashinoNew-04_DietaMatroskina_eee0004.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
3,ProstokvashinoNew-04_DietaMatroskina_eee0012.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
4,ProstokvashinoNew-05_SHarikHochetTelefon_eee00...,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,


### Save new columns

To save new columns, call _processor.update_data_

In [8]:
# Show the signature and docstring of processor.update_data.
help(processor.update_data)

Help on method update_data in module DPF.processors.text2image.t2i_processor:

update_data(columns_to_add: List[str], overwrite_columns: bool = True, processes: int = 1, force: bool = False) -> List[str] method of DPF.processors.text2image.shards_processor.ShardsProcessor instance
    Updates existing columns and adds new columns in dataframes of a dataset
    
    Parameters
    ----------
    columns_to_add: list[str]
        List of column names to update or add
    overwrite_columns: bool = True
        Change (overwrite) or not existing columns
    processes: int = 1
        Number of parallel processes to read and update dataframes
    force: bool = False
        Force update if dataframe shape was changed
        
    Returns
    -------
    list[str]
        List of occured errors



In [9]:
# Save the image-info columns; the call returns a list of errors.
processor.update_data(
    columns_to_add=['is_correct', 'width', 'height', 'channels', 'error'],
    processes=8,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### CLIPFilter

CLIP similarity between image and caption

In [10]:
# CLIP (ViT-L/14) similarity between each image and its caption.
# `weights_folder` is a placeholder: point it at the folder with the weights.
clip_filter = CLIPFilter(
    clip_version='ViT-L/14',
    templates=['{}'],
    weights_folder='path_to_folder_with_weights',
    workers=16,
    batch_size=64,
)
processor.apply_filter(clip_filter)

100%|██████████| 3/3 [00:27<00:00,  9.26s/it]


In [None]:
# Save the CLIP similarity column.
processor.update_data(
    columns_to_add=['clip_ViT-L/14_similarity'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

ruCLIP similarity between image and caption

In [12]:
# ruCLIP similarity between each image and its caption.
# `weights_folder` is a placeholder: point it at the folder with the weights.
ruclip_filter = RuCLIPFilter(
    ruclip_version='ruclip-vit-large-patch14-336',
    templates=['{}'],
    weights_folder='path_to_folder_with_weights',
    workers=8,
    batch_size=64,
    device='cuda:0',
)
processor.apply_filter(ruclip_filter)

100%|██████████| 3/3 [00:26<00:00,  8.69s/it]


In [None]:
# Save the ruCLIP similarity column.
processor.update_data(
    columns_to_add=['ruclip-vit-large-patch14-336_similarity'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### WatermarksFilter with CLIPLabelsFilter

_WatermarksFilter_ - watermarks detection

In [14]:
# Settings shared by both watermark detectors.
# `weights_folder` is a placeholder: point it at the folder with the weights.
watermark_filter_kwargs = dict(
    weights_folder='path_to_folder_with_weights',
    workers=8,
    batch_size=256,
    device='cuda:0',
)

watermarks_filter_small = WatermarksFilter(
    'resnext50_32x4d-small', **watermark_filter_kwargs
)
watermarks_filter_large = WatermarksFilter(
    'resnext101_32x8d-large', **watermark_filter_kwargs
)

# Apply both detectors through a single ComplexFilter.
cfilter = ComplexFilter(
    [watermarks_filter_small, watermarks_filter_large],
    use_same_preprocess=True,
    workers=16,
)
processor.apply_filter(cfilter)



100%|██████████| 1/1 [00:21<00:00, 21.15s/it]


In [None]:
# Save the scores of both watermark detectors.
processor.update_data(
    columns_to_add=[
        'watermark_resnext50_32x4d-small',
        'watermark_resnext101_32x8d-large',
    ],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

Labeling with CLIP

In [16]:
# Text labels every image is scored against.
labels_for_clip = [
    "picture has watermark",
    "slide of presentation with text",
    "document with text",
    "web site with text",
]

# NOTE: this rebinds `clip_filter`, which earlier in the notebook referred to
# the CLIPFilter instance.
clip_filter = CLIPLabelsFilter(
    clip_model='ViT-L/14',
    templates=['{}'],
    weights_folder='path_to_folder_with_weights',
    labels=labels_for_clip,
    workers=16,
    batch_size=512,
    device='cuda:0',
)

# Print the schema to see the column names created for the labels.
print(clip_filter.schema)

processor.apply_filter(clip_filter)

['image_path', 'ViT-L/14 clip score "picture has watermark"', 'ViT-L/14 clip score "slide of presentation with text"', 'ViT-L/14 clip score "document with text"', 'ViT-L/14 clip score "web site with text"']


100%|██████████| 1/1 [00:30<00:00, 31.00s/it]


In [17]:

# Save one CLIP score column per label.
processor.update_data(
    columns_to_add=[
        'ViT-L/14 clip score "picture has watermark"',
        'ViT-L/14 clip score "slide of presentation with text"',
        'ViT-L/14 clip score "document with text"',
        'ViT-L/14 clip score "web site with text"',
    ],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

Combine WatermarksFilter and CLIPLabelsFilter

In [18]:
# Columns added to `df` below are saved afterwards through
# processor.update_data.
df = processor.df

# CLIP scores are rescaled linearly so that min_clip_score maps to 0 and
# max_clip_score maps to 1.
min_clip_score = 0.14
max_clip_score = 0.2
clip_score_diff = max_clip_score - min_clip_score

clip_prob_watermark = (
    df['ViT-L/14 clip score "picture has watermark"'] - min_clip_score
) / clip_score_diff

# Weighted blend of the two watermark detectors and the rescaled CLIP score.
# The blend (not the CLIP term alone) is clamped to [0, 1].
df['watermark_prob'] = (
    df['watermark_resnext50_32x4d-small'] * 0.3
    + df['watermark_resnext101_32x8d-large'] * 0.4
    + clip_prob_watermark * 0.3
).clip(lower=0, upper=1)

# Highest score among the three text-heavy labels, per row.
max_text_score = df[[
    'ViT-L/14 clip score "slide of presentation with text"',
    'ViT-L/14 clip score "document with text"',
    'ViT-L/14 clip score "web site with text"',
]].max(axis=1)

# Same rescaling as above, clamped to [0, 1].
clip_prob_text = (
    (max_text_score - min_clip_score) / clip_score_diff
).clip(lower=0, upper=1)

df['webdocument_prob'] = clip_prob_text

In [19]:
# Save the two combined probability columns.
processor.update_data(
    columns_to_add=['watermark_prob', 'webdocument_prob'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### Duplicate

_PHashFilter_ - filter that computes the phash of each image.

In [20]:
# Compute the phash of every image.
phash_filter = PHashFilter(
    sim_hash_size=8,
    workers=16,
)
processor.apply_filter(phash_filter)

# Re-read the dataframe after the filter has run
# (presumably apply_filter rebinds processor.df -- confirm).
df = processor.df

100%|██████████| 192/192 [00:27<00:00,  6.94it/s]


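Mark duplicates: every image whose phash repeats the phash of an earlier row is flagged in the _is_duplicate_ column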

In [21]:
# True for every row whose phash already appeared in an earlier row;
# the first occurrence of each phash stays False.
df['is_duplicate'] = df['image_phash_8'].duplicated()

In [22]:
# Save the phash and the duplicate flag.
processor.update_data(
    columns_to_add=['image_phash_8', 'is_duplicate'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

### TextFilter

_LangFilter_ - filter to get information about caption: lang and lang score 

In [23]:
# Run the language filter on the `caption` column.
lang_filter = LangFilter(text_column_name='caption', workers=16)
processor.apply_filter(lang_filter)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [24]:
# Write the results to csv.
processor.update_data(
    columns_to_add=['lang', 'lang_score'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]

_RegexFilter_ - filter that removes unnecessary words from the text using regular expressions

In [25]:
# Apply the Russian, special-character and emoji regex lists to the captions.
regex_filter = RegexFilter(
    text_column_name='caption',
    regex_replacement_list=ru_regexs + special_regexs + emoji_regexs,
)
processor.apply_filter(regex_filter)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [26]:
# Save the cleaned captions.
processor.update_data(
    columns_to_add=['clean_caption'],
    processes=16,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[]