In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np

from DPF.formatters.t2i_formatter import T2IFormatter
from DPF.filters.images.base_images_info_filter import ImageInfoGatherer

### Read dataset

In [2]:
# Location of the example dataset stored in the shards format.
shards_dir = '../data/example_shards_format/'

formatter = T2IFormatter()

# Read the shards into a processor; the column arguments name the image-file
# and caption columns of the shard dataframes.
# NOTE(review): the caption column is accessed as 'caption' further down —
# presumably from_shards renames it; confirm against the formatter.
processor = formatter.from_shards(
    shards_dir,
    caption_column='rus_caption',
    imagename_column='image_name',
    processes=8,
    progress_bar=True,
)

  0%|          | 0/21 [00:00<?, ?it/s]

### Create and run filter

_ImageInfoGatherer_ is a filter that collects information about each image: its width, height, number of channels, and whether the image is correct.

In [3]:
# Build the image-info filter, then run it over the dataset held by `processor`.
# NOTE(review): save_parquets=False presumably keeps the results in memory
# only — confirm against the filter's documentation.
imgfilter = ImageInfoGatherer(workers=16, save_parquets=False)
processor.apply_filter(imgfilter)

100%|██████████| 20013/20013 [00:08<00:00, 2430.58it/s]


The filter adds new columns to the dataframe.
You can get the names of the new columns from _imgfilter.schema_:

In [5]:
# Column names associated with this filter (displayed as the cell output).
imgfilter.schema

['image_path', 'is_correct', 'width', 'height', 'channels', 'error']

Let's check the dataframe:

In [7]:
# Show the first rows, limited to the image name, the caption and the
# columns filled in by the filter.
preview_columns = [
    'image_name', 'caption',
    'is_correct', 'width', 'height', 'channels', 'error',
]
processor.df.head()[preview_columns]

Unnamed: 0,image_name,caption,is_correct,width,height,channels,error
0,d44_3810246.jpg,"картина автора сергея юрьевича ефрема ""селезен...",True,514,384,3,
1,721_545170.jpg,"картина ""гротеск метаморфозы"" в стиле сюрреали...",True,455,384,3,
2,ea5_827246.jpg,"текстиль, картина новая деревня",True,384,504,3,
3,8a7_767448.jpg,"наталия багацкая, картина ""quo vadis?.."" в сти...",True,387,384,3,
4,8a8_335602.jpg,"картина владимира георгиевича гремитского ""бер...",True,384,480,3,


### Save new columns

To save the new columns, call _processor.update_data_:

In [9]:
# Print the signature and docstring of update_data before calling it.
help(processor.update_data)

Help on method update_data in module DPF.processors.text2image.t2i_processor:

update_data(columns_to_add: List[str], overwrite_columns: bool = True, processes: int = 1, force: bool = False) -> List[str] method of DPF.processors.text2image.shards_processor.ShardsProcessor instance
    Updates existing columns and adds new columns in dataframes of a dataset
    
    Parameters
    ----------
    columns_to_add: list[str]
        List of column names to update or add
    overwrite_columns: bool = True
        Change (overwrite) or not existing columns
    processes: int = 1
        Number of parallel processes to read and update dataframes
    force: bool = False
        Force update if dataframe shape was changed
        
    Returns
    -------
    list[str]
        List of occured errors



In [10]:
# Columns produced by the filter that should be written back to the dataset.
filter_columns = ['is_correct', 'width', 'height', 'channels', 'error']

# The call is the cell's last expression, so the returned list of errors is
# displayed as the cell output.
processor.update_data(filter_columns, processes=8)

  0%|          | 0/21 [00:00<?, ?it/s]

[]