In [30]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np

from DPF.formatters.t2i_formatter import T2IFormatter
from DPF.filters.images.base_images_info_filter import ImageInfoGatherer

### Read dataset

In [2]:
# Read the example dataset (stored in shards format) into a processor.
shards_path = '../data/example_shards_format/'
formatter = T2IFormatter()
processor = formatter.from_shards(
    shards_path,
    imagename_column='image_name',
    caption_column='caption',
    progress_bar=True,
    processes=8,
)

  0%|          | 0/1 [00:00<?, ?it/s]

### Create and run filter

_ImageInfoGatherer_ is a filter that collects information about each image: width, height, number of channels, and whether the image is correct.

In [5]:
# Keep a reference to the filter (configured with 16 workers) so that its
# schema can be inspected after it has been applied to the dataset.
imgfilter = ImageInfoGatherer(workers=16)
processor.apply_filter(imgfilter)

100%|██████████| 192/192 [00:12<00:00, 15.33it/s]


The filter adds new columns to the dataframe.
You can get the names of these columns from _imgfilter.schema_

In [6]:
# Column names listed in the filter's schema (displayed as the cell output)
imgfilter.schema

['image_path', 'is_correct', 'width', 'height', 'channels', 'error']

Let's check the dataframe

In [7]:
# Preview the first rows: image name and caption plus the columns produced by the filter.
columns_to_show = ['image_name', 'caption', 'is_correct', 'width', 'height', 'channels', 'error']
processor.df[columns_to_show].head()

Unnamed: 0,image_name,caption,is_correct,width,height,channels,error
0,ProstokvashinoNew-01_Vozvraschenie-01_eee0013.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
1,ProstokvashinoNew-02_Vozvraschenie-02_eee0019.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
2,ProstokvashinoNew-04_DietaMatroskina_eee0004.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
3,ProstokvashinoNew-04_DietaMatroskina_eee0012.png,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,
4,ProstokvashinoNew-05_SHarikHochetTelefon_eee00...,"Матроскин из мультфильма ""Простоквашино""",True,1920,1080,3,


### Save new columns

To save the new columns, call _processor.update_data_

In [8]:
# Print the signature and docstring of update_data before using it
help(processor.update_data)

Help on method update_data in module DPF.processors.text2image.t2i_processor:

update_data(columns_to_add: List[str], overwrite_columns: bool = True, processes: int = 1, force: bool = False) -> List[str] method of DPF.processors.text2image.shards_processor.ShardsProcessor instance
    Updates existing columns and adds new columns in dataframes of a dataset
    
    Parameters
    ----------
    columns_to_add: list[str]
        List of column names to update or add
    overwrite_columns: bool = True
        Change (overwrite) or not existing columns
    processes: int = 1
        Number of parallel processes to read and update dataframes
    force: bool = False
        Force update if dataframe shape was changed
        
    Returns
    -------
    list[str]
        List of occured errors



In [9]:
# Save the filter's columns using 8 processes; the call returns the list of
# errors, which is shown as the cell output.
new_columns = ['is_correct', 'width', 'height', 'channels', 'error']
processor.update_data(columns_to_add=new_columns, processes=8)

  0%|          | 0/1 [00:00<?, ?it/s]

[]