# Code for processing the original Broden dataset

In [1]:
from __future__ import print_function, division
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as im
from PIL import Image
import skimage
from tqdm import tqdm, tqdm_notebook
import re
import math

%load_ext autoreload
%autoreload 2

In [7]:
# Root folder of the dataset. Paths below are built by plain string
# concatenation, so the trailing '/' is required.
data_dir = 'dataset/broden1_227/'
# Alias for the previous spelling of this variable, in case other cells still
# reference it.
datadir = data_dir
# One row per image; the category columns are read by extract_features().
index_frame = pd.read_csv(data_dir + 'index.csv')

In [23]:
def extract_prominent_feature(file):
    """Return the most common non-zero label found in a label image.

    The label of each pixel is decoded from the first two channels of the
    image as ``channel0 + 256 * channel1``. Pixels whose decoded label is 0
    are ignored.

    Args:
        file: Image location, passed straight to ``skimage.io.imread``.

    Returns:
        The label id that covers the most pixels (the smallest id wins a tie,
        because ``argmax`` returns the first maximum), or 0 when no pixel has
        a non-zero label.
    """
    img = skimage.io.imread(file)
    # Widen to int64 before scaling: for a uint8 image, multiplying by the
    # Python int 256 raises OverflowError under NumPy 2 promotion rules,
    # because 256 is not representable as uint8.
    low = img[:, :, 0].astype(np.int64)
    high = img[:, :, 1].astype(np.int64)
    labels = low + 256 * high
    # Boolean indexing also flattens the 2-D label map to 1-D.
    labels = labels[labels != 0]
    if labels.size == 0:
        return 0
    return np.bincount(labels).argmax()
    
def extract_features_from_file(file):
    """Return every distinct non-zero label present in a label image.

    The label of each pixel is decoded from the first two channels of the
    image as ``channel0 + 256 * channel1``. Pixels whose decoded label is 0
    are ignored.

    Args:
        file: Image location, passed straight to ``skimage.io.imread``.

    Returns:
        A sorted 1-D integer array of the unique non-zero label ids (empty
        when the image contains no non-zero label).
    """
    img = skimage.io.imread(file)
    # Widen to int64 before scaling: for a uint8 image, multiplying by the
    # Python int 256 raises OverflowError under NumPy 2 promotion rules,
    # because 256 is not representable as uint8.
    low = img[:, :, 0].astype(np.int64)
    high = img[:, :, 1].astype(np.int64)
    labels = low + 256 * high
    return np.unique(labels[labels != 0])

# Index columns that may carry labels for an image, in processing order.
categories = ['color', 'object', 'part', 'material', 'scene', 'texture']
# Raw strings keep the regex escapes (\w, \.) from being parsed as (invalid)
# Python string escapes; the compiled patterns are unchanged.
# One or more word tokens separated by ';', e.g. "12" or "12;34".
multi_label_pattern = re.compile(r"^\w+(;\w+)*$")
# Several entries separated by ';' where at least one ".png" is directly
# followed by a ';', e.g. "a.png;b.png".
multi_file_pattern = re.compile(r".+(\.png);.+")

def extract_features(df, index):
    """Collect every feature label recorded for one row of the index.

    For each column named in ``categories`` the cell is interpreted as:

    * a number: NaN means "no entry" and is skipped, anything else is
      appended unchanged as a single label;
    * a string matching ``multi_label_pattern``: split on ';' and each token
      is appended (as a string);
    * any other string: treated as one or more ';'-separated image paths
      relative to ``data_dir + 'images/'``; each image contributes the labels
      returned by ``extract_features_from_file``.

    Args:
        df: Index dataframe containing the ``categories`` columns.
        index: Row label used with ``df.loc``.

    Returns:
        A list of labels in category order. Element types are mixed: numbers
        from numeric cells, strings from label cells, NumPy integers from
        image files.
    """
    features = []
    for category in categories:
        feature = df.loc[index, category]

        # Numeric cell. Integer types are included because a column without
        # any missing value is loaded as int64 rather than float, and such a
        # value must not reach the regex matching below.
        if isinstance(feature, (int, float, np.integer, np.floating)):
            if not math.isnan(feature):
                features.append(feature)
            continue

        if multi_label_pattern.match(feature):
            features.extend(feature.split(';'))
            continue

        # File reference(s): splitting on ';' handles both a single file name
        # and a ';'-separated list of them.
        for file in feature.split(';'):
            filename = data_dir + 'images/' + file
            features.extend(extract_features_from_file(filename))

    return features

def process_index_file(df):
    """Build a table that lists, per image, every feature label it contains.

    Args:
        df: Index dataframe with 'image' and 'split' columns plus the columns
            read by ``extract_features``.

    Returns:
        A new dataframe with the same index as ``df`` and the columns
        'image', 'split' and 'features'. 'features' is a single string in
        which every label is wrapped in brackets and the entries are joined
        with commas, e.g. ``"[3],[17]"``; it is the empty string for a row
        without labels.
    """
    feature_strings = []
    for index in tqdm(df.index, total=len(df)):
        labels = extract_features(df, index)
        # The bracketed, comma-joined encoding is the established format of
        # the 'features' column, so it is kept as is.
        feature_strings.append(','.join(f'[{x}]' for x in labels))

    # Build the result in one step instead of writing cell by cell into a
    # placeholder frame; this also keeps the rows aligned with df's index.
    processed_df = df[['image', 'split']].copy()
    processed_df['features'] = feature_strings
    return processed_df

In [24]:
processed_data = process_index_file(index_frame)
# Remove any column whose name contains "unnamed" (case-insensitive).
unnamed_mask = processed_data.columns.str.contains('unnamed', case=False)
processed_data.drop(columns=processed_data.columns[unnamed_mask], inplace=True)


100%|██████████| 63305/63305 [02:19<00:00, 454.93it/s]


In [25]:
# Write the processed table next to the original index file.
output_path = data_dir + 'processed_index.csv'
processed_data.to_csv(output_path)