In [None]:
import os
if not os.path.isdir('data'):
    os.mkdir('data')
    !gdown 1BRgFthsGrutapFp-pXPuw_pmH7-C-TWb -O data/ # features.yaml
    !gdown 1QbpJLMltcObmNcBpRdMKaLQCxDKvJ_Uu -O data/ # samples_porn2k_241029_182645.csv

In [1]:
import pandas as pd
import ast
features_df = pd.read_csv('data/samples_porn2k_241029_182645.csv')
print(features_df.columns)
features_df.head(5)

Index(['filename', 'label', 'object_name', 'object_bbox', 'object_conf',
       'scene', 'porn', 'porn_2019', 'face_name', 'face_bbox', 'face_conf',
       'age', 'child', 'gender', 'skin_ita', 'skin_patch', 'extension',
       'resolution', 'aspect', 'colormode', 'luminance', 'sharpness',
       'BRISQUE'],
      dtype='object')


Unnamed: 0,filename,label,object_name,object_bbox,object_conf,scene,porn,porn_2019,face_name,face_bbox,...,gender,skin_ita,skin_patch,extension,resolution,aspect,colormode,luminance,sharpness,BRISQUE
0,vNonPorn000852/image-00000031.jpeg,0,['vehicle/motorcycle'],"[[0.4861111342906952, 0.17499999701976776, 0.9...",[0.72265625],outdoor/natural/desert_sand,0.001114,"[0.015593965537846088, 0.9805508852005005, 0.0...",,,...,,,,jpeg,0.9216,1.777778,RGB,154.833044,48.411537,65.35907
1,vNonPorn000852/image-00000190.jpeg,0,[],,,outdoor/natural/sky,3.4e-05,"[0.4277361035346985, 0.5674610733985901, 0.003...",,,...,,,,jpeg,0.9216,1.777778,RGB,156.586895,163.121391,60.052917
2,vNonPorn000852/image-00000025.jpeg,0,['vehicle/motorcycle'],"[[0.506944477558136, 0.15859375894069672, 0.80...",[0.88720703125],outdoor/natural/desert_sand,0.009209,"[0.04049646109342575, 0.9538776278495789, 0.00...",,,...,,,,jpeg,0.9216,1.777778,RGB,156.691808,78.72397,64.400085
3,vNonPorn000852/image-00000020.jpeg,0,"['vehicle/motorcycle', 'person/person']","[[0.3361111283302307, 0.21484375, 0.7958333492...","[0.8798828125, 0.34912109375]",indoor/commercial/toyshop,0.000586,"[0.005201381631195545, 0.9921100735664368, 0.0...",,,...,,,,jpeg,0.9216,1.777778,RGB,155.922624,65.196866,65.346375
4,vNonPorn000852/image-00000001.jpeg,0,[],,,outdoor/natural/desert_sand,0.066797,"[0.2318183034658432, 0.6421917080879211, 0.087...",,,...,,,,jpeg,0.9216,1.777778,L,0.0,0.0,


## Categorize and adjust features
To make them suitable for the correlation functions

In [2]:
import yaml
import numpy as np

with open("data/features.yaml") as stream:
    features = yaml.safe_load(stream)

def parse_list(value):
    if isinstance(value, str):
        value = value.replace('nan', 'None')
        value = value.replace('array(', '').replace('dtype=float32),', '').replace('dtype=float32)', '')  # age
    try:
        value = ast.literal_eval(value)
        if isinstance(value, list):
            value = np.nan if len(value) == 0 else value
        return value
    except (ValueError, SyntaxError):
        return np.nan


options_num = []
options_cat = []

for name, attrs in features.items():
    print(name)
    if name == 'skin':
        features_df['skin_ita'] = features_df['skin_ita'].apply(parse_list)
        options_num.append('skin_ita')

    elif attrs['type'] == 'detection': ## add number of detections and bbox area
        options_num.append(f'{name}_conf')
        options_cat.append(f'{name}_name')
        options_cat.append(f'{name}_qtty')
        options_num.append(f'{name}_area')

        if attrs['cardinality'] == 'multiple':
            features_df[f'{name}_name'] = features_df[f'{name}_name'].apply(parse_list)
            features_df[f'{name}_conf'] = features_df[f'{name}_conf'].apply(parse_list)
            features_df[f'{name}_bbox'] = features_df[f'{name}_bbox'].apply(parse_list)

        features_df[f'{name}_qtty'] = features_df[f'{name}_conf'].apply(lambda x: len(x) if isinstance(x, list) else x)
        features_df[f'{name}_area'] = features_df[f'{name}_bbox'].apply(lambda value: [v[2]*v[3] for v in value] if isinstance(value, list) else value)

        if attrs.get('hierarchy_levels',1) > 1 and attrs['cardinality'] == 'multiple':
            na_mask = features_df[f'{name}_name'].isna()
            nhier = attrs.get('hierarchy_levels',1)
            for level in range(nhier-1, 0, -1):
                features_df[f'{name}_name{level}'] = features_df[f'{name}_name']
                features_df.loc[~na_mask, f'{name}_name{level}'] = \
                        features_df.loc[~na_mask, f'{name}_name'].apply(
                            lambda value: [v.split('/')[nhier-level-1] for v in value]
                        )
                options_cat.append(f'{name}_name{level}')

            features_df[f'{name}_name'] = \
                    features_df.loc[~na_mask, f'{name}_name'].apply(
                            lambda value: [v.split('/')[-1] for v in value]
                        )

    elif attrs['type'] == 'metrics':
        for typ, var in zip(attrs['variable_types'], attrs['metrics']):
            if typ == 'numeric':
                options_num.append(var)
            elif typ == 'categorical':
                options_cat.append(var)
            # else is none

    else:  # classification
        classes = attrs.get('classnames', [])
        nc = len(classes)

        options_cat.append(f'{name}_name')

        if nc > 1 or attrs['cardinality'] == 'multiple':
            features_df[name] = features_df[name].apply(parse_list)

        if attrs['variable_types'][0] == 'numeric':
            options_num.append(f'{name}_conf')

            if attrs['cardinality'] == 'multiple' and nc >=1:
                na_mask = features_df[name].isna()
                features_df[f'{name}_name'] = features_df[name]
                features_df[f'{name}_conf'] = features_df[name]

                thres = attrs.get('threshold', 0.5)
                classes = attrs.get('classnames', name)
                if nc > 1:
                    features_df.loc[~na_mask, f'{name}_name'] = \
                            features_df.loc[~na_mask, name].apply(lambda x: [classes[v] for v in np.argmax(x, axis=1)])
                    features_df.loc[~na_mask, f'{name}_conf'] = \
                            features_df.loc[~na_mask, name].apply(lambda x: np.max(x, axis=1).tolist())
                else:
                    features_df.loc[~na_mask, f'{name}_name'] = \
                            features_df.loc[~na_mask, name].apply(lambda lst: [x>thres for x in lst])

            elif nc == 1:
                values = features_df[name].to_numpy()
                bool_mask = values > attrs.get('threshold', 0.5)
                features_df[f'{name}_name'] = bool_mask
                features_df[f'{name}_conf'] = values

            elif nc > 1:
                values = np.stack(features_df[name].to_numpy())
                features_df[f'{name}_name'] = [classes[v] for v in np.argmax(values, axis=1)]
                features_df[f'{name}_conf'] = np.max(values, axis=1)

        else: # categorical variables for classification are classnames
            if attrs.get('hierarchy_levels',1) > 1: # does not support multiple cardinality
                split = features_df[f'{name}'].str.split('/', expand=True)
                split.columns = split.columns[::-1]

                features_df[f'{name}_name'] = split.iloc[:, -1]
                for col in split.columns[:-1]:
                    features_df[f'{name}_name{col}'] = split[col]
                    options_cat.append(f'{name}_name{col}')
            else:
                features_df[f'{name}_name'] = features_df[name]


display(features_df)
print('##### Numeric variables #####')
print(options_num)

print('\n##### Categorical variables #####')
print(options_cat)


label
object
scene
porn
porn_2019
face
age
child
gender
skin
metadata
quality


Unnamed: 0,filename,label,object_name,object_bbox,object_conf,scene,porn,porn_2019,face_name,face_bbox,...,porn_2019_name,porn_2019_conf,face_qtty,face_area,age_name,age_conf,child_name,child_conf,gender_name,gender_conf
0,vNonPorn000852/image-00000031.jpeg,0,[motorcycle],"[[0.4861111342906952, 0.17499999701976776, 0.9...",[0.72265625],outdoor/natural/desert_sand,0.001114,"[0.015593965537846088, 0.9805508852005005, 0.0...",,,...,drawings,0.980551,,,,,,,,
1,vNonPorn000852/image-00000190.jpeg,0,,,,outdoor/natural/sky,0.000034,"[0.4277361035346985, 0.5674610733985901, 0.003...",,,...,drawings,0.567461,,,,,,,,
2,vNonPorn000852/image-00000025.jpeg,0,[motorcycle],"[[0.506944477558136, 0.15859375894069672, 0.80...",[0.88720703125],outdoor/natural/desert_sand,0.009209,"[0.04049646109342575, 0.9538776278495789, 0.00...",,,...,drawings,0.953878,,,,,,,,
3,vNonPorn000852/image-00000020.jpeg,0,"[motorcycle, person]","[[0.3361111283302307, 0.21484375, 0.7958333492...","[0.8798828125, 0.34912109375]",indoor/commercial/toyshop,0.000586,"[0.005201381631195545, 0.9921100735664368, 0.0...",,,...,drawings,0.992110,,,,,,,,
4,vNonPorn000852/image-00000001.jpeg,0,,,,outdoor/natural/desert_sand,0.066797,"[0.2318183034658432, 0.6421917080879211, 0.087...",,,...,drawings,0.642192,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21532,vPorn000085/image-00000758.jpeg,1,[person],"[[0.3305555582046509, 0.14166668057441711, 1.3...",[0.44921875],outdoor/natural/iceberg,0.954308,"[0.09647384285926819, 5.986062024021521e-05, 0...",,,...,porn,0.896125,,,,,,,,
21533,vPorn000085/image-00000206.jpeg,1,"[potted plant, bed, person]","[[0.013888888992369175, 0.0, 0.463888913393020...","[0.89013671875, 0.78662109375, 0.5751953125]",indoor/commercial/florist_shop_indoor,0.985882,"[0.002887272508814931, 0.00015475500549655408,...",[hasface],"[(188.74688720703125, -18, 239.93157958984375,...",...,porn,0.933806,1.0,[10703.120925893076],[25-30],[0.99985838],[False],[0.026391687],[False],[0.0018183456]
21534,vPorn000085/image-00000390.jpeg,1,"[person, bottle]","[[0.011111111380159855, 0.0020833334419876337,...","[0.92578125, 0.52490234375]",outdoor/human-made/water_park,0.999095,"[0.0032736805733293295, 6.582427158718929e-05,...",[hasface],"[(275.9510498046875, 20.67654037475586, 352.45...",...,porn,0.960444,1.0,[43531.23453550041],[25-30],[0.55442399],[False],[5.837897e-06],[True],[1.0]
21535,vPorn000085/image-00000022.jpeg,1,"[potted plant, person]","[[0.013888888992369175, 0.10833334177732468, 0...","[0.9189453125, 0.84326171875]",indoor/commercial/art_studio,0.502467,"[0.5949068069458008, 0.0035848089028149843, 0....",[hasface],"[(219.0843963623047, 57.12853240966797, 273.10...",...,neutral,0.594907,1.0,[30395.647001501173],[25-30],[0.88949162],[False],[0.0006667912],[True],[0.98678094]


##### Numeric variables #####
['object_conf', 'object_area', 'porn_conf', 'porn_2019_conf', 'face_conf', 'face_area', 'age_conf', 'child_conf', 'gender_conf', 'skin_ita', 'resolution', 'aspect', 'luminance', 'sharpness', 'BRISQUE']

##### Categorical variables #####
['label_name', 'object_name', 'object_qtty', 'object_name1', 'scene_name', 'scene_name2', 'scene_name1', 'porn_name', 'porn_2019_name', 'face_name', 'face_qtty', 'age_name', 'child_name', 'gender_name', 'extension', 'colormode']


In [3]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def scatter_pair(col_x, col_y):

    df_scatter = features_df[[col_x, col_y]].dropna()
    df_scatter = df_scatter.explode(col_x).explode(col_y).dropna()
    fig = px.scatter(df_scatter, x=col_x, y=col_y)
    # fig.write_html(f'visualizations/pair_{col_x}_{col_y}.html')

    fig.show()


def get_occurrence(df, col, topk):

    df_occ = pd.DataFrame(columns = sorted(df[col].unique()) )
    for c in df_occ.columns:
        df_occ[c] = df[col].map(str).str.contains(str(c)).astype(int)

    if len(df_occ.columns) > topk:
        top = df_occ.sum().sort_values(ascending=False)
        top = top.index[:topk]
        df_occ = df_occ[top]

    return df_occ


def heatmap_pair(col_x, col_y, normalize, topk=20):

    df_heat = features_df[[col_x, col_y]]
    df_heat = df_heat.explode(col_x).explode(col_y).dropna()

    df_x = get_occurrence(df_heat, col_x, topk)
    df_y = get_occurrence(df_heat, col_y, topk)

    heatmap = df_y.T.dot(df_x)
    if normalize == 'x':
        heatmap = (heatmap.T/np.sum(heatmap, axis=1)).T
    elif normalize == 'y':
        heatmap = heatmap/np.sum(heatmap, axis=0)

    fig = px.imshow(heatmap.to_numpy(), color_continuous_scale = 'PuBu',
                   labels=dict(x=col_x, y=col_y, color="count"))

    fig.update_layout(
        xaxis = dict(tickmode='array',
                     tickvals=np.arange(len(heatmap.columns)),
                     ticktext=heatmap.columns),
        yaxis = dict(tickmode='array',
                     tickvals=np.arange(len(heatmap.index)),
                     ticktext=heatmap.index)
        )

    ###########################################################
    # fig.write_html(f'pair_{col_x}_{col_y}_norm{normalize}.html')
    ###########################################################

    fig.show()

def boxplot_pair(cat, num):

    df_box = features_df[[cat, num]].dropna()
    df_box = df_box.explode(cat).explode(num).dropna()

#     fig = px.box(df_box, x=col_x, y=col_y)

    fig = px.histogram(df_box, x=num, color=cat,
                       marginal='box', barmode='group')
    fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True,
                thickness=0.05
            ),
        )
    )

    # fig.write_html(f'visualizations/pair_{cat}_{num}.html')
    fig.show()

def plot_pair(x, y, normalize=None):
    categories = ['num', 'list', 'cat']

    cat_x = 'num' if x in options_num else 'cat'
    cat_y = 'num' if y in options_num else 'cat'

    if cat_x == 'num' and cat_y == 'num':
        scatter_pair(x, y)
    elif cat_x == 'cat' and cat_y == 'cat':
        heatmap_pair(x, y, normalize)
    else:
        cat_dim = 'x' if cat_x == 'cat' else 'y'
        if cat_dim == 'x':
            boxplot_pair(x, y)
        else:
            boxplot_pair(y, x)

In [4]:
fig = plot_pair('object_area', 'porn_name', normalize='y')

## Statistical Parity

$G(y|x_1,x_2,DP) = P(y|x_1) - P(y|x_2)$

In [5]:
import matplotlib
from matplotlib import cm
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import ListedColormap

def find_interv(values):
    std95 = 1.96 * values.std()
    lower = values.mean()-std95
    upper = values.mean()+std95
    interv = np.linspace(lower, upper, 9)
    return interv

def find_idx(interv, v):
    if np.isnan(v): return np.nan
    diff = v - interv
    idx  = np.nonzero(diff < 0)[0]
    return idx[0] if len(idx) > 0 else len(interv)

def num2cat(df, col):
    def find_idx(value):
        diff = value-interv
        idx  = np.nonzero(diff < 0)[0]
        return idx[0] if len(idx) > 0 else len(interv)

    if 'conf' in col :
        interv = np.linspace(0.1, 0.9, 9)
    else:
        interv = find_interv(df[col])

    print('#'*30)
    print(f'Numerical variable {col} adjusted')
    [print(f'{k}: {i}') for k, i in enumerate(interv)]
    print('#'*30)

    return df[col].apply(find_idx)

def colormap_to_colorscale(cmap):
    #function that transforms a matplotlib colormap to a Plotly colorscale
    return [ [k*0.1, colors.rgb2hex(cmap(1-k*0.1))] for k in range(11)]

newcmp = ListedColormap(cm.get_cmap('coolwarm', 512)(np.linspace(0.15, 0.85, 256)))
cmap = colormap_to_colorscale(newcmp)

def statistical_parity(col_x, col_y):

    df_heat = features_df[[col_x, col_y]]
    df_heat = df_heat.explode(col_x).explode(col_y).dropna()

    if col_x in options_num: df_heat[col_x] = num2cat(df_heat, col_x)
    if col_y in options_num: df_heat[col_y] = num2cat(df_heat, col_y)

    df_x = pd.DataFrame(columns = sorted(df_heat[col_x].unique()) )
    for col in df_x.columns:
        df_x[col] = df_heat[col_x].map(str).str.contains(str(col)).astype(int)

    df_y = pd.DataFrame(columns = sorted(df_heat[col_y].unique()) )
    for col in df_y.columns:
        df_y[col] = df_heat[col_y].map(str).str.contains(str(col)).astype(int)

    heatmap = df_y.T.dot(df_x)

    ysum = heatmap.sum(axis=1)
    ysum /= np.sum(ysum)
    proportion = ysum
    xsum = heatmap.sum(axis=0)

    # confindence interval #
    xsum_inv = (1/xsum).to_numpy()
    conf = (proportion * (1-proportion)).to_numpy()
    conf = np.sqrt(conf[:, np.newaxis].dot(xsum_inv[np.newaxis, :])) * 2
    #########################

    expected = pd.DataFrame(ysum.to_numpy()[:, np.newaxis].dot(xsum.to_numpy()[np.newaxis,:]))
    expected.index = heatmap.index
    expected.columns = heatmap.columns
    bias = heatmap / expected

    ########### check confidence interval ###########
    upper = proportion.to_numpy()[:, np.newaxis] + conf
    upper = upper * xsum.to_numpy()[np.newaxis, :]

    lower = proportion.to_numpy()[:, np.newaxis] - conf
    lower = lower * xsum.to_numpy()[np.newaxis, :]

    colors = bias.copy()
    colors[(heatmap > lower) & (heatmap < upper)] = np.nan
    colors[colors < 1] = -expected / heatmap
    colors[np.isnan(colors)] = 0
    colors = colors.to_numpy()
    #######################################################

    fig = px.imshow(colors, color_continuous_scale = cmap,
#                     color_continuous_midpoint=0,
                    range_color = [-2, 2],  #interface button to set range
                    labels=dict(x=col_x, y=col_y))

    fig.update_traces(showlegend=False,
                     customdata=np.dstack((heatmap.to_numpy(), expected.to_numpy(), bias.to_numpy())),
                     hovertemplate="<br>".join([
                        col_x+": %{x}",
                        col_y+": %{y}",
                        "Value: %{customdata[0]}",
                        "Expected: %{customdata[1]:.2f}",
                        "Representation: %{customdata[2]:.2f}x"
                    ]))

    fig.update_layout(
        xaxis = dict(tickmode='array',
                     tickvals=np.arange(len(heatmap.columns)),
                     ticktext=heatmap.columns),
        yaxis = dict(tickmode='array',
                     tickvals=np.arange(len(heatmap.index)),
                     ticktext=heatmap.index)
        )
    fig = go.FigureWidget(fig)
    fig.show()


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.



In [6]:
statistical_parity('object_area', 'label_name')

##############################
Numerical variable object_area adjusted
0: -0.0550405123648231
1: 0.09581433057432931
2: 0.24666917351348172
3: 0.39752401645263413
4: 0.5483788593917865
5: 0.699233702330939
6: 0.8500885452700914
7: 1.0009433882092438
8: 1.1517982311483963
##############################
