In [2]:
import os
if not os.path.isdir('data'):
    os.mkdir('data')
    !gdown 1BRgFthsGrutapFp-pXPuw_pmH7-C-TWb -O data/ # features.yaml
    !gdown 1QbpJLMltcObmNcBpRdMKaLQCxDKvJ_Uu -O data/ # samples_porn2k_241029_182645.csv

Downloading...
From: https://drive.google.com/uc?id=1fEoEiYum2fkX68V52ITI73LpmadF_L0A
To: /content/data/features.yaml
100% 1.96k/1.96k [00:00<00:00, 7.57MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbpJLMltcObmNcBpRdMKaLQCxDKvJ_Uu
To: /content/data/samples_porn2k_241029_182645.csv
100% 24.1M/24.1M [00:00<00:00, 51.2MB/s]


In [6]:
import pandas as pd
# Load the per-image feature table; `features_df` is read as a global by the
# display_* plotting functions defined further down.
features_df = pd.read_csv('data/samples_porn2k_241029_182645.csv')
# Print the full column list (the rich table below elides the middle columns).
print(features_df.columns)
# Last expression of the cell: rendered as the cell output.
features_df.head(5)

Index(['filename', 'label', 'object_name', 'object_bbox', 'object_conf',
       'scene', 'porn', 'porn_2019', 'face_name', 'face_bbox', 'face_conf',
       'age', 'child', 'gender', 'skin_ita', 'skin_patch', 'extension',
       'resolution', 'aspect', 'colormode', 'luminance', 'sharpness',
       'BRISQUE'],
      dtype='object')


Unnamed: 0,filename,label,object_name,object_bbox,object_conf,scene,porn,porn_2019,face_name,face_bbox,...,gender,skin_ita,skin_patch,extension,resolution,aspect,colormode,luminance,sharpness,BRISQUE
0,vNonPorn000852/image-00000031.jpeg,0,['vehicle/motorcycle'],"[[0.4861111342906952, 0.17499999701976776, 0.9...",[0.72265625],outdoor/natural/desert_sand,0.001114,"[0.015593965537846088, 0.9805508852005005, 0.0...",,,...,,,,jpeg,0.9216,1.777778,RGB,154.833044,48.411537,65.35907
1,vNonPorn000852/image-00000190.jpeg,0,[],,,outdoor/natural/sky,3.4e-05,"[0.4277361035346985, 0.5674610733985901, 0.003...",,,...,,,,jpeg,0.9216,1.777778,RGB,156.586895,163.121391,60.052917
2,vNonPorn000852/image-00000025.jpeg,0,['vehicle/motorcycle'],"[[0.506944477558136, 0.15859375894069672, 0.80...",[0.88720703125],outdoor/natural/desert_sand,0.009209,"[0.04049646109342575, 0.9538776278495789, 0.00...",,,...,,,,jpeg,0.9216,1.777778,RGB,156.691808,78.72397,64.400085
3,vNonPorn000852/image-00000020.jpeg,0,"['vehicle/motorcycle', 'person/person']","[[0.3361111283302307, 0.21484375, 0.7958333492...","[0.8798828125, 0.34912109375]",indoor/commercial/toyshop,0.000586,"[0.005201381631195545, 0.9921100735664368, 0.0...",,,...,,,,jpeg,0.9216,1.777778,RGB,155.922624,65.196866,65.346375
4,vNonPorn000852/image-00000001.jpeg,0,[],,,outdoor/natural/desert_sand,0.066797,"[0.2318183034658432, 0.6421917080879211, 0.087...",,,...,,,,jpeg,0.9216,1.777778,L,0.0,0.0,


In [4]:
import ast
import re

import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

# Folder that receives every exported HTML figure; created on first use.
savepath = 'output'
output_dir_missing = not os.path.isdir(savepath)
if output_dir_missing:
    os.mkdir(savepath)
del output_dir_missing  # keep the notebook namespace unchanged

#### utility function #####
def parse_list(value):
    """Decode a stringified Python/numpy list stored in a CSV cell.

    The text is cleaned up so that ``ast.literal_eval`` can read it:
    standalone ``nan`` tokens become ``None`` and the ``array(...)`` /
    ``dtype=float32`` wrappers of a numpy repr are stripped.

    Args:
        value: cell content. Strings are decoded; any other object is
            returned unchanged.

    Returns:
        The decoded Python object, or ``None`` when the cleaned text is not a
        valid Python literal.
    """
    if not isinstance(value, str):
        return value
    # Replace only whole-word `nan`: a plain substring replace would also
    # rewrite names that merely contain it ('banana' -> 'baNonea').
    value = re.sub(r'\bnan\b', 'None', value)
    value = value.replace('array(', '').replace('dtype=float32),', '').replace('dtype=float32)', '')  # age
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return None
##########################

def display_metrics(name, columns, show=False):
    """Draw one histogram per metric column, laid out side by side.

    Args:
        name: group name; used in the figure title and the output file name.
        columns: names of the ``features_df`` columns to plot.
        show: if True display the figure, otherwise write it to
            ``{savepath}/metrics_{name}.html``.
    """
    n_panels = len(columns)
    panel_titles = [f"{col.capitalize()}" for col in columns]
    fig = sp.make_subplots(rows=1, cols=n_panels, subplot_titles=panel_titles)

    for panel, col in enumerate(columns, start=1):
        n_unique = features_df[col].nunique()
        # Columns with at most 3 distinct values get one bin per value.
        if n_unique > 3:
            bins = 20
        else:
            bins = n_unique
        fig.add_trace(go.Histogram(x=features_df[col], nbinsx=bins),
                      row=1, col=panel)

    fig.update_layout(
        height=400,
        width=300 * n_panels,
        title_text=f"{name.capitalize()} Metrics",
        showlegend=False,
    )

    if show:
        fig.show()
    else:
        fig.write_html(f"{savepath}/metrics_{name}.html")

def display_classification(name, attrs, show=False):
    """Plot the distribution of one classification feature of ``features_df``.

    Two layouts are produced depending on ``attrs``:

    * ``classnames`` present: one histogram per class of the values stored
      in the column (left), plus a bar chart with the number of samples per
      class (right). With a single class the count is split at the threshold;
      with several classes each sample counts for its arg-max class.
    * ``classnames`` absent: the column holds class names (optionally
      '/'-separated hierarchies); one bar chart per hierarchy level.

    Args:
        name: column of ``features_df``; also used for the title and the
            output file name.
        attrs: feature description. Reads ``cardinality`` (required) and the
            optional ``classnames``, ``threshold`` and ``hierarchy_levels``.
        show: if True display the figure, otherwise write it to
            ``{savepath}/classification_{name}.html``.
    """
    classes = attrs.get('classnames', [])
    nc = len(classes)
    multiple = attrs['cardinality'] == 'multiple'

    values = features_df[name].dropna()
    if nc > 1 or multiple:
        # parse_list yields None for cells it cannot decode; drop those so
        # they do not reach np.stack / the string split below.
        values = values.apply(parse_list).dropna()
    if multiple:
        # Series.explode has no column argument (its only parameter is
        # `ignore_index`), so request the fresh index explicitly. Empty lists
        # explode to NaN and are dropped.
        values = values.explode(ignore_index=True).dropna()

    if nc >= 1:
        specs = [[{"secondary_y": True}, {"rowspan": nc}]]
        specs.extend([[{"secondary_y": True}, None]]*(nc-1))

        fig = sp.make_subplots(
            rows=nc, cols=2, specs=specs,
            shared_xaxes=True,
            column_widths=[0.6, 0.4],
            horizontal_spacing = 0.1,
            subplot_titles=('Probability Distribution', 'Total')
        )

        values = np.stack(values.to_numpy())
        if nc == 1: values = values[:,None]  # (N,) -> (N, 1): one column per class
        for k in range(nc):
            fig.add_trace(go.Histogram(x=values[:,k]), row=k+1, col=1, secondary_y=False)
            if nc > 1:
                # The empty trace only exists so the secondary y axis is
                # drawn; its title is used as the per-row class label.
                fig.add_trace(go.Scatter(x=[None],y=[None]), row=k+1, col=1, secondary_y=True) # dummy
                fig.update_yaxes(showticklabels=False, title_text=classes[k] , row=k+1, col=1, secondary_y=True)

            if 'threshold' in attrs:
                # Shade the region above the decision threshold.
                fig.add_vrect(x0=attrs['threshold'], x1=1.01, line_width=0,
                              opacity=0.1, layer='below', annotation_text=f'Class={classes[0].capitalize()}',
                              fillcolor="red", annotation_position="top left", row=k+1, col=1)

        if nc == 1:
            thres = attrs.get('threshold', 0.5)
            positive = int((values > thres).sum())
            total = [positive, len(values)-positive]
            ticks = [f'{classes[0].capitalize()}=True', f'{classes[0].capitalize()}=False']
        else:
            pred = np.argmax(values, axis=1)
            # bincount keeps a zero for classes that are never predicted, so
            # the counts stay aligned with `classes` (np.unique would drop
            # them and shift the remaining bars onto the wrong labels).
            total = np.bincount(pred, minlength=nc)
            ticks = classes

        fig_total = go.Bar(x=ticks, y=total, showlegend=False, hoverinfo='y',
                           width=[0.7]*len(ticks),marker_color='#636efa')
        fig.add_trace(fig_total, row=1, col=2)

        fig.update_layout(showlegend=False, title=name.capitalize(), height=600)
        fig.update_xaxes(type='category', row=1, col=2)
    else:
        # special case, too many classes to plot probabilities
        # data is provided in classnames, not in probabilities
        ncols = attrs.get('hierarchy_levels', 1)  # same default as display_detection
        column_widths = [0.6] if ncols==1 else [0.6] + [0.3/(ncols-1)]*(ncols-1)
        subplot_titles = ['Classes'] + [f'Macro level {i}' for i in range(1, ncols)]
        fig = sp.make_subplots(
            rows=1, cols=ncols,
            column_widths=column_widths,
            horizontal_spacing = 0.1,
            subplot_titles=subplot_titles
        )
        classes = pd.DataFrame(values) if ncols==1 else values.str.split('/', expand=True)
        for c, col_name in enumerate(classes.columns):
            histogram = classes.groupby(by=col_name)\
                        .size().reset_index(name="counts")\
                        .sort_values('counts', ascending=False)
            fig_hist = px.bar(data_frame=histogram,y="counts",x=col_name)
            # The first split component goes to the right-most panel, the
            # last one to the 'Classes' panel on the left.
            fig.add_trace(fig_hist.data[0], row=1, col=ncols-c)

        fig.update_layout(showlegend=False,title=name.capitalize())

    if show: fig.show()
    else: fig.write_html(f"{savepath}/classification_{name}.html")

def display_detection(name, attrs, show=False):
    """Plot summary statistics of one detection feature of ``features_df``.

    Reads the ``{name}_name``, ``{name}_bbox`` and ``{name}_conf`` columns
    and draws, left to right: detections per image, one bar chart per class
    hierarchy level, a box plot of the confidences and a box plot of the
    bounding-box areas.

    Args:
        name: detection prefix of the three columns; also used for the title
            and the output file name.
        attrs: feature description. Reads ``cardinality`` (required) and the
            optional ``hierarchy_levels`` (default 1).
        show: if True display the figure, otherwise write it to
            ``{savepath}/detection_{name}.html``.
    """
    multiple = attrs['cardinality'] == 'multiple'

    # name	bbox conf
    data = {}
    num_per_img = None
    for key in ('name', 'bbox', 'conf'):
        column = features_df[f'{name}_{key}'].dropna()
        if multiple:
            # Cells that parse_list cannot decode come back as None: drop them.
            column = column.apply(parse_list).dropna()
            # Detections per image are taken from the first column ('name').
            if num_per_img is None: num_per_img = column.apply(len)
            # Series.explode has no column argument (its only parameter is
            # `ignore_index`); empty lists explode to NaN and are dropped.
            column = column.explode(ignore_index=True).dropna()
        data[key] = column

    # Images with no detection entry at all are the rows whose *name* cell is
    # missing. Counting missing `_conf` cells instead would count a second
    # time the rows whose name list is empty ([]) - those already contribute
    # a 0 through `num_per_img`.
    num_absent = int(features_df[f'{name}_name'].isna().sum())
    if num_per_img is None:
        num_per_img = pd.Series([1]*len(data['name']))
    num_per_img = pd.concat( (num_per_img, pd.Series([0]*num_absent, dtype=int)) )

    nhier = attrs.get('hierarchy_levels',1)
    ncols = 3 + nhier
    column_widths = [0.3]*nhier + [0.2, 0.1, 0.1]
    subplot_titles = ['Detections per image', 'Classes'] + [f'Macro level {i}' for i in range(1, nhier)]
    subplot_titles += ['Conf', 'Rel. Area']
    fig = sp.make_subplots(
            rows=1, cols=ncols,
            column_widths=column_widths,
            horizontal_spacing = 0.05,
            subplot_titles=(subplot_titles)
        )

    # value_counts() is already sorted by decreasing count; using its index
    # directly avoids depending on the column name reset_index() would pick.
    count = num_per_img.value_counts()
    fig_count = go.Bar(x=count.index.to_numpy(), y=count.to_numpy(), showlegend=False)
    fig.add_trace(fig_count, row=1, col=1)

    classes = pd.DataFrame(data['name']) if nhier==1 else data['name'].str.split('/', expand=True)
    for c, col_name in enumerate(classes.columns):
        histogram = classes.groupby(by=col_name)\
                    .size().reset_index(name="counts")\
                    .sort_values('counts', ascending=False)
        fig_hist = px.bar(data_frame=histogram,y="counts",x=col_name)
        # The first split component goes to the right-most class panel, the
        # last one to the 'Classes' panel (column 2).
        fig.add_trace(fig_hist.data[0], row=1, col=nhier-c+1)

    fig_conf = px.box(data['conf'], y=f'{name}_conf')
    fig.add_trace(fig_conf.data[0], row=1, col=nhier+2)

    boxes = data['bbox']
    if not multiple:
        # Single-cardinality cells were not decoded above; a box read from
        # the CSV is still text at this point.
        boxes = boxes.apply(lambda b: parse_list(b) if isinstance(b, str) else b).dropna()
    # presumably boxes are [x, y, w, h] in relative units, so w*h is the
    # relative area - TODO confirm against the feature extractor
    area = boxes.apply(lambda box: box[2]*box[3])
    fig_area = px.box(area, y=f'{name}_bbox')
    fig.add_trace(fig_area.data[0], row=1, col=nhier+3)

    fig.update_layout(showlegend=False,title=name.capitalize())

    if show: fig.show()
    else: fig.write_html(f"{savepath}/detection_{name}.html")

def display_skin(attrs, show=False):
    """Plot the ITA distribution above a colour strip with Fitzpatrick ticks.

    Args:
        attrs: feature description; not used, kept for a uniform call
            signature with the other display_* functions.
        show: if True display the figure, otherwise write it to
            ``{savepath}/metric_skin.html``.
    """
    # Series.explode has no column argument (its only parameter is
    # `ignore_index`). Undecodable cells (None) and empty lists (NaN after
    # exploding) are dropped; they carry no ITA value to plot.
    ita = (features_df['skin_ita'].dropna()
           .apply(parse_list).dropna()
           .explode(ignore_index=True).dropna())

    fig = sp.make_subplots(rows=2, cols=1, row_heights=[0.85, 0.15],
                    subplot_titles=('ITA Distribution', 'Fitzpatrick Scale'))

    fig_hist = px.histogram(ita)
    fig_hist.data[0].showlegend = False
    fig.add_trace(fig_hist.data[0], row=1, col=1)

    ####################################
    # ITA boundaries shifted by +50 so that they can be used directly as
    # pixel columns of the 120 px wide colour strip.
    ita_ranges = [i+50 for i in [-50, -30, -10, 10, 28, 41, 55, 70]]
    # ITA boundaries (approximate)
    fitzpatrick_labels = ['VI: -30', 'V: -10', 'IV: 10', 'III: 28', 'II: 41', 'I: 55']
    colors = [(41, 36, 32), (90, 46, 27), (139, 69, 19), (197, 129, 77),
              (230, 169, 126), (255, 209, 184), (255, 237, 225)]

    # One colour band per pair of consecutive boundaries.
    img = np.zeros( (6, 120, 3) )
    for k, color in enumerate(colors):
        img[:, ita_ranges[k]:ita_ranges[k+1]] = color

    fig_show = go.Image(z=img)
    fig.add_trace(fig_show, row=2, col=1)
    ####################################

    fig.update_layout(height=700,)
    # Ticks sit on the inner boundaries only (first and last are the edges).
    fig.update_xaxes(tickvals=ita_ranges[1:-1], ticktext=fitzpatrick_labels, row=2, col=1)
    fig.update_yaxes(showticklabels=False, row=2, col=1)

    if show: fig.show()
    else: fig.write_html(f"{savepath}/metric_skin.html")

In [9]:
import yaml

# Read the feature description: a mapping of feature name -> attributes.
with open("data/features.yaml") as stream:
    features = yaml.safe_load(stream)

# Export one report per feature, dispatching on the feature kind.
for name, attrs in features.items():
    print(name)
    # The skin and detection reports are switched off for now
    # (display_skin(attrs) / display_detection(name, attrs) are not called).
    if name == 'skin' or attrs['type'] == 'detection':
        continue
    if attrs['type'] == 'metrics':
        display_metrics(name, attrs['metrics'])
        continue
    # Everything else is treated as a classification feature.
    display_classification(name, attrs)


label
object
scene
porn
porn_2019
face
age
child
gender
skin
metadata
quality


In [10]:
# Archive every file currently present in output/ into output.zip.
# The glob picks up whatever is on disk, including reports written by
# earlier runs of the notebook.
!zip output.zip output/*

  adding: output/classification_age.html (deflated 68%)
  adding: output/classification_child.html (deflated 70%)
  adding: output/classification_gender.html (deflated 70%)
  adding: output/classification_label.html (deflated 71%)
  adding: output/classification_porn_2019.html (deflated 66%)
  adding: output/classification_porn.html (deflated 70%)
  adding: output/classification_scene.html (deflated 71%)
  adding: output/detection_face.html (deflated 70%)
  adding: output/detection_object.html (deflated 70%)
  adding: output/metric_skin.html (deflated 70%)
  adding: output/metrics_metadata.html (deflated 75%)
  adding: output/metrics_quality.html (deflated 69%)
