In [1]:
import pandas as pd
from IPython.display import display, HTML

# Meta's transparency data is organized into app, policy_area, metric, period,
# and value columns. The metrics listed under COUNT have an integer count with
# comma as thousands separator as value, and the metrics listed under PERCENT
# have a percent value with trailing percent sign as value. When a value for
# either kind of metric is not available, it is marked as N/A.

# Combining different kinds of metrics within the same column is bad form
# because they don't even share the same type. Hence we parse the value column
# as strings and undo the damage afterwards.

COUNT = (
    'Content Actioned',
    'Content Appealed',
    'Content Restored with appeal',
    'Content Restored without appeal',
)

PERCENT = (
    'Proactive rate',
    'UBP',
    'Prevalence',
    'Lowerbound Prevalence',
    'Upperbound Prevalence',
)

METRICS = frozenset([*COUNT, *PERCENT])

SCHEMA = {
    'app': 'category',
    'policy_area': 'category',
    'metric': 'category',
    'period': 'period[Q]',
    'value': 'string',
}

def extract_count(df):
    return (
        df.loc[df['metric'].isin(COUNT), 'value']
        .str.replace(',', '')
        .astype('UInt64')
    )

def extract_percent(df):
    return (
        df.loc[df['metric'].isin(PERCENT), 'value']
        .str.rstrip('%')
        .astype('float')
    )

def loadQ(quarter, year=2022):
    df = (
        pd.read_csv(f'data/meta-q{quarter}-{year}.csv', dtype=SCHEMA)
        .assign(count=extract_count)
        .assign(percent=extract_percent)
    )
    assert set(df['metric'].unique() == METRICS)
    return df

meta = loadQ(3)
display(HTML('<h1>Meta</h1><h2>Meta: Q3 Data Overview</h2>'))
display(meta.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2780 entries, 0 to 2779
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype        
---  ------       --------------  -----        
 0   app          2780 non-null   category     
 1   policy_area  2780 non-null   category     
 2   metric       2780 non-null   category     
 3   period       2780 non-null   period[Q-DEC]
 4   value        1788 non-null   string       
 5   count        1243 non-null   UInt64       
 6   percent      545 non-null    float64      
dtypes: UInt64(1), category(3), float64(1), period[Q-DEC](1), string(1)
memory usage: 98.4 KB


None

In [2]:
# Compare the Q2 2022 and Q3 2022 datasets:
# 
#  1. Since the latter obviously contains additional entries, we first perform
#     an inner join on all columns but value, count, and percent. The join
#     appends 2 for the Q2 versions of value, count, and percent as well as 3 for
#     the Q3 versions.
#  2. Since N/A is incomparable even with itself, we next drop all rows where both
#     value2 and value3 are N/A.
#  3. Since integers may be written with or without thousands separators and
#     percent values with or without trailing zeros after the decimal, we drop rows
#     with different count2 and count3 for counted metrics and different percent2
#     and percent3 for fractional metrics.

diff = (
    pd.merge(
        loadQ(2),
        meta,
        how='inner',
        on=['app', 'policy_area', 'metric', 'period'],
        suffixes=('2', '3'),
    )
    .query('not value2.isna() or not value3.isna()')
    .query(
        'metric in @COUNT and count2 != count3 '
        'or metric in @PERCENT and percent2 != percent3'
    )
    .sort_values(['period', 'metric', 'policy_area'])
)

display(HTML('<h2>Meta: Differences between Q2 and Q3</h2>'))
print(len(diff), 'entries differ between Q2 and Q3 2022')

113 entries differ between Q2 and Q3 2022


In [3]:
print('Different entries per period, counts only, fractions only')
display(diff[['period', 'value2', 'count2', 'percent2']].groupby('period').count())

Different entries per period, counts only, fractions only


Unnamed: 0_level_0,value2,count2,percent2
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020Q4,77,58,19
2021Q1,3,0,3
2021Q2,4,0,4
2022Q2,29,27,2


In [4]:
q42020 = (
    diff[diff['period'].astype(str) == '2020Q4']
    .query('metric in @COUNT')
    .drop(columns=['value2', 'value3', 'percent2', 'percent3'])
)
q42020['delta'] = q42020[['count2', 'count3']].pct_change(axis=1)['count3'] * 100.0

min = q42020['delta'].min()
max = q42020['delta'].max()

print('For', len(q42020), 'divergent entries from Q4 2020 with counts:')
print('min diff', f'{min:6.2f}%')
print('max diff', f'{max:6.2f}%')

For 58 divergent entries from Q4 2020 with counts:
min diff -50.00%
max diff  -0.14%


In [5]:
CSAM = (
    'Child Nudity & Sexual Exploitation',
    # -- End Q1 2021 -- Start Q2 2021 --
    'Child Endangerment: Sexual Exploitation',
)

csam = (
    meta.query('policy_area in @CSAM and metric == "Content Actioned"')
    .drop(columns=['app', 'policy_area', 'metric', 'value', 'percent'])
    .set_index('period')
    .pipe(lambda df: df.groupby(df.index).sum())
)

display(HTML('<h2>Meta: CSAM Pieces Per Quarter</h2>'))
display(csam.style.format('{:,}'))

Unnamed: 0_level_0,count
period,Unnamed: 1_level_1
2017Q4,0
2018Q1,0
2018Q2,0
2018Q3,9000000
2018Q4,7200000
2019Q1,5800000
2019Q2,7426200
2019Q3,12155800
2019Q4,13986400
2020Q1,9500000


In [6]:
yearly = csam.groupby(csam.index.year).sum()

display(HTML('<h2>Meta: CSAM Pieces Per Year</h2>'))
display(yearly.style.format('{:,}'))

Unnamed: 0_level_0,count
period,Unnamed: 1_level_1
2017,0
2018,16200000
2019,39368400
2020,28187700
2021,78012400
2022,71000000
