In [1]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import intransparent as it

# Human-readable month span of each calendar quarter, indexed by
# `quarter - 1`; the comparison loop uses it to build the label of the
# newest reporting period (e.g. 'Oct-Dec 2023').
MONTH_RANGES = ['Jan-Mar', 'Apr-Jun', 'Jul-Sep', 'Oct-Dec']

# Case-folded names of the columns that together identify one row of a
# disclosure; used as the join key when two quarterly files are merged.
# Kept as a list because it is handed to `pd.merge(on=...)`.
COLUMNS = list(map(str.casefold, (
    'Metric',
    'Period Type',
    'Period',
    'Policy Type',
    'Issue Policy',
    'Task Type',
    'Task',
    'Location',
    'Market',
)))

def _casefold(value):
    """Return `value` case-folded so labels compare case-insensitively."""
    return value.casefold()


def _format_result(value):
    """Render a result as a string with three significant digits.

    Floats are formatted as-is, integers are converted to float first, and
    anything else is treated as text whose thousands separators (",") are
    stripped before parsing it as a float.
    """
    if pd.api.types.is_float(value):
        number = value
    elif pd.api.types.is_integer(value):
        number = float(value)
    else:
        number = float(value.replace(",", ""))
    return f'{number:.3}'


# Public aliases under the names the rest of the notebook refers to.
CASEFOLD = _casefold
MAKEFLOAT = _format_result

# Per-column converters for `pd.read_csv`: every label column is case-folded
# ('Period' is deliberately absent and therefore left untouched), while
# 'Result' is normalized to a three-significant-digit string.
CONVERTERS = {
    **dict.fromkeys(
        (
            'Metric',
            'Period Type',
            'Policy Type',
            'Issue Policy',
            'Task Type',
            'Task',
            'Location',
            'Market',
        ),
        CASEFOLD,
    ),
    'Result': MAKEFLOAT,
}

# First and last quarterly disclosure to compare; the loop walks every
# consecutive pair of quarters from START up to and including STOP.
START, STOP = map(pd.Period, ('2022Q4', '2023Q4'))


def _read_quarter(period):
    """Load the TikTok disclosure CSV for one quarter.

    Parameters
    ----------
    period : pd.Period
        Quarter whose file `../data/tiktok/tiktok-<year>-q<quarter>.csv`
        is read.

    Returns
    -------
    pd.DataFrame
        The file's contents with CONVERTERS applied and with column names
        case-folded, i.e. normalized the same way as the COLUMNS join key.
    """
    frame = pd.read_csv(
        f'../data/tiktok/tiktok-{period.year}-q{period.quarter}.csv',
        thousands=',',
        converters=CONVERTERS,
    )
    # COLUMNS is built with casefold(); use the same normalization for the
    # headers so the merge keys are guaranteed to line up.
    frame.columns = frame.columns.str.casefold()
    return frame


it.show('<h1>TikTok</h1>')

# One frame of diverging rows per pair of quarters that disagree.
differences = []
period1 = START
data1 = None

# Compare every disclosure with its successor: rows present in both files
# (same key columns) should carry the same result.
while period1 < STOP:
    period2 = period1 + 1

    it.show(f'<h2>{period1} to {period2}</h2>')

    # Only the very first iteration has to load the older file; afterwards
    # the newer frame of the previous iteration is reused.
    if data1 is None:
        data1 = _read_quarter(period1)

    data2 = _read_quarter(period2)

    # Rows of the newer file minus the quarter it newly reports. Only used
    # for the row-count table; the inner merge below takes the full `data2`.
    latest = f'{MONTH_RANGES[period2.quarter - 1]} {period2.year}'
    old_data2 = data2[data2['period'] != latest]

    it.show(pd.DataFrame({
        'dataset': [str(period1), str(period2), str(period2) + ' w/o new'],
        'rows': [len(data1), len(data2), len(old_data2)],
    }))

    # The quarter labels double as merge suffixes, so the two result columns
    # become e.g. `result2022Q4` and `result2023Q1`.
    label1 = str(period1)
    label2 = str(period2)

    difference = (
        pd.merge(
            data1,
            data2,
            how='inner',
            on=COLUMNS,
            suffixes=(label1, label2),
        )
        # Keep rows where at least one side has a value ...
        # NOTE(review): the 'Result' converter yields strings, so this filter
        # probably never drops a row — confirm before relying on it.
        .query(f'not result{label1}.isna() or not result{label2}.isna()')
        # ... and the two disclosures disagree on it.
        .query(f'result{label1} != result{label2}')
        .sort_values(['metric', 'issue policy', 'market', 'period'])
    )

    if len(difference) == 0:
        it.show('✅ No differences in historical data!')
    else:
        it.show(f'❌ Sadly, historical data diverges across {len(difference)} rows.')
        differences.append(difference)

    # Slide the window forward by one quarter.
    period1 = period2
    data1 = data2

# Set to False to suppress the (potentially long) tables of diverging rows.
SHOW_DIFFERENCES = True

# Render each collected frame of diverging rows; nothing is shown when the
# toggle is off or when no divergence was recorded.
for difference in (differences if SHOW_DIFFERENCES else ()):
    it.show(difference)

Unnamed: 0,dataset,rows
0,2022Q4,1030
1,2023Q1,1522
2,2023Q1 w/o new,1140


Unnamed: 0,dataset,rows
0,2023Q1,1522
1,2023Q2,1912
2,2023Q2 w/o new,1522


Unnamed: 0,dataset,rows
0,2023Q2,1912
1,2023Q3,8704
2,2023Q3 w/o new,1912


Unnamed: 0,dataset,rows
0,2023Q3,8704
1,2023Q4,15966
2,2023Q4 w/o new,8704


In [2]:
from intransparent.tiktok import Processor

# Section heading for the Q4 2023 deep dive.
it.show("<h2>A Closer Look at Q4 2023</h2>")

# Build the statistics table for the 2023 Q4 disclosure and render it.
q4_processor = Processor.for_period("2023q4")
stats = q4_processor.quarter_only().stats()
it.show(stats)

# Context for the numbers above: NCMEC's reported CyberTipline count.
ncmec_note = """
    Yet according to NCMEC, TikTok submitted only 590,376 CyberTipline reports
    for all of 2023.
    """
it.show(ncmec_note)

Unnamed: 0,Statistics
Total videos removed,176461963.0
Category share: Safety & civility,0.135
Subcategory share: Youth exploitation & abuse,0.232
Subcategory count: Youth exploitation & abuse,5526788.0
Subcategory share: Youth safety & well-being - youth exploitation & abuse,0.345
Speculative subcategory count: Previous two rows multiplied,1906741.0
