# Visualizing CSAM Reports Per Country

In [1]:
# Import numpy, pandas, and plotly: np, pd, go, px
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

## 1. Reports Per Country

Load the reports-per-country dataset and perform some sanity checks:

In [2]:
# Read the per-country report counts. thousands=',' makes pandas parse
# comma-grouped numbers as integers; rows without an ISO-3 code are dropped.
reports = pd.read_csv('reports.csv', thousands=',').dropna(subset=['iso3'])

def count_reports_for(iso3):
    """Return the report count stored in `reports` for the given ISO-3 code.

    Fails an assertion unless exactly one row of `reports` matches `iso3`.
    """
    is_match = reports['iso3'] == iso3
    counts = reports.loc[is_match, 'reports']
    assert len(counts) == 1
    return counts.iloc[0]

# Spot-check a handful of known report counts against the loaded data.
expected_counts = {
    'AFG': 283_116,
    'TUV': 25,
    'YEM': 243_340,
    'CHN': 7_644,
}
for code, expected in expected_counts.items():
    assert count_reports_for(code) == expected

# Each ISO-3 code must occur exactly once.
assert len(reports) == reports['iso3'].nunique()
# This would fail without the thousands argument to read_csv:
assert 'int64' == str(reports.dtypes['reports'])

# Set of all ISO-3 codes that have report data.
report_countries = set(reports['iso3'])
reports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 242
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  243 non-null    object
 1   iso3     243 non-null    object
 2   reports  243 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 7.6+ KB


## 2. Population Per Country

Load the population-per-country dataset, perform some sanity checks, and
determine for which countries we lack data.

In [3]:
# Build on https://ourworldindata.org/grapher/population

# Keep only the 2021 rows that carry an ISO-3 code, and only the two columns
# used downstream.
population_all = pd.read_csv('population.csv')
has_iso3 = population_all['iso3'].notnull()
is_2021 = population_all['year'] == 2021
capita = population_all.loc[has_iso3 & is_2021, ['iso3', 'population']]

# Sanity checks: one row per ISO-3 code, integer population counts.
assert len(capita) == capita['iso3'].nunique()
assert 'int64' == str(capita.dtypes['population'])

# Countries that have report data but no population figure.
capita_countries = set(capita['iso3'])
missing_countries = report_countries - capita_countries
print('Population data is missing for', sorted(missing_countries), '\n')

assert {'ATA', 'CCK'} <= missing_countries
assert 'XXK' not in missing_countries

capita.info()

Population data is missing for ['ALA', 'ANT', 'ATA', 'BLM', 'BVT', 'CCK', 'CXR', 'IOT'] 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 258 to 57171
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   iso3        237 non-null    object
 1   population  237 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.6+ KB


### 2.1 Create Remedial Dataset By Googling Missing Values

It's not reassuring that NCMEC's data contains an entry for the Netherlands
Antilles (ANT) as well as Bonaire, Saint Eustatius and Saba (BES), Curaçao
(CUW), and Sint Maarten (SXM), since the former was split into the latter three.
Antarctica (ATA) may have a permanent population of zero but also has research
stations. Finally, since Bouvet Island is uninhabited, it's unclear how CSAM can
be hosted there.

In [4]:
# Hand-collected population figures for the entries that the population
# dataset does not cover.
supplement = pd.DataFrame([
    ['ALA', 29_789],
    ['ANT', 0], # Netherlands Antilles was split into BES, CUW, SXM
    ['ATA', 0],
    ['BLM', 9_952],
    ['BVT', 0],
    ['CCK', 596],
    ['CXR', 1_402],
    ['IOT', 3_000],
], columns=['iso3', 'population'])

supplement_countries = set(supplement['iso3'].values)

# Every country without population data must be covered by the supplement.
# A `>` comparison between the two sets only fires when the missing set is a
# proper superset of the supplement; it stays silent when the sets merely
# overlap, so test the set difference instead.
still_missing = missing_countries - supplement_countries
if still_missing:
    raise ValueError(f'Supplement lacks {still_missing}')

# Append only the rows whose ISO-3 code is not in `capita` yet, so re-running
# this cell cannot introduce duplicate codes (which would multiply rows in a
# later merge on 'iso3').
new_rows = supplement[~supplement['iso3'].isin(capita['iso3'])]
capita = pd.concat([capita, new_rows], ignore_index=True)
assert capita['iso3'].is_unique
capita.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   iso3        245 non-null    object
 1   population  245 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.0+ KB


None

## 3. Reports Per Country Per Capita

It's time to merge the two datasets into one and normalize the report counts by
population size. We use a left outer join on the reports per country to add in
the population size.

Let's look at the entries with a small population size.


In [5]:
# The left join keeps every row of `reports` and attaches its population.
normalized = reports.merge(capita, how='left', on='iso3')
is_tiny = normalized['population'] < 2_000
display(normalized.loc[is_tiny])


Unnamed: 0,country,iso3,reports,population
8,Antarctica,ATA,4,0
29,Bouvet Island,BVT,1,0
47,Christmas Island,CXR,1,1402
48,Cocos (Keeling) Islands,CCK,168,596
153,Netherlands Antilles,ANT,2,0
159,Niue,NIU,1,1957
218,Tokelau,TKL,4,1869
235,Vatican City,VAT,1,812





After removing rows with a population size of 0 (see
above), we add a new column, `rpc`, for RPC2: reports per country per capita.

Now that we have comparable per-country data, let's sort by RPC2 and print
out the 20 worst offenders.

In [6]:
# Attach population to the reports, drop zero-population entries (they would
# divide by zero), and compute reports per capita. Building the frame in a
# single chain with `.assign` avoids writing a column into a filtered slice,
# which makes pandas emit a SettingWithCopyWarning.
rpcpc = (
    pd.merge(reports, capita, how='left', on='iso3')
    .loc[lambda df: df['population'] > 0]
    .assign(rpc=lambda df: df['reports'] / df['population'])
    .sort_values(by=['rpc'], ascending=False)
)
# Re-index from 1 so that the index doubles as the rank.
rpcpc.index = pd.RangeIndex(start=1, stop=len(rpcpc) + 1, step=1)

display(rpcpc.head(20))
px.bar(rpcpc['rpc'], title='Reports per capita by rank').show()

Unnamed: 0,country,iso3,reports,population,rpc
1,Cocos (Keeling) Islands,CCK,168,596,0.281879
2,Libya,LBY,270811,6735280,0.040208
3,United Arab Emirates,ARE,327820,9365149,0.035004
4,Iraq,IRQ,1220470,43533592,0.028035
5,Philippines,PHL,3188793,113880336,0.028001
6,Qatar,QAT,73536,2688239,0.027355
7,Algeria,DZA,1171653,44177964,0.026521
8,Cambodia,KHM,395793,16589031,0.023859
9,Bahrain,BHR,32346,1463266,0.022105
10,Belize,BLZ,8297,400037,0.020741


Whoa! Given the small number of reports and small population size, the outlier for
the Cocos Islands is suspect. The group of 27 tiny islands in the Indian Ocean is also
so small that it won't even show on a map of the region. It's best to remove the row
from our dataset.

In [7]:
# Drop the Cocos (Keeling) Islands outlier. The explicit copy makes `rpcpc` an
# independent frame, so columns can be added to it afterwards without pandas
# emitting a SettingWithCopyWarning.
rpcpc = rpcpc[rpcpc['iso3'] != 'CCK'].copy()
px.bar(rpcpc['rpc'], title='Reports per capita by rank (without CCK)').show()

That's better. But it still isn't particularly suitable for visualization in a
choropleth, since the mapping from the per-country metric to color is linear.
Let's fix that by taking the base-10 logarithm of the metric.

In [9]:
# Add the base-10 logarithm of the per-capita rate. `.assign` returns a new
# frame instead of writing into a previously filtered one, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
# NOTE(review): a row with rpc == 0 would become -inf here; confirm that no
# country in the data has zero reports.
rpcpc = rpcpc.assign(logrpc=np.log10(rpcpc['rpc']))
px.bar(rpcpc['logrpc'], title='log10 of reports per capita by rank').show()

### 3.3 Check the Choropleth

In [None]:
# Hover text: reports per 10,000 inhabitants. `rpc` is reports / population,
# so it is scaled by 10,000 before rounding. The labels live in a local Series
# rather than a new `rpcpc` column, so this cell does not mutate `rpcpc`.
hover_labels = (
    (rpcpc['rpc'] * 10_000).round(0).astype(int).astype(str)
    + ' reports<br>per 10,000 people'
)

fig = go.Figure(
    data=go.Choropleth(
        locations=rpcpc['iso3'],
        locationmode='ISO-3',
        z=rpcpc['logrpc'],
        colorscale='plasma_r',
        text=hover_labels,
        customdata=rpcpc['country'],
        hovertemplate='%{text}<extra><b>%{location}</b><br>%{customdata}</extra>',
        # A Choropleth trace has no `title` property; the caption belongs to
        # the colorbar instead.
        colorbar=dict(
            title=dict(text='CSAM Reports<br>Per Capita (log10)'),
            thickness=20,
            xpad=15,
            y=0.47,
            ypad=40,
        ),
    ),
    layout=dict(
        margin={"r":0,"t":0,"l":0,"b":0},
        geo=dict(
           showcountries=True,
           showframe=False,
           showlakes=False,
           lataxis_range=[-60,90],
        ),
    ),
)
fig.show()