In [1]:
import tempfile
import zipfile
import datetime
import json
import pathlib
import operator
import collections

import pandas as pd

from google_fetch import config


In [2]:
def read_bundle(bundle_path):
    fetch_datetime = datetime.datetime.strptime(
        bundle_path.stem,
        'google-political-ads-transparency-bundle_%Y-%m-%d_%H-%M-%S',
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_dir = pathlib.Path(tmpdirname)

        with zipfile.ZipFile(bundle_path, "r") as zip_ref:
            zip_ref.extractall(tmpdirname)

            assert set([
                element.name
                for element in tmp_dir.iterdir()
            ]) == {'google-political-ads-transparency-bundle'}

            data_dir = tmp_dir / 'google-political-ads-transparency-bundle'
            assert set([
                element.name
                for element in data_dir.iterdir()
            ]) == {
                'README.txt',
                'google-political-ads-advertiser-stats.csv',
                'google-political-ads-advertiser-weekly-spend.csv',
                'google-political-ads-campaign-targeting.csv',
                'google-political-ads-creative-stats.csv',
                'google-political-ads-geo-spend.csv',
                'google-political-ads-top-keywords-history.csv',
            }

            bundle_data = {}

            with open(data_dir / 'README.txt', 'r') as f:
                bundle_data['README.txt'] = f.read()

            for key in [
                'google-political-ads-advertiser-stats.csv',
                'google-political-ads-advertiser-weekly-spend.csv',
                'google-political-ads-campaign-targeting.csv',
                'google-political-ads-creative-stats.csv',
                'google-political-ads-geo-spend.csv',
                'google-political-ads-top-keywords-history.csv',
            ]:
                bundle_data[key] = pd.read_csv(
                    data_dir / key,
                    dtype=str,
                    keep_default_na=False,
                    na_values={},
                )

    return fetch_datetime, bundle_data

In [3]:
for bundle_path in sorted(list((config.DATA_DIR / 'google').iterdir())):
    print(bundle_path)

/home/michel/projects/desinfo/political-ads-scraper/data/google/1
/home/michel/projects/desinfo/political-ads-scraper/data/google/2
/home/michel/projects/desinfo/political-ads-scraper/data/google/google-political-ads-transparency-bundle_2019-06-14_14-32-03.zip
/home/michel/projects/desinfo/political-ads-scraper/data/google/google-political-ads-transparency-bundle_2019-06-18_14-18-01.zip


In [4]:
bundle_path_1 = pathlib.Path('/home/michel/projects/desinfo/political-ads-scraper/data/google/google-political-ads-transparency-bundle_2019-06-14_14-32-03.zip')
bundle_path_2 = pathlib.Path('/home/michel/projects/desinfo/political-ads-scraper/data/google/google-political-ads-transparency-bundle_2019-06-18_14-18-01.zip')
fetch_datetime_1, bundle_data_1 = read_bundle(bundle_path_1)
fetch_datetime_2, bundle_data_2 = read_bundle(bundle_path_2)


In [5]:
bundle_data_1['README.txt'] == bundle_data_2['README.txt']

True

In [6]:
key = 'google-political-ads-creative-stats.csv'
df_1 = bundle_data_1[key]
df_2 = bundle_data_2[key]

assert list(df_1.columns) == list(df_2.columns)

# A few ads were run by distinct advertisers, for instance:
# https://transparencyreport.google.com/political-ads/advertiser/AR185084640594231296/creative/CR275167628257918976
# https://transparencyreport.google.com/political-ads/advertiser/AR242125929453715456/creative/CR275167628257918976
duplicate_ad_ids_1 = [
    ad_id
    for ad_id, count in collections.Counter(df_1['Ad_ID']).items()
    if count > 1
]
duplicate_ad_ids_2 = [
    ad_id
    for ad_id, count in collections.Counter(df_2['Ad_ID']).items()
    if count > 1
]
assert set(duplicate_ad_ids_1) == set(duplicate_ad_ids_2)

print('The following ads were run by several advertisers: {}'.format(duplicate_ad_ids_1))


indexes = ['Ad_ID', 'Advertiser_ID']
df_1.set_index(indexes, inplace=True)
df_2.set_index(indexes, inplace=True)

ids_1 = set(df_1.index)
ids_2 = set(df_2.index)

assert len(ids_1) == len(df_1)
assert len(ids_2) == len(df_2)

removed_ids = ids_1 - ids_2
added_ids = ids_2 - ids_1
common_ids = ids_1 & ids_2
assert len(ids_1) + len(added_ids) - len(removed_ids) == len(ids_2)
assert len(ids_1 | ids_2) == len(common_ids) + len(added_ids) + len(removed_ids)

print('{} ads were removed, {} were added'.format(len(removed_ids), len(added_ids)))

assert len(removed_ids) == len(added_ids)

assert set(map(operator.itemgetter(0), removed_ids)) == set(map(operator.itemgetter(0), added_ids))

# For 65 ads, neither the old nor the add can be viewed:
# https://transparencyreport.google.com/political-ads/advertiser/AR88014531413934080/creative/CR100348028220604416
# https://transparencyreport.google.com/political-ads/advertiser/AR520360095641501696/creative/CR103736173301596160

# Same for the advertisers:
# https://transparencyreport.google.com/political-ads/advertiser/AR88014531413934080
# https://transparencyreport.google.com/political-ads/advertiser/AR520360095641501696

for removed_id, added_id in zip(sorted(removed_ids), sorted(added_ids)):
    print('{}\n    by {}\nwas replaced by\n{}    by {}\n'.format(
        'https://transparencyreport.google.com/political-ads/advertiser/{}/creative/{}'.format(removed_id[1], removed_id[0]),
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(removed_id[1]),
        'https://transparencyreport.google.com/political-ads/advertiser/{}/creative/{}'.format(added_id[1], added_id[0]),
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(added_id[1]),
    ))

for line_id in list(common_ids):
    modifications = df_1.loc[line_id] != df_2.loc[line_id]
    modified_columns = list(modifications[modifications].axes[0])
    if modified_columns:
        print('Ad {} has be modified'.format(line_id))


The following ads were run by several advertisers: ['CR100007454493900800', 'CR101180908278644736', 'CR102860962045886464', 'CR106855212911689728', 'CR109616361486942208', 'CR110281291143839744', 'CR110562766120550400', 'CR11307652457955328', 'CR114455037282877440', 'CR114762075904933888', 'CR115985557468741632', 'CR117423718677872640', 'CR117779135811551232', 'CR118905035718393856', 'CR119240111886958592', 'CR120180469206614016', 'CR120593885578657792', 'CR121438310508789760', 'CR123549372834119680', 'CR123954817746862080', 'CR124112322787540992', 'CR124777252444438528', 'CR130445509763530752', 'CR13466268661186560', 'CR135485671065255936', 'CR136919434227875840', 'CR137200909204586496', 'CR137623121669652480', 'CR138872991512526848', 'CR140069260163547136', 'CR140139628907724800', 'CR140421103884435456', 'CR146568748273238016', 'CR146895028348780544', 'CR147048959976669184', 'CR147193270877814784', 'CR151328259231973376', 'CR154652357760647168', 'CR156200470132555776', 'CR15752070871

65 ads were removed, 65 were added
https://transparencyreport.google.com/political-ads/advertiser/AR459033735090667520/creative/CR100348028220604416
    by https://transparencyreport.google.com/political-ads/advertiser/AR459033735090667520
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR88014531413934080/creative/CR100348028220604416    by https://transparencyreport.google.com/political-ads/advertiser/AR88014531413934080

https://transparencyreport.google.com/political-ads/advertiser/AR8673772353617920/creative/CR103736173301596160
    by https://transparencyreport.google.com/political-ads/advertiser/AR8673772353617920
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR520360095641501696/creative/CR103736173301596160    by https://transparencyreport.google.com/political-ads/advertiser/AR520360095641501696

https://transparencyreport.google.com/political-ads/advertiser/AR8673772353617920/creative/CR10541567731302400
    by h

In [7]:
key = 'google-political-ads-advertiser-stats.csv'
df_1 = bundle_data_1[key]
df_2 = bundle_data_2[key]

assert list(df_1.columns) == list(df_2.columns)

indexes = 'Advertiser_ID'
df_1.set_index(indexes, inplace=True)
df_2.set_index(indexes, inplace=True)
ids_1 = set(df_1.index)
ids_2 = set(df_2.index)

assert len(ids_1) == len(df_1)
assert len(ids_2) == len(df_2)

removed_ids = ids_1 - ids_2
added_ids = ids_2 - ids_1
common_ids = ids_1 & ids_2
assert len(ids_1) + len(added_ids) - len(removed_ids) == len(ids_2)
assert len(ids_1 | ids_2) == len(common_ids) + len(added_ids) + len(removed_ids)

print('{} advertisers were removed, {} were added'.format(len(removed_ids), len(added_ids)))
assert len(removed_ids) == len(added_ids)

for removed_id, added_id in zip(sorted(removed_ids), sorted(added_ids)):
    print('{}\nwas replaced by\n{}\n'.format(
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(removed_id),
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(added_id),
    ))

for line_id in list(common_ids):
    modifications = df_1.loc[line_id] != df_2.loc[line_id]
    modified_columns = list(modifications[modifications].axes[0])
    if modified_columns:
        print('Line {} has be modified'.format(line_id))


12 advertisers were removed, 12 were added
https://transparencyreport.google.com/political-ads/advertiser/AR152788960429473792
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR124043328432898048

https://transparencyreport.google.com/political-ads/advertiser/AR188817757448437760
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR142057726942380032

https://transparencyreport.google.com/political-ads/advertiser/AR206832155957919744
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR160072125451862016

https://transparencyreport.google.com/political-ads/advertiser/AR224846554467401728
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR178086523961344000

https://transparencyreport.google.com/political-ads/advertiser/AR260875351486365696
was replaced by
https://transparencyreport.google.com/political-ads/advertiser/AR322201712037199872

https://transparencyreport.googl

In [8]:

key = 'google-political-ads-advertiser-weekly-spend.csv'
df_1 = bundle_data_1[key]
df_2 = bundle_data_2[key]

assert list(df_1.columns) == list(df_2.columns)

indexes = ['Advertiser_ID', 'Week_Start_Date']
df_1.set_index(indexes, inplace=True)
df_2.set_index(indexes, inplace=True)

ids_1 = set(df_1.index)
ids_2 = set(df_2.index)

assert len(ids_1) == len(df_1)
assert len(ids_2) == len(df_2)

removed_ids = ids_1 - ids_2
added_ids = ids_2 - ids_1
common_ids = ids_1 & ids_2
assert len(ids_1) + len(added_ids) - len(removed_ids) == len(ids_2)
assert len(ids_1 | ids_2) == len(common_ids) + len(added_ids) + len(removed_ids)

print('{} lines were removed, {} were added'.format(len(removed_ids), len(added_ids)))
assert len(removed_ids) == len(added_ids)

old_advertisers = set(map(operator.itemgetter(0), removed_ids))
new_advertisers = set(map(operator.itemgetter(0), added_ids))

assert not old_advertisers & new_advertisers

print('Lines have been removed about the following advertisers:\n')
for advertiser_id in old_advertisers:
    print('{} ({})\n'.format(
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(advertiser_id),
        [
            removed_id[1]
            for removed_id in removed_ids
            if removed_id[0] == advertiser_id
        ]
    ))

print('Lines have been added about the following advertisers:\n')
for advertiser_id in new_advertisers:
    print('{} ({})\n'.format(
        'https://transparencyreport.google.com/political-ads/advertiser/{}'.format(advertiser_id),
        [
            added_id[1]
            for added_id in added_ids
            if added_id[0] == advertiser_id
        ]
    ))

for line_id in list(common_ids):
    modifications = df_1.loc[line_id] != df_2.loc[line_id]
    modified_columns = list(modifications[modifications].axes[0])
    if modified_columns:
        print('Line {} has be modified'.format(line_id))


72 lines were removed, 72 were added
Lines have been removed about the following advertisers:

https://transparencyreport.google.com/political-ads/advertiser/AR206832155957919744 (['2018-11-04', '2018-10-28'])

https://transparencyreport.google.com/political-ads/advertiser/AR44702569372581888 (['2019-02-17', '2019-02-24', '2019-03-03', '2019-02-10', '2019-03-10'])

https://transparencyreport.google.com/political-ads/advertiser/AR224846554467401728 (['2018-10-28', '2018-11-04'])

https://transparencyreport.google.com/political-ads/advertiser/AR8673772353617920 (['2018-10-21', '2018-09-30', '2018-09-16', '2018-10-07', '2018-09-23', '2018-10-14', '2018-10-28'])

https://transparencyreport.google.com/political-ads/advertiser/AR368961742543257600 (['2019-05-05', '2019-05-12'])

https://transparencyreport.google.com/political-ads/advertiser/AR477048133600149504 (['2018-11-04', '2018-11-11'])

https://transparencyreport.google.com/political-ads/advertiser/AR152788960429473792 (['2018-09-23', 

In [9]:
key = 'google-political-ads-campaign-targeting.csv'
df_1 = bundle_data_1[key]
df_2 = bundle_data_2[key]
assert list(df_1.columns) == list(df_2.columns)

duplicate_campaign_ids_1 = [
    campaign_id
    for campaign_id, count in collections.Counter(df_1['Campaign_ID']).items()
    if count > 1
]
duplicate_campaign_ids_2 = [
    campaign_id
    for campaign_id, count in collections.Counter(df_2['Campaign_ID']).items()
    if count > 1
]
assert set(duplicate_campaign_ids_1) == set(duplicate_campaign_ids_2)

print('The following campaigns were run by several advertisers: {}'.format(duplicate_campaign_ids_1))

indexes = ['Campaign_ID', 'Advertiser_ID']
df_1.set_index(indexes, inplace=True)
df_2.set_index(indexes, inplace=True)

ids_1 = set(df_1.index)
ids_2 = set(df_2.index)

assert len(ids_1) == len(df_1)
assert len(ids_2) == len(df_2)

removed_ids = ids_1 - ids_2
added_ids = ids_2 - ids_1
common_ids = ids_1 & ids_2
assert len(ids_1) + len(added_ids) - len(removed_ids) == len(ids_2)
assert len(ids_1 | ids_2) == len(common_ids) + len(added_ids) + len(removed_ids)

print('{} campaigns were removed, {} were added'.format(len(removed_ids), len(added_ids)))

assert len(removed_ids) == 0

# Campaign data can change. For example, End_Date or Gender_Targeting...

The following campaigns were run by several advertisers: ['CA102091303906443264', 'CA12561095713619968', 'CA129676676257808384', 'CA138683050878828544', 'CA168796475340357632', 'CA172438882485272576', 'CA186731709012639744', 'CA190399679802900480', 'CA20351410474319872', 'CA22429487450816512', 'CA239086054680821760', 'CA246748276336885760', 'CA267875667142508544', 'CA271464473095569408', 'CA274551901746364416', 'CA288141865465675776', 'CA288878538256285696', 'CA294897264906731520', 'CA301024568330420224', 'CA304169171585859584', 'CA311790161555881984', 'CA327975797350465536', 'CA328327641071353856', 'CA329242434745663488', 'CA345004208807739392', 'CA346688660621492224', 'CA34819609106317312', 'CA360199459503603712', 'CA360410565736136704', 'CA362455657363800064', 'CA364703059130974208', 'CA364876781968162816', 'CA366002681875005440', 'CA370034316136153088', 'CA373710258385715200', 'CA376226765623787520', 'CA388611664599056384', 'CA392461329685807104', 'CA399779679080284160', 'CA4122555

In [10]:
key = 'google-political-ads-geo-spend.csv'
df_1 = bundle_data_1[key]
df_2 = bundle_data_2[key]

assert list(df_1.columns) == list(df_2.columns)

indexes = ['Country', 'Country_Subdivision_Primary', 'Country_Subdivision_Secondary']
df_1.set_index(indexes, inplace=True)
df_2.set_index(indexes, inplace=True)
ids_1 = set(df_1.index)
ids_2 = set(df_2.index)

assert len(ids_1) == len(df_1) == len(ids_2) == len(df_2)

removed_ids = ids_1 - ids_2
added_ids = ids_2 - ids_1
common_ids = ids_1 & ids_2
assert len(removed_ids) == 0
assert len(added_ids) == 0

for line_id in list(common_ids):
    modifications = df_1.loc[line_id] != df_2.loc[line_id]
    modified_columns = list(modifications[modifications].axes[0])
    if modified_columns:
        print('Line {} has be modified'.format(line_id))