## Fetch the data

For now, we'll just fetch 1980-2000, since we can't store the remainder of it locally.

In [2]:
import os
import subprocess
import urllib.request

import pandas as pd
from tqdm import tqdm
import numpy as np

from config import DATA_DIR

In [5]:
links = {
    1980: 'https://www.dropbox.com/scl/fi/gdvpzggkb0in9yruircpi/contribDB_1980.csv.gz?rlkey=rs07632m813k3g85ndek1z16g&dl=1',
    1982: 'https://www.dropbox.com/scl/fi/27xvy2hz0r4qxk8scnquh/contribDB_1982.csv.gz?rlkey=l45cyb4zzlr8xo74un5lazs58&dl=1',
    1984: 'https://www.dropbox.com/scl/fi/ll39jbojud23hlvqrtejo/contribDB_1984.csv.gz?rlkey=2wa2khacau0nofzbcakm0y41g&dl=1',
    1986: 'https://www.dropbox.com/scl/fi/dygk66slka6k2kkut2qhr/contribDB_1986.csv.gz?rlkey=8gnic6alr13e02f49rl3bak07&dl=1',
    1988: 'https://www.dropbox.com/scl/fi/bjf3f5ol3m6zgx0pf46h3/contribDB_1988.csv.gz?rlkey=tenj07ipi5w5fe804b8wp7g3b&dl=1',
    1990: 'https://www.dropbox.com/scl/fi/32oz76r1s0rg1f65sokki/contribDB_1990.csv.gz?rlkey=63a7r895m3bgam2eeg3ei1ox2&dl=1',
    1992: 'https://www.dropbox.com/scl/fi/oqdezk0qhlcwu0pg2w54u/contribDB_1992.csv.gz?rlkey=3wy10l308vpfhdoteim95lt36&dl=1',
    1994: 'https://www.dropbox.com/scl/fi/j8c57z43lg7byvhchaajq/contribDB_1994.csv.gz?rlkey=e1y4hz7x36vw26hlvvytmxp13&dl=1',
    1996: 'https://www.dropbox.com/scl/fi/5sptswri21u1z6nyc2zzw/contribDB_1996.csv.gz?rlkey=ar2wx5f05p7f38hzt4o52rngg&dl=1',
    1998: 'https://www.dropbox.com/scl/fi/c3bdwy7v527lrx68ghkvr/contribDB_1998.csv.gz?rlkey=y9jfa2e7tjd0yc5w2x227loy8&dl=1'
}

In [4]:
# Fetch the data and unzip into the data directory
total_storage = 0

for year, url in links.items():
    filename = f'{year}_contributions.csv.gz'
    filepath = os.path.join(DATA_DIR, filename)

    if os.path.exists(filepath):
        print(f'{filename} already exists, skipping...')
        continue

    print(f'Downloading {filename}...')
    with urllib.request.urlopen(url) as response, open(filepath, 'wb') as out_file:
        data = response.read()
        out_file.write(data)

    print(f'Unzipping {filename}...')
    subprocess.run(['gunzip', filepath])

    total_storage += os.path.getsize(filepath.split('.gz')[0])

print(f'Total storage used: {total_storage / 1024 / 1024:.2f} MB')

Downloading 1980_contributions.csv.gz...
Unzipping 1980_contributions.csv.gz...
Downloading 1982_contributions.csv.gz...


gunzip: /Users/aaron/Documents/GitHub/donorAskAmount/data/1980_contributions.csv already exists -- skipping


Unzipping 1982_contributions.csv.gz...
Downloading 1984_contributions.csv.gz...


gunzip: /Users/aaron/Documents/GitHub/donorAskAmount/data/1982_contributions.csv already exists -- skipping


Unzipping 1984_contributions.csv.gz...
Downloading 1986_contributions.csv.gz...


gunzip: /Users/aaron/Documents/GitHub/donorAskAmount/data/1984_contributions.csv already exists -- skipping


Unzipping 1986_contributions.csv.gz...
Downloading 1988_contributions.csv.gz...


gunzip: /Users/aaron/Documents/GitHub/donorAskAmount/data/1986_contributions.csv already exists -- skipping


Unzipping 1988_contributions.csv.gz...
Downloading 1990_contributions.csv.gz...
Unzipping 1990_contributions.csv.gz...
Downloading 1992_contributions.csv.gz...
Unzipping 1992_contributions.csv.gz...
Downloading 1994_contributions.csv.gz...
Unzipping 1994_contributions.csv.gz...
Downloading 1996_contributions.csv.gz...
Unzipping 1996_contributions.csv.gz...
Downloading 1998_contributions.csv.gz...
Unzipping 1998_contributions.csv.gz...
Total storage used: 4324.75 MB


In [6]:
# Now, delete all the .gz files
for year in links.keys():
    filepath = os.path.join(DATA_DIR, f'{year}_contributions.csv.gz')
    try:
        os.remove(filepath)
    except FileNotFoundError:
        pass

In [27]:
# Read the files and ensure the columns all match up, and we've matched the right columns
columns: list[str] = []

for year in links.keys():
    filepath = os.path.join(DATA_DIR, f'{year}_contributions.csv')
    df = pd.read_csv(filepath, nrows=1)
    
    if not columns:
        columns = df.columns.tolist()
    else:
        assert columns == df.columns.tolist(), f'Columns do not match for {year}'

# Print the head from the first file
df = pd.read_csv(os.path.join(DATA_DIR, '1980_contributions.csv'), low_memory=False)
df.head()

Unnamed: 0,cycle,transaction.id,transaction.type,amount,date,bonica.cid,contributor.name,contributor.lname,contributor.fname,contributor.mname,...,efec.memo,efec.memo2,efec.transaction.id.orig,bk.ref.transaction.id,efec.org.orig,efec.comid.orig,efec.form.type,excluded.from.scaling,contributor.cfscore,candidate.cfscore
0,1980,comm:1980:1,24K,100.0,1980-08-19,100055585.0,UFWA COPE COMMITTEE,,,,...,,,,,,,,0,-0.54,-0.67
1,1980,comm:1980:10,24K,250.0,1980-02-19,100046784.0,INTERNATIONAL CHIROPRACTORS POLITICAL ACTION C...,,,,...,,,,,,,,1,-0.23,-0.62
2,1980,comm:1980:100,24Z,375.0,1980-09-19,100058265.0,AMERICANS FOR CONSTITUTIONAL ACTION ACA,,,,...,,,,,,,,1,0.91,0.89
3,1980,comm:1980:1000,24K,200.0,1980-07-19,100031580.0,IBPAT POLITICAL ACTION TOGETHER POLITICAL COMM,,,,...,,,,,,,,0,-0.56,-0.74
4,1980,comm:1980:10000,24K,300.0,1980-06-19,100031424.0,AMERICAN BANKERS ASSOCIATION BANKPAC,,,,...,,,,,,,,1,0.42,1.19


## Sample Creation

Next, we'll create a sample of the data to work with. We would like to sample over the contributors, not over the contributions, so we'll simply iterate through the finals and collect all the IDs. Then, we'll take a random sample of those IDs and collect the corresponding contributions. We do this to save memory, instead of loading all the contributions in at once.

In [6]:
SAMPLE_SIZE = 0.01

contributor_ids: set[str] = set()

for year in links.keys():
    filename = f'{year}_contributions.csv'
    filepath = os.path.join(DATA_DIR, filename)

    print(f'Processing {filename}...')
    df = pd.read_csv(filepath, usecols=['bonica.cid'], dtype=str)
    contributor_ids.update(df['bonica.cid'].unique())

Processing 1980_contributions.csv...
Processing 1982_contributions.csv...
Processing 1984_contributions.csv...
Processing 1986_contributions.csv...
Processing 1988_contributions.csv...
Processing 1990_contributions.csv...
Processing 1992_contributions.csv...
Processing 1994_contributions.csv...
Processing 1996_contributions.csv...
Processing 1998_contributions.csv...


In [7]:
print(f"There are {len(contributor_ids)} unique contributors")

There are 4691382 unique contributors


In [8]:
# Sample the data
sampled_ids: set[str] = set(np.random.choice(list(contributor_ids), size=int(len(contributor_ids) * SAMPLE_SIZE), replace=False))

print(f"Selected {len(sampled_ids)} unique contributors.\n")

print(f"The first 5 sampled IDs are: {list(sampled_ids)[:5]}, of type {type(list(sampled_ids)[0])}")

Selected 46913 unique contributors.

The first 5 sampled IDs are: ['5000004123327978', '2376688058', '2587815079', '104092271', '5000003612205559'], of type <class 'numpy.str_'>


In [9]:
df = pd.read_csv(os.path.join(DATA_DIR, f'1980_contributions.csv'), low_memory=False)
df.head()

Unnamed: 0,cycle,transaction.id,transaction.type,amount,date,bonica.cid,contributor.name,contributor.lname,contributor.fname,contributor.mname,...,efec.memo,efec.memo2,efec.transaction.id.orig,bk.ref.transaction.id,efec.org.orig,efec.comid.orig,efec.form.type,excluded.from.scaling,contributor.cfscore,candidate.cfscore
0,1980,comm:1980:1,24K,100.0,1980-08-19,100055585.0,UFWA COPE COMMITTEE,,,,...,,,,,,,,0,-0.54,-0.67
1,1980,comm:1980:10,24K,250.0,1980-02-19,100046784.0,INTERNATIONAL CHIROPRACTORS POLITICAL ACTION C...,,,,...,,,,,,,,1,-0.23,-0.62
2,1980,comm:1980:100,24Z,375.0,1980-09-19,100058265.0,AMERICANS FOR CONSTITUTIONAL ACTION ACA,,,,...,,,,,,,,1,0.91,0.89
3,1980,comm:1980:1000,24K,200.0,1980-07-19,100031580.0,IBPAT POLITICAL ACTION TOGETHER POLITICAL COMM,,,,...,,,,,,,,0,-0.56,-0.74
4,1980,comm:1980:10000,24K,300.0,1980-06-19,100031424.0,AMERICAN BANKERS ASSOCIATION BANKPAC,,,,...,,,,,,,,1,0.42,1.19


In [16]:
output_file_path = os.path.join(DATA_DIR, 'sample_contributions.csv')

sampled_contributions = pd.DataFrame()
columns: list[str] = []

# Determine whether to write the header; initially, True
write_header = True

matches_per_cycle = {}

for year in tqdm(links.keys()):
    filename = f'{year}_contributions.csv'
    filepath = os.path.join(DATA_DIR, filename)

    df = pd.read_csv(filepath, low_memory=False, dtype={'bonica.cid': str})

    # If the columns list is empty, populate it with the columns from the first dataframe
    if not columns:
        columns = df.columns.tolist()
        sampled_contributions = pd.DataFrame(columns=columns)
    else:
        assert columns == df.columns.tolist(), 'Columns do not match'

    # Filter the dataframe to only include the sampled contributors
    df = df[df['bonica.cid'].isin(sampled_ids)]

    matches_per_cycle[year] = len(df)

    # Append the filtered dataframe to the sampled_contributions dataframe
    sampled_contributions = pd.concat([sampled_contributions, df], ignore_index=True)

# Write the sampled_contributions dataframe to a CSV file
sampled_contributions.to_csv(output_file_path, index=False)

print("\nSampling completed and saved to 'sample_contributions.csv'.")
for year, matches in matches_per_cycle.items():
    print(f"Matched {matches} contributions from {year}")

  sampled_contributions = pd.concat([sampled_contributions, df], ignore_index=True)
  sampled_contributions = pd.concat([sampled_contributions, df], ignore_index=True)
100%|██████████| 10/10 [02:57<00:00, 17.75s/it]



Sampling completed and saved to 'sample_contributions.csv'.
Matched 3909 contributions from 1980
Matched 2512 contributions from 1982
Matched 3498 contributions from 1984
Matched 4186 contributions from 1986
Matched 5426 contributions from 1988
Matched 9730 contributions from 1990
Matched 13986 contributions from 1992
Matched 16430 contributions from 1994
Matched 30427 contributions from 1996
Matched 57378 contributions from 1998


In [17]:
print(f"The {len(sampled_ids)} sampled contributors had a total of {len(sampled_contributions)} contributions.")

The 46913 sampled contributors had a total of 147482 contributions.
