## Fetch the data

For now, we'll just fetch 1980-2000, since we can't store the remainder of it yet.

In [1]:
import os
import subprocess
import urllib.request

import pandas as pd
from tqdm import tqdm
import numpy as np

from config import DATA_DIR

In [2]:
links = {
    1980: 'https://www.dropbox.com/scl/fi/gdvpzggkb0in9yruircpi/contribDB_1980.csv.gz?rlkey=rs07632m813k3g85ndek1z16g&dl=1',
    1982: 'https://www.dropbox.com/scl/fi/27xvy2hz0r4qxk8scnquh/contribDB_1982.csv.gz?rlkey=l45cyb4zzlr8xo74un5lazs58&dl=1',
    1984: 'https://www.dropbox.com/scl/fi/ll39jbojud23hlvqrtejo/contribDB_1984.csv.gz?rlkey=2wa2khacau0nofzbcakm0y41g&dl=1',
    1986: 'https://www.dropbox.com/scl/fi/dygk66slka6k2kkut2qhr/contribDB_1986.csv.gz?rlkey=8gnic6alr13e02f49rl3bak07&dl=1',
    1988: 'https://www.dropbox.com/scl/fi/bjf3f5ol3m6zgx0pf46h3/contribDB_1988.csv.gz?rlkey=tenj07ipi5w5fe804b8wp7g3b&dl=1',
    1990: 'https://www.dropbox.com/scl/fi/32oz76r1s0rg1f65sokki/contribDB_1990.csv.gz?rlkey=63a7r895m3bgam2eeg3ei1ox2&dl=1',
    1992: 'https://www.dropbox.com/scl/fi/oqdezk0qhlcwu0pg2w54u/contribDB_1992.csv.gz?rlkey=3wy10l308vpfhdoteim95lt36&dl=1',
    1994: 'https://www.dropbox.com/scl/fi/j8c57z43lg7byvhchaajq/contribDB_1994.csv.gz?rlkey=e1y4hz7x36vw26hlvvytmxp13&dl=1',
    1996: 'https://www.dropbox.com/scl/fi/5sptswri21u1z6nyc2zzw/contribDB_1996.csv.gz?rlkey=ar2wx5f05p7f38hzt4o52rngg&dl=1',
    1998: 'https://www.dropbox.com/scl/fi/c3bdwy7v527lrx68ghkvr/contribDB_1998.csv.gz?rlkey=y9jfa2e7tjd0yc5w2x227loy8&dl=1'
}

In [3]:
# Fetch the data and unzip into the data directory

total_storage = 0

for year, url in links.items():
    filename = f'{year}_contributions.csv.gz'
    filepath = os.path.join(DATA_DIR, filename)

    if os.path.exists(filepath):
        print(f'{filename} already exists, skipping...')
        continue

    print(f'Downloading {filename}...')
    with urllib.request.urlopen(url) as response, open(filepath, 'wb') as out_file:
        data = response.read()
        out_file.write(data)

    print(f'Unzipping {filename}...')
    subprocess.run(['gunzip', filepath])

    total_storage += os.path.getsize(filepath.split('.gz')[0])

print(f'Total storage used: {total_storage / 1024 / 1024:.2f} MB')

Downloading 1980_contributions.csv.gz...
Unzipping 1980_contributions.csv.gz...
Downloading 1982_contributions.csv.gz...
Unzipping 1982_contributions.csv.gz...
Downloading 1984_contributions.csv.gz...
Unzipping 1984_contributions.csv.gz...
Downloading 1986_contributions.csv.gz...
Unzipping 1986_contributions.csv.gz...
Downloading 1988_contributions.csv.gz...
Unzipping 1988_contributions.csv.gz...
Downloading 1990_contributions.csv.gz...
Unzipping 1990_contributions.csv.gz...
Downloading 1992_contributions.csv.gz...
Unzipping 1992_contributions.csv.gz...
Downloading 1994_contributions.csv.gz...
Unzipping 1994_contributions.csv.gz...
Downloading 1996_contributions.csv.gz...
Unzipping 1996_contributions.csv.gz...
Downloading 1998_contributions.csv.gz...
Unzipping 1998_contributions.csv.gz...
Total storage used: 4324.75 MB


In [4]:
sample_size = 0.01
output_file_path = os.path.join(DATA_DIR, 'sample_contributions.csv')

# Determine whether to write the header; initially, True
write_header = True

for year in tqdm(links.keys()):
    filename = f'{year}_contributions.csv'
    filepath = os.path.join(DATA_DIR, filename)
    print(f'Loading {filename}...')

    # First, determine the number of rows in the file (excluding header)
    total_rows = sum(1 for row in open(filepath, 'r', encoding='utf-8')) - 1

    # Calculate the number of rows to sample
    sample_rows = int(total_rows * sample_size)

    # Generate random row indices to skip
    skiprows = sorted(np.random.choice(np.arange(1, total_rows+1), size=(total_rows-sample_rows), replace=False))
    
    # Use skiprows in read_csv to load the random sample directly
    sample_data = pd.read_csv(filepath, skiprows=skiprows)

    # Append the sample to the output file. Use header only for the first file
    sample_data.to_csv(output_file_path, mode='a', index=False, header=write_header)
    
    # After the first write, disable header for subsequent writes
    write_header = False

print("Sampling completed and saved to 'sample_contributions.csv'.")

  0%|          | 0/10 [00:00<?, ?it/s]

Loading 1980_contributions.csv...


 10%|█         | 1/10 [00:01<00:17,  1.97s/it]

Loading 1982_contributions.csv...


 20%|██        | 2/10 [00:03<00:13,  1.66s/it]

Loading 1984_contributions.csv...


 30%|███       | 3/10 [00:05<00:14,  2.02s/it]

Loading 1986_contributions.csv...


 40%|████      | 4/10 [00:09<00:15,  2.54s/it]

Loading 1988_contributions.csv...


 50%|█████     | 5/10 [00:12<00:13,  2.72s/it]

Loading 1990_contributions.csv...


 60%|██████    | 6/10 [00:17<00:14,  3.59s/it]

Loading 1992_contributions.csv...


 70%|███████   | 7/10 [00:23<00:13,  4.47s/it]

Loading 1994_contributions.csv...


  sample_data = pd.read_csv(filepath, skiprows=skiprows)
 80%|████████  | 8/10 [00:31<00:10,  5.41s/it]

Loading 1996_contributions.csv...


 90%|█████████ | 9/10 [00:44<00:08,  8.02s/it]

Loading 1998_contributions.csv...


  sample_data = pd.read_csv(filepath, skiprows=skiprows)
100%|██████████| 10/10 [01:15<00:00,  7.58s/it]

Sampling completed and saved to 'sample_contributions.csv'.





The following issues arose when processing data: 
`DtypeWarning: Columns (11,12,25,29,39) have mixed types. Specify dtype option on import or set low_memory=False.
  sample_data = pd.read_csv(filepath, skiprows=skiprows)`

`DtypeWarning: Columns (37,39,43) have mixed types. Specify dtype option on import or set low_memory=False.
  sample_data = pd.read_csv(filepath, skiprows=skiprows)`
