## Fetch the data

For now, we'll just fetch 1980-2000, since we can't store the remainder of it yet.

In [8]:
import os
import subprocess
import urllib.request

import pandas as pd
from tqdm import tqdm

from config import DATA_DIR

In [10]:
links = {
    1980: 'https://www.dropbox.com/scl/fi/gdvpzggkb0in9yruircpi/contribDB_1980.csv.gz?rlkey=rs07632m813k3g85ndek1z16g&dl=1',
    1982: 'https://www.dropbox.com/scl/fi/27xvy2hz0r4qxk8scnquh/contribDB_1982.csv.gz?rlkey=l45cyb4zzlr8xo74un5lazs58&dl=1',
    1984: 'https://www.dropbox.com/scl/fi/ll39jbojud23hlvqrtejo/contribDB_1984.csv.gz?rlkey=2wa2khacau0nofzbcakm0y41g&dl=1',
    1986: 'https://www.dropbox.com/scl/fi/dygk66slka6k2kkut2qhr/contribDB_1986.csv.gz?rlkey=8gnic6alr13e02f49rl3bak07&dl=1',
    1988: 'https://www.dropbox.com/scl/fi/bjf3f5ol3m6zgx0pf46h3/contribDB_1988.csv.gz?rlkey=tenj07ipi5w5fe804b8wp7g3b&dl=1',
    1990: 'https://www.dropbox.com/scl/fi/32oz76r1s0rg1f65sokki/contribDB_1990.csv.gz?rlkey=63a7r895m3bgam2eeg3ei1ox2&dl=1',
    1992: 'https://www.dropbox.com/scl/fi/oqdezk0qhlcwu0pg2w54u/contribDB_1992.csv.gz?rlkey=3wy10l308vpfhdoteim95lt36&dl=1',
    1994: 'https://www.dropbox.com/scl/fi/j8c57z43lg7byvhchaajq/contribDB_1994.csv.gz?rlkey=e1y4hz7x36vw26hlvvytmxp13&dl=1',
    1996: 'https://www.dropbox.com/scl/fi/5sptswri21u1z6nyc2zzw/contribDB_1996.csv.gz?rlkey=ar2wx5f05p7f38hzt4o52rngg&dl=1',
    1998: 'https://www.dropbox.com/scl/fi/c3bdwy7v527lrx68ghkvr/contribDB_1998.csv.gz?rlkey=y9jfa2e7tjd0yc5w2x227loy8&dl=1'
}

In [12]:
# Fetch the data and unzip into the data directory

total_storage = 0

for year, url in links.items():
    filename = f'{year}_contributions.csv.gz'
    filepath = os.path.join(DATA_DIR, filename)

    if os.path.exists(filepath):
        print(f'{filename} already exists, skipping...')
        continue

    print(f'Downloading {filename}...')
    with urllib.request.urlopen(url) as response, open(filepath, 'wb') as out_file:
        data = response.read()
        out_file.write(data)

    print(f'Unzipping {filename}...')
    subprocess.run(['gunzip', filepath])

    total_storage += os.path.getsize(filepath.split('.gz')[0])

print(f'Total storage used: {total_storage / 1024 / 1024:.2f} MB')

Downloading 1980_contributions.csv.gz...
Unzipping 1980_contributions.csv.gz...
Downloading 1982_contributions.csv.gz...
Unzipping 1982_contributions.csv.gz...
Downloading 1984_contributions.csv.gz...
Unzipping 1984_contributions.csv.gz...
Downloading 1986_contributions.csv.gz...
Unzipping 1986_contributions.csv.gz...
Downloading 1988_contributions.csv.gz...
Unzipping 1988_contributions.csv.gz...
Downloading 1990_contributions.csv.gz...
Unzipping 1990_contributions.csv.gz...
Downloading 1992_contributions.csv.gz...
Unzipping 1992_contributions.csv.gz...
Downloading 1994_contributions.csv.gz...
Unzipping 1994_contributions.csv.gz...
Downloading 1996_contributions.csv.gz...
Unzipping 1996_contributions.csv.gz...
Downloading 1998_contributions.csv.gz...
Unzipping 1998_contributions.csv.gz...
Total storage used: 4324.75 MB


In [13]:
# Take a 1% sample of the data and save it to a new file for EDA
sample_size = 0.01
sample_data = pd.DataFrame()
for year in tqdm(links.keys()):
    filename = f'{year}_contributions.csv'
    filepath = os.path.join(DATA_DIR, filename)
    print(f'Loading {filename}...')
    data = pd.read_csv(filepath)
    sample_data = pd.concat([sample_data, data.sample(frac=sample_size)])

sample_data.to_csv(os.path.join(DATA_DIR, 'sample_contributions.csv'), index=False)

  0%|          | 0/10 [00:00<?, ?it/s]

Loading 1980_contributions.csv...


  data = pd.read_csv(filepath)
 10%|█         | 1/10 [00:01<00:13,  1.55s/it]

Loading 1982_contributions.csv...


  data = pd.read_csv(filepath)
 20%|██        | 2/10 [00:02<00:10,  1.36s/it]

Loading 1984_contributions.csv...


  data = pd.read_csv(filepath)
 30%|███       | 3/10 [00:04<00:09,  1.38s/it]

Loading 1986_contributions.csv...


  data = pd.read_csv(filepath)
 40%|████      | 4/10 [00:05<00:08,  1.47s/it]

Loading 1988_contributions.csv...


  data = pd.read_csv(filepath)
 50%|█████     | 5/10 [00:08<00:08,  1.74s/it]

Loading 1990_contributions.csv...


  data = pd.read_csv(filepath)
 60%|██████    | 6/10 [00:11<00:09,  2.46s/it]

Loading 1992_contributions.csv...


  data = pd.read_csv(filepath)
 70%|███████   | 7/10 [00:17<00:10,  3.57s/it]

Loading 1994_contributions.csv...


  data = pd.read_csv(filepath)
 80%|████████  | 8/10 [00:25<00:09,  4.78s/it]

Loading 1996_contributions.csv...


  data = pd.read_csv(filepath)
 90%|█████████ | 9/10 [00:39<00:07,  7.69s/it]

Loading 1998_contributions.csv...


  data = pd.read_csv(filepath)
100%|██████████| 10/10 [01:16<00:00,  7.68s/it]
