# Download IRS forms
This notebook gently scrapes the current PDF forms from the public IRS website.

(The download listing spans multiple pages; each page has a table with form names, descriptions, and direct links to the PDF files.)

In [None]:
import re
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from time import time, sleep
from pathlib import Path

In [None]:
# remove the old download
#!rm -rf forms/irs-*

In [None]:
# base address of the site; request paths are appended to it to build full URLs
URL = 'https://www.irs.gov'

In [None]:
def download_pdf(path: str) -> str:
    """
    download pdf-file specified by URL path and save in `forms` folder with prefix `irs-`

    path: site-relative path of the pdf; it is appended to `URL`
    return: saved file name
    raises: requests.HTTPError when the server answers with an error status,
            so an error page is never stored under a `.pdf` name
    """
    name = path.split('/')[-1]
    # a timeout keeps one stalled connection from hanging the whole run
    response = requests.get(f'{URL}{path}', timeout=60)
    # fail loudly instead of writing an HTML error page into a pdf file
    response.raise_for_status()
    # the target folder may not exist yet (e.g. after the old download was removed)
    folder = Path('forms')
    folder.mkdir(parents=True, exist_ok=True)
    (folder / f'irs-{name}').write_bytes(response.content)
    return f'irs-{name}'


In [None]:
def get_all_forms(num_pages: int, max_wait: int = 10) -> list:
    """
    download all forms with descriptions

    Walks the paginated listing, downloads every row whose link text starts
    with `Form ` and whose href matches the `/pub/irs-pdf/f*.pdf` pattern, and
    collects one record per downloaded form. After each parsed page the records
    gathered so far are written to `irs-forms.tmp.csv`.

    num_pages: number of listing pages to visit
    max_wait: exclusive upper bound (in seconds) of the random pause made
              after each download; values below 2 are treated as 2
    return: list of info-dictionaries with keys `type`, `desc`, `path`, `file`
    """
    records = []
    # randint needs low < high, so never let the upper bound drop below 2
    wait_high = max(2, max_wait)
    # visit exactly `num_pages` pages; the `page` query parameter is assumed
    # to be zero-based (first page is page=0) -- TODO confirm against the site
    for page in range(num_pages):
        html = requests.get(f'{URL}/forms-instructions-and-publications?items_per_page=25&find=&page={page}').text
        soup = BeautifulSoup(html, 'html.parser').find('tbody')
        if soup is None:
            # no table on this page: nothing to collect
            continue
        for tr in soup.find_all('tr'):
            td = tr.find_all('td')
            a = td[0].find('a') if td else None
            # skip malformed rows (no link, or no description cell)
            if a is None or len(td) < 2:
                continue
            href = a.get('href', '')
            if a.text.startswith('Form ') and re.match(fr'\/pub\/irs\-pdf\/f\w+\.pdf', href):
                # drop the leading 'Form ' from the link text
                name = a.text.strip()[5:]
                desc = td[1].text.strip()
                file = download_pdf(href)
                # collect metadata
                records.append({'type':name, 'desc':desc, 'path':href, 'file':file })
                # let the server breathe...
                sleep(np.random.randint(1, wait_high))
        # save intermediate results
        pd.DataFrame.from_dict(records).to_csv('irs-forms.tmp.csv', index=False)
        print(f'done: {(page + 1)/num_pages:.2%}', end='\r')
    # share of the collected file names that are actually present on disk
    on_disk = {p.name for p in Path('forms').glob('irs*.pdf')}
    expected = {r['file'] for r in records}
    ok = len(expected & on_disk)/len(expected) if expected else 0.0
    print(f'all done: downloaded {ok:.2%}')
    return records


In [None]:
# the page count is not detected automatically: look it up on the site
num_pages = 110
# scrape every listing page (this call blocks until all downloads are done)
records = get_all_forms(num_pages=num_pages, max_wait=15)
# persist the collected metadata
pd.DataFrame.from_dict(data=records).to_csv(path_or_buf='irs-forms.csv', index=False)

In [None]:
# check result: reload the metadata written by the download cell and display it
data = pd.read_csv('irs-forms.csv')
data

In [None]:
def get_lang(r: dict) -> str:
    """
    extract document language
    which appears at the end of `description` field like ... (Spanish Version)
    or at the end of `type` field as ... (sp)

    r: record (dict or DataFrame row) with `desc` and `type` entries
    return: the lower-cased text found before ` version)` in `desc`, else the
            two-letter code at the end of `type`, else 'en'
    """
    # empty csv cells are read back as NaN (a float), which has no `.lower()`;
    # treat anything that is not text as an empty string
    desc = r['desc'] if isinstance(r['desc'], str) else ''
    label = r['type'] if isinstance(r['type'], str) else ''
    # the description takes precedence over the type suffix
    match = re.match(r'.* \((.*) version\)$', desc.lower())
    if match is None:
        match = re.match(r'.* \(([a-z]{2})\)$', label.lower())
    return 'en' if match is None else match.group(1)

# derive the `lang` column row by row with get_lang, then display the table
data['lang'] = data.apply(get_lang, axis=1)
data

In [None]:
# shorten the most common long language names to two-letter codes
spanish_rows = data['lang'] == 'spanish'
data.loc[spanish_rows, 'lang'] = 'sp'
# evaluated after the assignment above, exactly like the original second step
puerto_rico_rows = data['lang'].str.startswith('puerto ri')
data.loc[puerto_rico_rows, 'lang'] = 'pr'

In [None]:
# see language stats: number of rows per language, largest group first
data.groupby('lang').size().sort_values(ascending=False)

In [None]:
# for every non-English row keep only the part of `type` before the first ' ('
non_english = data['lang'] != 'en'
data.loc[non_english, 'type'] = (
    data.loc[non_english, 'type'].apply(lambda label: label.split(' (')[0])
)
# number of rows per type, largest group first
data.groupby('type').size().sort_values(ascending=False)

In [None]:
# make sure there's no duplicates:
# counts the file names that occur in more than one row (0 means every file is unique)
(data.groupby('file').size() > 1).sum()

IRS forms are mostly labeled with numeric patterns; a few are labeled with letters (W). Each type may have subdivisions, which we put in a separate `ext` column.

In [None]:
def get_type(x):
    """
    return the main type: the leading run of word characters of the label
    """
    main, _rest, _extras = re.match(r'^(\w+)(\W(.*))?$', x).groups()
    return main

def get_ext(x):
    """
    return the type-extras: everything after the first non-word character
    that follows the main type, or None when the label is the main type only
    """
    parts = re.match(r'^(\w+)(\W(.*))?$', x).groups()
    return parts[2]

def fix_ext(x):
    """
    normalize type-extras format

    x: extras value; anything that is not text (None, NaN, ...) is returned unchanged
    return: None for the '(EN-SP)' marker, otherwise the text with the
            abbreviations below applied in order
    """
    if x == '(EN-SP)':
        return None
    # isinstance also accepts str subclasses, which `type(x) == str` skipped
    if not isinstance(x, str):
        return x
    # order matters: each replacement works on the result of the previous one
    for old, new in (('Schedule', 'Sch'), (' 990-', ''), ('orEZ', 'EZ'), ('990PF', 'PF')):
        x = x.replace(old, new)
    return x

# extras are taken from the full label first, because `type` is overwritten next
data['ext'] = data['type'].apply(get_ext).apply(fix_ext)
data['type'] = data['type'].apply(get_type)
data

In [None]:
# save final metadata: overwrite the csv, writing only these columns in this order
data[['type','ext','desc','path','file','lang']].to_csv('irs-forms.csv', index=False)

In [None]:
# downloaded vs metadata:
# True when the number of `irs*.pdf` files in `forms` equals the number of rows that have a file name
len([x for x in Path('forms').glob('irs*.pdf')]) == len(data[~data['file'].isna()])

In [None]:
#!rm irs-forms.tmp.csv

In [None]:
# cleanup for classification
# reload the saved metadata so this cell starts from the file on disk
data = pd.read_csv('irs-forms.csv')
# for type 13614 the extras come from the file name: characters from index 10 up to
# the last four, upper-cased (presumably what follows `irs-f13614` before `.pdf` -- verify)
data.loc[data['type']=='13614','ext'] = data.loc[data['type']=='13614','file'].apply(lambda x:x[10:-4].upper())
# `sub` starts as a copy of `ext` and is narrowed down step by step below
data['sub'] = data['ext']
# only rows that have extras at all
loc = ~data['sub'].isna()
# split the extras on whitespace...
data.loc[loc,'sub'] = data.loc[loc,'sub'].apply(lambda x:str(x).split())
# ...and keep the first token, unless it opens with '(' (then there is no sub)
data.loc[loc,'sub'] = data.loc[loc,'sub'].apply(lambda x:x[0] if x[0][0] != '(' else None)
# recompute the mask: rows whose sub was just set to None drop out
loc = ~data['sub'].isna()
# keep only the part before the first run of non-word characters
data.loc[loc,'sub'] = data.loc[loc,'sub'].apply(lambda x:re.split(r'\W+', str(x))[0])
# a token starting with a digit followed by non-digits is cut down to that first digit
data.loc[loc,'sub'] = data.loc[loc,'sub'].apply(lambda x:x[0] if re.match(r'\d\D+', x) else x)
# drop as many leading characters from `ext` as `sub` is long, then trim spaces and dashes
data.loc[loc,'ext'] = data.loc[loc,:].apply(lambda r:str(r['ext'])[len(str(r['sub'])):].strip(' -'), axis=1)
# overwrite the metadata file, now including the `sub` column
data.to_csv('irs-forms.csv', index=False)

In [None]:
# correction for better pattern matching
# reload the saved metadata so this cell starts from the file on disk
data = pd.read_csv('irs-forms.csv')
# rows whose `sub` is present and consists of upper-case letters A-Z only
select = data[(~data['sub'].isna())&(data['sub'].str.match(r'^[A-Z]+$'))].index
# for those rows put `sub` back in front of `ext` ("SUB EXT");
# the replace removes the ' nan' text that shows up when `ext` is missing
data.loc[select,'ext'] = data.loc[select,['ext','sub']]\
    .apply(lambda r:f"{r['sub']} {r['ext']}".replace(' nan',''), axis=1)
# letter-only subs now live in `ext`, so clear them
data.loc[select,'sub'] = None
# write every value as text, with missing values stored as empty strings
data.fillna('').astype(str).to_csv('irs-forms.csv', index=False)