In [1]:
# import libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import regex as re
import unicodedata

### Function to Scrape a Single 10-K File and Extract the Income Tax Paid Value

In [None]:
def _collect_tax_candidates(pattern, section):
    """Return ``(full_match, tail)`` pairs for every match of ``pattern`` in ``section``.

    ``pattern`` must define a named group ``prefix`` that captures the leading
    context. ``tail`` is the match with that context removed, so it always
    starts at the matched keyword even when fewer than 50 characters of
    context were available.
    """
    return [
        (match.group(0), match.group(0)[len(match.group('prefix')):])
        for match in pattern.finditer(section)
    ]


def find_income_tax_paid(url, timeout=30):
    """Download one 10-K filing and extract the first "income tax paid" figure.

    The page text is flattened to a single ASCII line, then searched for
    progressively narrower sections (Supplemental Information -> Consolidated
    Cash Flow -> Cash Paid). Candidate snippets found inside the narrowest
    sections are tried first.

    Parameters
    ----------
    url : str
        Address of the filing to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the run.
        Defaults to 30 seconds.

    Returns
    -------
    str or None
        The first numeric token found in the highest-priority candidate,
        exactly as matched (it keeps the leading whitespace / parenthesis and
        the single trailing non-digit character). ``None`` when no candidate
        or no number is found.

    Notes
    -----
    The HTTP status code is not checked; an error page is parsed like any
    other document. Exceptions raised by ``requests.get`` propagate.
    """
    # request file from url
    response = requests.get(url, headers = {'User-agent': 'goyalavantika'}, timeout=timeout)

    # create beautifulsoup object and normalize text
    html = bs(response.content, "html.parser")
    text = html.get_text(' ', strip=True)
    text = unicodedata.normalize("NFKD", text).encode('ascii', 'ignore').decode('utf8')
    # drop dollar signs, then collapse EVERY run of whitespace (spaces, newlines,
    # tabs) to one space; a single replace('  ', ' ') pass leaves double spaces
    # behind, which the single-\s patterns below cannot match
    text = " ".join(text.replace('$', '').split())

    # section patterns: 50 characters of context, the heading, then a fixed window
    supp_info_pattern = re.compile(
        r'(?:.){50}Supplementa(?:l|ry)\s(?:Information|Data|Disclosures)(?:.){5000}', flags=re.I)
    cons_cf_pattern = re.compile(
        r'(?:.){50}(?:Consolidated|Additional)\sCash\sFlow(?:.){2000}', flags=re.I)
    cash_paid_pattern = re.compile(
        r'(?:.){50}(?:Tax|Cash)\s(?:Paid|Payments?)\s(?:for|of)?(?:\sIncome)?(?:\sTax)?(?:.){1000}', flags=re.I)

    # strict "income tax paid" pattern used on the entire file
    tax_paid_pattern = re.compile(
        r'(?P<prefix>(?:.){50})(?:NET\s)?INCOME\sTAX(?:ES)?\s(?:PAID)?(?:.){150}', flags=re.I)

    # more general keyword-pair pattern used when searching within a section;
    # only the trailing window differs per section type (compiled once, not per loop pass)
    keyword_pair = (r'(?P<prefix>(?:.){0,50})(?:income|paid|payments?|tax(?:es)?)\s'
                    r'(?:\D+\s){0,3}(?:income|paid|payments?|tax(?:es)?)')
    supp_tax_pattern = re.compile(keyword_pair + r'\s(?:.){100}', flags=re.I)
    cons_cf_tax_pattern = re.compile(keyword_pair + r'(?:.){100}', flags=re.I)
    cash_paid_tax_pattern = re.compile(keyword_pair + r'(?:.){125}', flags=re.I)

    # search the entire file for each section type and for direct candidates
    supp_info_list = supp_info_pattern.findall(text)
    cons_cf_list = cons_cf_pattern.findall(text)
    cash_paid_list = cash_paid_pattern.findall(text)
    candidates = _collect_tax_candidates(tax_paid_pattern, text)

    # newer finds are prepended everywhere so that matches from the most
    # specific sections are examined first

    # within each Supplementary Data/Info section
    for supp_info in supp_info_list:
        cons_cf_list = cons_cf_pattern.findall(supp_info) + cons_cf_list
        cash_paid_list = cash_paid_pattern.findall(supp_info) + cash_paid_list
        candidates = _collect_tax_candidates(supp_tax_pattern, supp_info) + candidates

    # within each Consolidated Cash Flow section
    for cons_cf in cons_cf_list:
        cash_paid_list = cash_paid_pattern.findall(cons_cf) + cash_paid_list
        candidates = _collect_tax_candidates(cons_cf_tax_pattern, cons_cf) + candidates

    # within each Cash Paid section
    for cash_paid in cash_paid_list:
        candidates = _collect_tax_candidates(cash_paid_tax_pattern, cash_paid) + candidates

    # discard any match (context included) that mentions provision, loss,
    # refunded or reserves, then strip the context and remove duplicates
    # while keeping priority order
    excluded_terms = ('provision', 'loss', 'refunded', 'reserves')
    tails = list(dict.fromkeys(
        tail for full_match, tail in candidates
        if not any(term in full_match.lower() for term in excluded_terms)
    ))

    # find the first numerical value
    numerical_pattern = re.compile(
        r'\s(?:\((?:\s)?)?\d{1,3}(?:,\d{3})?(?:,\d{3})?(?:\.\d{1,3})?(?:\s?million)?(?:\s?\))?\D', flags=re.I)
    for tail in tails:
        number = numerical_pattern.search(tail)
        if number:
            return number.group(0)

    return None

### Function to Run the Scraping Function on Multiple 10-K Urls

In [None]:
def find_income_tax_paid_multiple(url_list):
    """Apply ``find_income_tax_paid`` to each URL and tabulate the outcomes.

    Returns a DataFrame with a single column, 'Income Taxes Paid', holding
    one row per entry of ``url_list`` in the same order.
    """
    extracted = [find_income_tax_paid(link) for link in url_list]
    return pd.DataFrame({'Income Taxes Paid': extracted})

### Input List of 10-K URLs for Parsing Here:

In [None]:
# Run the scraper over every 10-K URL and collect the results in a one-column DataFrame.
# NOTE(review): `urls_list_name_here` is a placeholder that is not defined in the visible
# cells; replace it with your own list of URL strings before running this cell.
ITP = find_income_tax_paid_multiple(urls_list_name_here)

### Save Resulting Dataframe to Given Filepath As CSV:

In [None]:
# Write the results to a CSV file, omitting the DataFrame index.
# NOTE(review): 'file_path_here' looks like a placeholder; replace it with the real output path.
ITP.to_csv('file_path_here', index=False)