In [2]:
import requests
from datetime import datetime
import os
from zipfile import ZipFile
import gzip
import shutil
import xmltodict
import pandas as pd

In [3]:
r = requests.get('https://reports.adviserinfo.sec.gov/reports/CompilationReports/CompilationReports.manifest.json')
filenames = r.json()['files']
filenames = [x['name'] for x in filenames]

In [4]:
# only get investment adviser firms (not individuals) for now
filenames = [x for x in filenames if x.startswith('IA_FIRM')]

In [5]:
BASE_XML_URL = ['https://reports.adviserinfo.sec.gov/reports/CompilationReports/']*len(filenames)

In [6]:
urls = []
for (base_url, file) in zip(BASE_XML_URL, filenames):
    urls.append(base_url + file)

In [7]:
urls

['https://reports.adviserinfo.sec.gov/reports/CompilationReports/IA_FIRM_SEC_Feed_03_20_2023.xml.gz',
 'https://reports.adviserinfo.sec.gov/reports/CompilationReports/IA_FIRM_STATE_Feed_03_20_2023.xml.gz']

In [8]:
DOWNLOAD_DIR = 'data/sec'

In [9]:
if not os.path.isdir(DOWNLOAD_DIR):
    os.makedirs(DOWNLOAD_DIR)

In [10]:
def download_file(url, download_dir = DOWNLOAD_DIR):
    filename = url.split('/')[-1]
    path = os.path.join(download_dir, filename)
    with requests.get(url, stream=True) as r:
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return filename, path

In [11]:
def zipfile_builtin(download_dir, filename):
    with ZipFile(os.path.join(download_dir, filename),
                 'r') as in_file:
        in_file.extractall(os.path.join(download_dir,
                                        '.'.join(filename.split('.')[:-2])
                                       )
                          )

In [12]:
def gunzip_shutil(download_dir, filename, block_size=65536):
    with gzip.open(os.path.join(download_dir, filename), 'rb') as s_file,\
         open(os.path.join(download_dir,'.'.join(filename.split('.')[:-1])), 'wb') as d_file:
        shutil.copyfileobj(s_file, d_file, block_size)

In [13]:
for url in urls:
    filename, filename_path = download_file(url, DOWNLOAD_DIR)
    filename_format = filename.split('.')[-1]
    print(filename, filename_path, filename_format)
    if filename_format == 'zip':
        zipfile_builtin(DOWNLOAD_DIR, filename)
    elif filename_format == 'gz':
        gunzip_shutil(DOWNLOAD_DIR, filename)
    else:
        print(f'WARNING: unknown file format: {local_filename}')

IA_FIRM_SEC_Feed_03_20_2023.xml.gz data/sec\IA_FIRM_SEC_Feed_03_20_2023.xml.gz gz
IA_FIRM_STATE_Feed_03_20_2023.xml.gz data/sec\IA_FIRM_STATE_Feed_03_20_2023.xml.gz gz


In [18]:
xml_files = [
             ('IA_FIRM_STATE_Feed_03_20_2023.xml', ['data/sec/IA_FIRM_STATE_Feed_03_20_2023.xml']),
             ('IA_FIRM_SEC_Feed_03_20_2023.xml', ['data/sec/IA_FIRM_SEC_Feed_03_20_2023.xml']),
            ]

In [19]:
PROCESSED_DIR = DOWNLOAD_DIR

In [20]:
for filename, file_path in xml_files:
    print(filename, file_path)
    dfs = []
    for fp in file_path:
        with open(fp, encoding = 'ISO-8859-1') as fd:
            data_dict = xmltodict.parse(fd.read(), encoding = 'ISO-8859-1')
    
        if data_dict.get('IAPDFirmSECReport'):
            data = data_dict.get('IAPDFirmSECReport').get('Firms').get('Firm')
        elif data_dict.get('IAPDFirmStateReport'):
            data = data_dict.get('IAPDFirmStateReport').get('Firms').get('Firm')
        elif data_dict.get('IAPDIndividualReport'):
            data = data_dict.get('IAPDIndividualReport').get('Indvls').get('Indvl')
        else:
            print('WARNING: Unrecognized data!')
        print(f'adding data for {fp}')
        df_flat = pd.json_normalize(data)
        
        # TODO: check that WebAddr & Rgltr are indeed the only fields with nested data
        df_flat = df_flat.loc[:, df_flat.columns.str.contains('@|WebAddr|Rgltr')]
        dfs.append(df_flat)
        
    df_all = pd.concat(dfs)
    filename_csv = filename.replace('.xml', '.csv')
    file_path_csv = os.path.join(PROCESSED_DIR, filename_csv)

    print(f'creating {fp} in {file_path_csv}')
    df_all.to_csv(file_path_csv, index=False)

IA_FIRM_STATE_Feed_03_20_2023.xml ['data/sec/IA_FIRM_STATE_Feed_03_20_2023.xml']
adding data for data/sec/IA_FIRM_STATE_Feed_03_20_2023.xml
creating data/sec/IA_FIRM_STATE_Feed_03_20_2023.xml in data/sec\IA_FIRM_STATE_Feed_03_20_2023.csv
IA_FIRM_SEC_Feed_03_20_2023.xml ['data/sec/IA_FIRM_SEC_Feed_03_20_2023.xml']
adding data for data/sec/IA_FIRM_SEC_Feed_03_20_2023.xml
creating data/sec/IA_FIRM_SEC_Feed_03_20_2023.xml in data/sec\IA_FIRM_SEC_Feed_03_20_2023.csv
