In [15]:
import os
import zipfile

import pandas as pd
import requests

In [16]:
def check_zip_signature(file_path):
    """Return True when the file begins with the ZIP local-file-header magic.

    Only the first four bytes are inspected; the rest of the file is not
    validated. Errors from opening or reading the file propagate to the caller.
    """
    zip_magic = b'PK\x03\x04'
    with open(file_path, 'rb') as stream:
        leading_bytes = stream.read(len(zip_magic))
    return leading_bytes == zip_magic

# Resolve the archive relative to the current user's home directory rather than
# a machine-specific absolute path, so the cell runs on any account.
zip_path = os.path.join(os.path.expanduser("~/Desktop"), "2021FD.zip")
if not os.path.isfile(zip_path):
    # Nothing has been downloaded yet; report it instead of raising on open().
    print(f"No file found at {zip_path}")
elif check_zip_signature(zip_path):
    print("Valid ZIP file signature.")
else:
    print("Invalid ZIP file signature.")


Valid ZIP file signature.


In [17]:
# Define the URL here so the cell does not depend on a variable that is only
# assigned by a later cell. NOTE(review): 2021 is assumed because the previous
# cell inspects 2021FD.zip -- confirm this is the archive that was probed.
zip_url = "https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2021FD.zip"

# stream=True fetches only the headers; the context manager releases the
# connection without downloading the body.
with requests.get(zip_url, stream=True, timeout=30) as response:
    print("Content-Type:", response.headers.get('Content-Type'))


Content-Type: application/x-zip-compressed


In [19]:
def download_zip(url, save_path, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` and store the response body at ``save_path``.

    The body is streamed in chunks to ``<save_path>.part`` and moved to
    ``save_path`` only after the transfer completed, so the whole archive is
    never held in memory and an interrupted transfer does not leave a
    truncated file under the final name. Failures are reported with ``print``
    rather than raised.

    Args:
        url: Address of the ZIP archive.
        save_path: Destination file path.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        None.
    """
    part_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=timeout) as response:
            print(f"Content-Type: {response.headers.get('Content-Type')}")
            if response.status_code != 200:
                print(f"Failed to download ZIP file. Status code: {response.status_code}")
                return
            with open(part_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        os.replace(part_path, save_path)
        print(f"Downloaded ZIP file to {save_path}")
    except Exception as e:
        print(f"Error downloading ZIP file: {e}")
    finally:
        # After a successful rename the .part file no longer exists; anything
        # still present here is the remainder of a failed transfer.
        if os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError as cleanup_error:
                print(f"Error downloading ZIP file: {cleanup_error}")


In [20]:
def download_zip(url, save_path, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` to ``save_path``, logging the status and content type.

    The body is streamed in chunks to ``<save_path>.part`` and moved to
    ``save_path`` only after the transfer completed, so the whole archive is
    never held in memory and an interrupted transfer does not leave a
    truncated file under the final name. Failures are reported with ``print``
    rather than raised.

    Args:
        url: Address of the ZIP archive.
        save_path: Destination file path.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        None.
    """
    part_path = f"{save_path}.part"
    try:
        print(f"Downloading from {url}")
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=timeout) as response:
            print(f"Response Status Code: {response.status_code}")
            print(f"Response Content-Type: {response.headers.get('Content-Type')}")
            if response.status_code != 200:
                print(f"Failed to download ZIP file. Status code: {response.status_code}")
                return
            with open(part_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        os.replace(part_path, save_path)
        print(f"Downloaded ZIP file to {save_path}")
    except Exception as e:
        print(f"Error downloading ZIP file: {e}")
    finally:
        # After a successful rename the .part file no longer exists; anything
        # still present here is the remainder of a failed transfer.
        if os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError as cleanup_error:
                print(f"Error downloading ZIP file: {cleanup_error}")

def check_zip_signature(file_path):
    """Report whether the file starts with the ZIP local-file-header magic.

    The first four bytes are read, echoed with ``print``, and compared with
    ``PK\\x03\\x04``. Any exception while opening or reading the file is
    printed and turned into a ``False`` result.
    """
    try:
        with open(file_path, 'rb') as stream:
            magic = stream.read(4)
            print(f"File header: {magic}")
            looks_like_zip = (magic == b'PK\x03\x04')
        return looks_like_zip
    except Exception as e:
        print(f"Error checking ZIP file signature: {e}")
        return False

# Example usage
# Years to fetch; each one is substituted into the {year} placeholder of base_url.
years_list = ['2021', '2022', '2023', '2024']
base_url = "https://disclosures-clerk.house.gov/public_disc/financial-pdfs/{year}FD.zip"
# Archives are written into the current user's Desktop folder.
desktop_dir = os.path.expanduser("~/Desktop")

# These top-level names stay in the kernel namespace after the loop finishes,
# holding the values from the last year processed.
for year in years_list:
    zip_url = base_url.format(year=year)
    zip_filename = f"{year}FD.zip"
    zip_path = os.path.join(desktop_dir, zip_filename)
    
    # Download the ZIP file
    download_zip(zip_url, zip_path)
    
    # Verify ZIP file signature
    # NOTE: download_zip only prints on failure, so this check runs regardless
    # and inspects whatever file (if any) is present at zip_path.
    if check_zip_signature(zip_path):
        print(f"{zip_path} is a valid ZIP file.")
    else:
        print(f"{zip_path} is not a valid ZIP file or is corrupted.")


Downloading from https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2021FD.zip
Response Status Code: 200
Response Content-Type: application/x-zip-compressed
Downloaded ZIP file to /Users/paigeblackstone/Desktop/2021FD.zip
File header: b'PK\x03\x04'
/Users/paigeblackstone/Desktop/2021FD.zip is a valid ZIP file.
Downloading from https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2022FD.zip
Response Status Code: 200
Response Content-Type: application/x-zip-compressed
Downloaded ZIP file to /Users/paigeblackstone/Desktop/2022FD.zip
File header: b'PK\x03\x04'
/Users/paigeblackstone/Desktop/2022FD.zip is a valid ZIP file.
Downloading from https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2023FD.zip
Response Status Code: 200
Response Content-Type: application/x-zip-compressed
Downloaded ZIP file to /Users/paigeblackstone/Desktop/2023FD.zip
File header: b'PK\x03\x04'
/Users/paigeblackstone/Desktop/2023FD.zip is a valid ZIP file.
Downloading from https://dis

In [21]:
def extract_zip(zip_path, extract_dir):
    """Unpack every member of the archive at ``zip_path`` into ``extract_dir``.

    Failures are reported with ``print`` instead of being raised: a file that
    is not a readable ZIP archive produces the "Bad ZIP file" message, any
    other error produces the generic extraction message.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_dir: Directory that receives the extracted members.

    Returns:
        None.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(extract_dir)
    except zipfile.BadZipFile:
        print(f"Bad ZIP file: {zip_path}")
    except Exception as e:
        print(f"Error extracting ZIP file: {e}")
    else:
        # Only announce success when no exception interrupted the extraction.
        print(f"Extracted contents to {extract_dir}")

def read_files_from_directory(directory_path):
    """Load every ``.txt`` and ``.xml`` file in a directory into DataFrames.

    ``.txt`` files are parsed with ``pd.read_csv`` using a tab delimiter and
    ``.xml`` files with ``pd.read_xml``. Extensions are matched
    case-insensitively, entries are visited in sorted name order so the result
    is deterministic, and anything that is not a regular file is skipped.
    Only the top level of ``directory_path`` is scanned.

    Args:
        directory_path: Directory whose files should be read.

    Returns:
        dict mapping each loaded file name to its DataFrame, in sorted
        file-name order.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    data_frames = {}
    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)
        if not os.path.isfile(file_path):
            # A sub-directory named e.g. "data.txt" must not reach the parsers.
            continue

        lowered_name = file_name.lower()
        if lowered_name.endswith('.txt'):
            # Text files are assumed to be tab-delimited; adjust if needed.
            data_frames[file_name] = pd.read_csv(file_path, delimiter='\t')
        elif lowered_name.endswith('.xml'):
            data_frames[file_name] = pd.read_xml(file_path)
    return data_frames

In [22]:
# Download, verify, extract and load the disclosure archive for each year.
years_list = ['2021', '2022', '2023', '2024']
base_url = "https://disclosures-clerk.house.gov/public_disc/financial-pdfs/{year}FD.zip"
desktop_dir = os.path.expanduser("~/Desktop")

# `data_frames` is rebound on every iteration, so on its own it would only hold
# the last year after the loop. Keep each year's frames here as well.
frames_by_year = {}

for year in years_list:
    zip_url = base_url.format(year=year)
    zip_filename = f"{year}FD.zip"
    zip_path = os.path.join(desktop_dir, zip_filename)
    extracted_dir = os.path.join(desktop_dir, year)

    # Download the ZIP file
    download_zip(zip_url, zip_path)

    # Verify ZIP file signature; report a bad archive instead of skipping silently.
    if not check_zip_signature(zip_path):
        print(f"{zip_path} is not a valid ZIP file or is corrupted.")
        continue
    print(f"{zip_path} is a valid ZIP file.")

    # Extract the ZIP file
    extract_zip(zip_path, extracted_dir)

    # extract_zip reports failures by printing, so make sure there is a
    # directory to read before listing it.
    if not os.path.isdir(extracted_dir):
        print(f"Skipping {year}: nothing was extracted to {extracted_dir}")
        continue

    # Read extracted files into DataFrames
    data_frames = read_files_from_directory(extracted_dir)
    frames_by_year[year] = data_frames

    # Print the DataFrames or do further processing
    for file_name, df in data_frames.items():
        print(f"Data from {file_name}:")
        print(df.head())

Downloading from https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2021FD.zip
Response Status Code: 200
Response Content-Type: application/x-zip-compressed
Downloaded ZIP file to /Users/paigeblackstone/Desktop/2021FD.zip
File header: b'PK\x03\x04'
/Users/paigeblackstone/Desktop/2021FD.zip is a valid ZIP file.
Extracted contents to /Users/paigeblackstone/Desktop/2021
Data from 2021FD.xml:
  Prefix        Last           First Suffix FilingType StateDst  Year  \
0   None      Aazami         Shervin   None          C     CA30  2021   
1   None  Abdelhamid            Rana   None          C     NY12  2021   
2   None   Achenbach     Jack Joseph   None          A     FL10  2021   
3   None   Achenbach     Jack Joseph   None          C     FL10  2021   
4   None     Acheson  Phelan Douglas   None          W     UT02  2021   

  FilingDate     DocID  
0  2/22/2021  10039551  
1  5/14/2021  10039753  
2   7/8/2021  10041290  
3   7/7/2021  10041275  
4       None   8218052  
Data fr

In [23]:
# `df` is the loop variable left over from the previous cell, i.e. the last
# DataFrame that loop printed.
df.columns.to_list()

['Prefix',
 'Last',
 'First',
 'Suffix',
 'FilingType',
 'StateDst',
 'Year',
 'FilingDate',
 'DocID']