In [4]:
# 02_fetch_recent_filings.ipynb

import requests
import json
import pandas as pd
import os
import re

# Ensure the data directory exists before anything is read from or written to it
os.makedirs('../data', exist_ok=True)

# Load the CIK list. CIK is read as text so pandas never coerces it to a numeric
# dtype: an integer column drops any leading zeros stored in the file, and a
# single blank cell turns the whole column into floats (e.g. "1738134.0"),
# which would corrupt every padded identifier below.
cik_df = pd.read_csv('../data/masterworks_entity_list.csv', dtype={'CIK': str})
# Left-pad to 10 digits; rows with a missing CIK stay missing instead of
# becoming a bogus padded string.
cik_df['CIK'] = cik_df['CIK'].str.strip().str.zfill(10)

# Load the active SPV entities list
active_spv_df = pd.read_csv('../data/active_spv_entities.csv')

# Function to normalize company names
def normalize_name(name):
    """Return a canonical form of a company name for matching.

    Removes every character that is neither a word character nor whitespace,
    collapses each run of whitespace into a single space, trims both ends and
    lower-cases the result.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', name)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip().lower()

# Derive a normalized matching key for the names in both lists
cik_df['Normalized Name'] = cik_df['Company Name'].map(normalize_name)
active_spv_df['Normalized Name'] = active_spv_df['Entity'].map(normalize_name)

# Keep only the CIK rows whose normalized name also appears among the active SPVs
filtered_cik_df = cik_df.loc[
    cik_df['Normalized Name'].isin(active_spv_df['Normalized Name'])
]

# Identifies this client to the SEC servers on every request
headers = {
    'User-Agent': 'Ahmet Besiroglu (abesiroglu@masterworks.com)'
}

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole run
REQUEST_TIMEOUT_SECONDS = 30

# Column order of the output file; passing it explicitly keeps the header row
# in the CSV even when no filing was found at all
FILING_COLUMNS = [
    'CIK', 'Company Name', 'form', 'accession_number', 'filing_date',
    'primary_document', 'document_url', 'txt_file_url',
]

all_filings_data = []

# One session for the whole run so the HTTP connection is reused across entities
with requests.Session() as session:
    session.headers.update(headers)

    for _, row in filtered_cik_df.iterrows():
        cik = row['CIK']
        company_name = row['Company Name']
        url = f'https://data.sec.gov/submissions/CIK{cik}.json'

        # A network failure for one entity is reported and skipped rather than
        # aborting the loop and losing everything collected so far
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            print(f"Failed to download data for CIK {cik}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to download data for CIK {cik}. Status code: {response.status_code}")
            continue

        # A 200 response with a body that is not valid JSON raises ValueError
        try:
            submission_history = response.json()
        except ValueError as exc:
            print(f"Failed to parse data for CIK {cik}. Error: {exc}")
            continue

        recent_filings = submission_history.get('filings', {}).get('recent', {})
        if not isinstance(recent_filings, dict):
            continue

        # 'recent' holds parallel lists: index i of each list describes filing i
        forms = recent_filings.get('form', [])
        accession_numbers = recent_filings.get('accessionNumber', [])
        filing_dates = recent_filings.get('filingDate', [])
        primary_documents = recent_filings.get('primaryDocument', [])

        # Filter for the most recent 1-K and 1-SA filings
        latest_filings = {'1-K': None, '1-SA': None}
        for form, accession_number, filing_date, primary_document in zip(forms, accession_numbers, filing_dates, primary_documents):
            if form not in latest_filings:
                continue
            current = latest_filings[form]
            # Filing dates are 'YYYY-MM-DD' strings, so plain string comparison
            # orders them chronologically
            if current is not None and filing_date <= current['filing_date']:
                continue
            # Archive folder of this filing: the accession number without dashes
            archive_dir = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number.replace('-', '')}"
            latest_filings[form] = {
                'CIK': cik,
                'Company Name': company_name,
                'form': form,
                'accession_number': accession_number,
                'filing_date': filing_date,
                'primary_document': primary_document,
                'document_url': f"{archive_dir}/{primary_document}",
                'txt_file_url': f"{archive_dir}/{accession_number}.txt"
            }

        # Keep only the form types for which a filing was actually found
        all_filings_data.extend(filing for filing in latest_filings.values() if filing)

# Save all filings data to a CSV file
df_filings = pd.DataFrame(all_filings_data, columns=FILING_COLUMNS)
print(df_filings)

output_path = '../data/recent_filings.csv'
df_filings.to_csv(output_path, index=False)
print(f"Most recent filings data has been written to '{output_path}'")


            CIK          Company Name  form      accession_number filing_date  \
0    0001738134  Masterworks 001, LLC   1-K  0001493152-24-016270  2024-04-26   
1    0001738134  Masterworks 001, LLC  1-SA  0001493152-23-034648  2023-09-28   
2    0001791539  Masterworks 004, LLC   1-K  0001493152-24-016271  2024-04-26   
3    0001791539  Masterworks 004, LLC  1-SA  0001493152-23-034125  2023-09-27   
4    0001794758  Masterworks 005, LLC   1-K  0001493152-24-016272  2024-04-26   
..          ...                   ...   ...                   ...         ...   
531  0001976509  Masterworks 287, LLC  1-SA  0001493152-23-034387  2023-09-27   
532  0001977673  Masterworks 288, LLC   1-K  0001493152-24-016636  2024-04-26   
533  0001977673  Masterworks 288, LLC  1-SA  0001493152-23-034388  2023-09-27   
534  0001977675  Masterworks 289, LLC   1-K  0001493152-24-016637  2024-04-26   
535  0001977675  Masterworks 289, LLC  1-SA  0001493152-23-034389  2023-09-27   

               primary_docu

In [None]:
# 02_fetch_recent_filings.ipynb

import requests
import json
import pandas as pd
import os
from fuzzywuzzy import process

# Ensure the data directory exists before anything is read from or written to it
os.makedirs('../data', exist_ok=True)

# Load the CIK list. CIK is read as text so pandas never coerces it to a numeric
# dtype: an integer column drops any leading zeros stored in the file, and a
# single blank cell turns the whole column into floats (e.g. "1738134.0"),
# which would corrupt every padded identifier below.
cik_df = pd.read_csv('../data/masterworks_entity_list.csv', dtype={'CIK': str})
# Left-pad to 10 digits; rows with a missing CIK stay missing instead of
# becoming a bogus padded string.
cik_df['CIK'] = cik_df['CIK'].str.strip().str.zfill(10)

# Load the active SPV entities list
active_spv_df = pd.read_csv('../data/active_spv_entities.csv')

# Minimum fuzzy-match score an active SPV name must exceed to count as a match.
# NOTE(review): entity names that differ only in their number (e.g. "... 001, LLC"
# vs "... 002, LLC") presumably score well above this cutoff, so an entity that
# is absent from the CIK list may be paired with a neighbour — confirm the value.
FUZZY_MATCH_THRESHOLD = 80

try:
    # Upper-case both name lists so the comparison ignores case
    active_spv_entities = active_spv_df['Entity'].str.upper().tolist()
    company_names = cik_df['Company Name'].str.upper().tolist()

    # For each active SPV keep its single best-scoring company name
    matches = []
    for entity in active_spv_entities:
        best = process.extractOne(entity, company_names)
        # extractOne yields None when there is nothing to choose from; treat
        # that as "no match" rather than letting the unpacking error trigger
        # the fallback below
        if best is None:
            continue
        match, score = best
        if score > FUZZY_MATCH_THRESHOLD:
            matches.append(match)

    # Filter the CIK list based on fuzzy matching results
    filtered_cik_df = cik_df[cik_df['Company Name'].str.upper().isin(matches)]
except Exception as e:
    print(f"Fuzzy matching failed with error: {e}")
    print("Falling back to simple lowercase and capitalize matching.")

    # Simple lowercase and capitalize matching. The comparison runs on a derived
    # key so 'Company Name' keeps its original text; overwriting the column
    # would write title-cased names such as "Llc" into the output file.
    active_spv_entities = active_spv_df['Entity'].str.lower().str.title().tolist()
    title_cased_names = cik_df['Company Name'].str.lower().str.title()
    filtered_cik_df = cik_df[title_cased_names.isin(active_spv_entities)]

# Identifies this client to the SEC servers on every request
headers = {
    'User-Agent': 'Ahmet Besiroglu (abesiroglu@masterworks.com)'
}

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole run
REQUEST_TIMEOUT_SECONDS = 30

# Column order of the output file; passing it explicitly keeps the header row
# in the CSV even when no filing was found at all
FILING_COLUMNS = [
    'CIK', 'Company Name', 'form', 'accession_number', 'filing_date',
    'primary_document', 'document_url', 'txt_file_url',
]

all_filings_data = []

# One session for the whole run so the HTTP connection is reused across entities
with requests.Session() as session:
    session.headers.update(headers)

    for _, row in filtered_cik_df.iterrows():
        cik = row['CIK']
        company_name = row['Company Name']
        url = f'https://data.sec.gov/submissions/CIK{cik}.json'

        # A network failure for one entity is reported and skipped rather than
        # aborting the loop and losing everything collected so far
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            print(f"Failed to download data for CIK {cik}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to download data for CIK {cik}. Status code: {response.status_code}")
            continue

        # A 200 response with a body that is not valid JSON raises ValueError
        try:
            submission_history = response.json()
        except ValueError as exc:
            print(f"Failed to parse data for CIK {cik}. Error: {exc}")
            continue

        recent_filings = submission_history.get('filings', {}).get('recent', {})
        if not isinstance(recent_filings, dict):
            continue

        # 'recent' holds parallel lists: index i of each list describes filing i
        forms = recent_filings.get('form', [])
        accession_numbers = recent_filings.get('accessionNumber', [])
        filing_dates = recent_filings.get('filingDate', [])
        primary_documents = recent_filings.get('primaryDocument', [])

        # Filter for the most recent 1-K and 1-SA filings
        latest_filings = {'1-K': None, '1-SA': None}
        for form, accession_number, filing_date, primary_document in zip(forms, accession_numbers, filing_dates, primary_documents):
            if form not in latest_filings:
                continue
            current = latest_filings[form]
            # Filing dates are 'YYYY-MM-DD' strings, so plain string comparison
            # orders them chronologically
            if current is not None and filing_date <= current['filing_date']:
                continue
            # Archive folder of this filing: the accession number without dashes
            archive_dir = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number.replace('-', '')}"
            latest_filings[form] = {
                'CIK': cik,
                'Company Name': company_name,
                'form': form,
                'accession_number': accession_number,
                'filing_date': filing_date,
                'primary_document': primary_document,
                'document_url': f"{archive_dir}/{primary_document}",
                'txt_file_url': f"{archive_dir}/{accession_number}.txt"
            }

        # Keep only the form types for which a filing was actually found
        all_filings_data.extend(filing for filing in latest_filings.values() if filing)

# Save all filings data to a CSV file
df_filings = pd.DataFrame(all_filings_data, columns=FILING_COLUMNS)
print(df_filings)

output_path = '../data/recent_filings.csv'
df_filings.to_csv(output_path, index=False)
print(f"Most recent filings data has been written to '{output_path}'")
