In [10]:
# Import packages
import pandas as pd
import requests
import json
import edgar

### Downlaod Master Indices

In [11]:
# Download master indices from SEC
year_begin = 2023
user_agent = "AnthonyNing/1.0 (pn2189@nyu.edu; For educational purposes)"
edgar.download_index('.', year_begin, user_agent, skip_all_present_except_last=False)

### Retrieve the latest S&P500 composition

In [19]:
# Load historical sp500 composition
sp500_com = pd.read_excel('sp500_composition.xlsx')

In [40]:
# Present year
today = pd.to_datetime('today').year

# Retrieve the sp500 composition from the previous quarter
def update_indices(year, quarter):
    # Assign timestamps for different quarters
    if quarter == 1:
        date = pd.Timestamp(year, 1, 1)
    elif quarter == 2:
        date = pd.Timestamp(year, 4, 1)
    elif quarter == 3:
        date = pd.Timestamp(year, 7, 1)
    else:
        date = pd.Timestamp(year, 10, 1)

    # Find the closest date to the assigned dates to the quarters
    closest_date = find_close_date(date)

    # Export company names, tickers, permnos of S&P500 based on the closest date
    sp_data = sp500_com.loc[sp500_com.Date==closest_date, ['Company Name', 'Ticker', 'Permno']]
    
    return sp_data


# Find the closest date to the assigned dates to the quarters
def find_close_date(date):
    
    close_date = pd.Timestamp(2000, 1, 1)
    time_delta = date - close_date
    
    for row in sp500_com.iterrows():
        if row[1][1] <= date and date - row[1][1] < time_delta:
            close_date = row[1][1]
            time_delta = date - close_date
    return close_date

In [41]:
# URL to the SEC's CIK-to-Ticker mapping file
url = "https://www.sec.gov/files/company_tickers.json"
headers = {'User-Agent':"AnthonyNing/1.0 (pn2189@nyu.edu; For educational purposes)"}

# Download the file
response = requests.get(url, headers=headers)
data = response.json()

# Function to get ticker from CIK
def get_ticker_from_cik(cik):
    for company in data.values():
        if company['cik_str'] == cik:
            return company['ticker']
    return None

In [43]:
company_name = []
tik_lst = []
cik_lst = []
per_lst = []
time = []
form = []
file = []
year_begin = 2023

for year in range(year_begin, today+1):
    
    # There's been only 3 quarters so far in 2024
    if year==today:
        quarters = 4
    else:
        quarters = 5
    
    for quarter in range(1, quarters):
        
        # Loading master index
        filename = str(year)+'-QTR'+str(quarter)+'.tsv'
        mas_ind = pd.read_csv(filename, sep='|', header=None)
        
        # Retrieving tickers using the updated(latest) sp500 composition
        sp_lst = update_indices(year, quarter)
        sp_c = list(sp_lst['Company Name'])
        sp_t = list(sp_lst['Ticker'])
        sp_p =list(sp_lst['Permno'])
        
        # Checking every file entry
        for i in range(len(mas_ind.loc[:, 0])):

            # Getting tickers using cik from master index
            cik = int(mas_ind.loc[i, 0])
            ticker = get_ticker_from_cik(cik)

            # Checking if the ticker is in sp500
            if ticker in sp_t:
                
                # Checking if the file is 10K or 10Q
                if mas_ind.loc[i, 2] == '10-K' or mas_ind.loc[i, 2]=='10-K/A' or mas_ind.loc[i, 2]=='10-Q' or mas_ind.loc[i, 2]=='10-Q/A':
                    
                    company_name.append(mas_ind.loc[i, 1])
                    tik_lst.append(ticker)
                    cik_lst.append(mas_ind.loc[i, 0])
                    per_lst.append(sp_p[sp_t.index(ticker)])
                    time.append(mas_ind.loc[i, 3])
                    form.append(mas_ind.loc[i, 2])
                    file.append(mas_ind.loc[i, 4])
                    
                else:
                    continue
            else:
                continue

  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]
  if row[1][1] <= date and date - row[1][1] < time_delta:
  close_date = row[1][1]


In [46]:
# Create dataframe
file_dict = {'form_type': form, 'company_name': company_name, 'permno': per_lst, 
             'ticker':tik_lst, 'cik': cik_lst, 'filed_date': time, 'filename': file}
df = pd.DataFrame(file_dict)
df

Unnamed: 0,form_type,company_name,permno,ticker,cik,filed_date,filename
0,10-K,HENRY SCHEIN INC,82581,HSIC,1000228,2023-02-21,edgar/data/1000228/0001000228-23-000011.txt
1,10-K,WATERS CORP /DE/,82651,WAT,1000697,2023-02-27,edgar/data/1000697/0001193125-23-050827.txt
2,10-Q,ESTEE LAUDER COMPANIES INC,82642,EL,1001250,2023-02-02,edgar/data/1001250/0001001250-23-000010.txt
3,10-Q,"NetApp, Inc.",82598,NTAP,1002047,2023-03-01,edgar/data/1002047/0000950170-23-005545.txt
4,10-K,AMEREN CORP,24985,AEE,1002910,2023-02-22,edgar/data/1002910/0001002910-23-000053.txt
...,...,...,...,...,...,...,...
3449,10-K,SYSCO CORP,52038,SYY,96021,2024-08-28,edgar/data/96021/0000096021-24-000128.txt
3450,10-Q,TELEFLEX INC,44329,TFX,96943,2024-08-02,edgar/data/96943/0000096943-24-000132.txt
3451,10-Q,"TERADYNE, INC",51369,TER,97210,2024-08-02,edgar/data/97210/0000950170-24-089858.txt
3452,10-Q,TEXAS INSTRUMENTS INC,15579,TXN,97476,2024-07-24,edgar/data/97476/0000097476-24-000030.txt


In [47]:
# Write data into csv format
df.to_csv('sp500_files.csv', index=False)