### Scraping Wikipedia to obtain all company names that make up the composite S&P 1500 (06.07.2025)

In [1]:
import json
from pathlib import Path

import pandas as pd
import requests

- S&P 500

https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

In [40]:
# Step 1: Scrape the S&P 500 list from Wikipedia
# (assumes the constituents list is the first table on the page)
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url)
sp500_table = tables[0]  # First table on the page

# Keep the ticker symbol, company name and CIK columns
sp500_df = sp500_table[['Symbol', 'Security', 'CIK']].copy()
sp500_df["CIK"] = sp500_df["CIK"].astype(str).str.zfill(10)  # Left-pad CIK with zeros to 10 characters

# Save to CSV. Create the output folder first: to_csv raises an OSError when
# the target directory does not exist (e.g. on a fresh checkout).
Path("data").mkdir(parents=True, exist_ok=True)
sp500_df.to_csv("data/sp500_companies.csv", index=False)

In [41]:
# Preview the first five rows as a quick sanity check
sp500_df.head(n=5)

Unnamed: 0,Symbol,Security,CIK
0,MMM,3M,66740
1,AOS,A. O. Smith,91142
2,ABT,Abbott Laboratories,1800
3,ABBV,AbbVie,1551152
4,ACN,Accenture,1467373


---
- S&P SmallCap 600 component stocks

https://en.wikipedia.org/wiki/List_of_S%26P_600_companies

In [42]:
# Scrape the S&P 600 list from Wikipedia
# (assumes the constituents list is the first table on the page)
url = "https://en.wikipedia.org/wiki/List_of_S%26P_600_companies"
tables = pd.read_html(url)
sp600_table = tables[0]  # First table on the page

# Keep the ticker symbol, company name and CIK columns. The name column is
# selected as "Company" here, so it is renamed to "Security" to match the
# S&P 500 format. Built as one chain so the cell yields a fresh frame on every
# run instead of mutating one in place.
sp600_df = (
    sp600_table[['Symbol', 'Company', 'CIK']]
    .rename(columns={"Company": "Security"})
    # Left-pad CIK with zeros to 10 characters
    .assign(CIK=lambda frame: frame["CIK"].astype(str).str.zfill(10))
)
sp600_df.head()

Unnamed: 0,Symbol,Security,CIK
0,AAP,"Advance Auto Parts, Inc.",1158449
1,AAT,American Assets Trust,1500217
2,ABCB,Ameris Bancorp,351569
3,ABG,Asbury Automotive Group,1144980
4,ABM,"ABM Industries, Inc.",771497


In [43]:
# Save to CSV. Create the output folder first: to_csv raises an OSError when
# the target directory does not exist (e.g. on a fresh checkout).
Path("data").mkdir(parents=True, exist_ok=True)
sp600_df.to_csv("data/sp600_companies.csv", index=False)

---
- S&P 400 companies

https://en.wikipedia.org/wiki/List_of_S%26P_400_companies

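Unfortunately, Wikipedia does not provide the CIKs for these companies. They are therefore looked up by ticker symbol in the SEC's `company_tickers.json` file (https://www.sec.gov/files/company_tickers.json), which must be downloaded into the notebook's working directory before running the cells below.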

In [44]:
# Scrape the S&P 400 list from Wikipedia
# (assumes the constituents list is the first table on the page)
url = "https://en.wikipedia.org/wiki/List_of_S%26P_400_companies"
tables = pd.read_html(url)
sp400_table = tables[0]  # First table on the page

# Keep the ticker symbol and company name columns
sp400_df = sp400_table[['Symbol', 'Security']].copy()

# Extract the ticker symbols that need a CIK lookup
company_symbols = sp400_df['Symbol'].tolist()

In [45]:
# Load the company tickers JSON file. JSON text is UTF-8, so the encoding is
# given explicitly instead of relying on the platform default.
with open('company_tickers.json', encoding='utf-8') as tickers_file:
    data = json.load(tickers_file)

# Map each upper-cased ticker to its CIK, left-padded with zeros to 10
# characters. If a ticker occurs more than once, the last entry wins.
lookup = {
    entry['ticker'].upper(): str(entry['cik_str']).zfill(10)
    for entry in data.values()
}

# Convert to a two-column DataFrame (Symbol, CIK)
lookup_df = pd.DataFrame(list(lookup.items()), columns=['Symbol', 'CIK'])
lookup_df.head()

Unnamed: 0,Symbol,CIK
0,MSFT,789019
1,AAPL,320193
2,NVDA,1045810
3,GOOGL,1652044
4,AMZN,1018724


In [None]:
# Attach the CIK to each S&P 400 company by ticker symbol.
# Any CIK column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce CIK_x / CIK_y and the lookup of 'CIK' below
# would raise a KeyError. validate= guards against duplicated rows should the
# lookup ever contain a ticker more than once.
sp400_df = (
    sp400_df
    .drop(columns='CIK', errors='ignore')
    .merge(lookup_df, on='Symbol', how='left', validate='many_to_one')
)

# Show rows with missing CIKs
missing_cik = sp400_df[sp400_df['CIK'].isna()]
for symbol, security in zip(missing_cik['Symbol'], missing_cik['Security']):
    print(f"Missing CIK for {symbol} - {security}")

Missing CIK for GTM - ZoomInfo


In [None]:
# The ticker GTM was reported without a CIK, so fill in the manually
# looked-up value 0001794515.
is_gtm = sp400_df['Symbol'] == 'GTM'
sp400_df.loc[is_gtm, 'CIK'] = '0001794515'

In [47]:
# Save the DataFrame to a CSV file. Create the output folder first: to_csv
# raises an OSError when the target directory does not exist (e.g. on a fresh
# checkout).
Path("data").mkdir(parents=True, exist_ok=True)
sp400_df.to_csv("data/sp400_companies.csv", index=False)