In [None]:
!pip install sec-parsers # base package
!pip install sec-parsers['all'] # installs all extras
!pip install sec-parsers['downloaders'] # installs downloaders extras
!pip install sec-parsers['visualizers'] # installs visualizers extras
!pip install sec-edgar-downloader

Collecting sec-parsers
  Downloading sec_parsers-0.549-py3-none-any.whl.metadata (6.9 kB)
Downloading sec_parsers-0.549-py3-none-any.whl (21 kB)
Installing collected packages: sec-parsers
Successfully installed sec-parsers-0.549
Collecting sec-downloaders (from sec-parsers[all])
  Downloading sec_downloaders-0.4-py3-none-any.whl.metadata (700 bytes)
Collecting sec-visualizers (from sec-parsers[all])
  Downloading sec_visualizers-0.9-py3-none-any.whl.metadata (404 bytes)
Downloading sec_downloaders-0.4-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sec_visualizers-0.9-py3-none-any.whl (3.7 kB)
Installing collected packages: sec-visualizers, sec-downloaders
Successfully installed sec-downloaders-0.4 sec-visualizers-0.9
Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-

In [None]:
from sec_parsers import Filing
from sec_downloaders import SEC_Downloader
import csv

# Build the downloader and register the name / e-mail pair it should use
downloader = SEC_Downloader()
downloader.set_headers("John Doe", "johndoe@example.com")

# Filing to fetch -- swap in the URL of any other filing
url = 'https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm'

# Fetch the document, wrap it in a Filing and parse it so sections can be queried
download = downloader.download(url)
filing = Filing(download)
filing.parse()

# Look up every section whose title matches 'Item 1A'
# (use 'Management\'s Discussion and Analysis' here to pull MD&A instead)
sections = filing.find_all_sections_from_title('Item 1A')

# Show what was found: one title line and one content line per section
for section in sections:
    heading = section.attrib.get('title', 'No Title')
    print(f"Section Title: {heading}")
    body = filing.get_text_from_section(section)
    print(f"Section Content: {body}\n")

# Persist the same sections as a two-column CSV (header row first)
with open('filing.csv', mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Section Title', 'Content'])
    # Rows are produced lazily, one per section, in the order they were found;
    # sections without a 'title' attribute are labelled 'Unknown'.
    writer.writerows(
        [section.attrib.get('title', 'Unknown'), filing.get_text_from_section(section)]
        for section in sections
    )

print("CSV file saved successfully as 'filing.csv'.")


Section Title: ITEM 1A. RISK FACTORS
Section Content: ITEM 1A. RISK FACTORS
You should carefully consider the risks described below together with the other information set forth in this report, which could materially affect our business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently known to us or that we currently deem to be immaterial also may materially adversely affect our business, financial condition and operating results.

Risks Related to Our Ability to Grow Our Business

We may experience delays in launching and ramping the production of our products and features, or we may be unable to control our manufacturing costs.
We have previously experienced and may in the future experience launch and production ramp delays for new products and features. For example, we encountered unanticipated supplier issues that led to delays during the initial ramp of our first Model X and experien

In [None]:
import pandas as pd
# The first column of this CSV is unnamed and holds 0, 1, 2, ... -- it appears to
# be a row index written out by an earlier export. Loading it with index_col=0
# keeps it as the index instead of a spurious "Unnamed: 0" data column, so the
# frame contains only 'Ticker'.
df=pd.read_csv("/content/unique_tickers.csv", index_col=0)
df.head()

Unnamed: 0,Ticker
0,AE.2
1,AMFD.
2,ANTQ
3,AIR
4,ABA.2


In [None]:
import os
import shutil
import re
import csv
from sec_edgar_downloader import Downloader
from sec_parsers import Filing

# Initialize SEC Edgar Downloader
# Module-level client; process_filings_and_extract_sections() below reads this
# global to fetch 10-K filings. The two arguments are placeholder values
# (a company name and an e-mail address) -- replace them with your own.
# NOTE(review): presumably these identify the requester to SEC EDGAR; confirm
# against the sec-edgar-downloader documentation.
edgar_downloader = Downloader("MyCompanyName", "my.email@domain.com")

def _year_from_filing_folder(folder):
    """Return the four-digit year (as a string) encoded in a filing folder name, or None.

    The year is taken from the first two-digit field enclosed by dashes
    (for example the ``24`` in ``0001628280-24-002390``). Two-digit values
    above 50 are read as 19xx, everything else as 20xx.
    """
    match = re.search(r'-(\d{2})-', folder)
    if match is None:
        return None
    two_digits = match.group(1)
    century = "19" if int(two_digits) > 50 else "20"
    return f"{century}{two_digits}"


def _write_section_csv(filing, section_title, ticker, year, output_folder):
    """Write all sections of ``filing`` matching ``section_title`` to one CSV file.

    Returns the path of the CSV that was written, or None (and writes nothing)
    when the filing has no section matching ``section_title``.
    """
    extracted_sections = filing.find_all_sections_from_title(section_title)
    if not extracted_sections:
        return None

    csv_file = os.path.join(output_folder, f"{ticker}_{year}_{section_title.replace(' ', '_')}.csv")
    with open(csv_file, mode='w', encoding='utf-8', newline='') as csv_out:
        writer = csv.writer(csv_out)
        writer.writerow(['Section Title', 'Content'])  # Header
        for section in extracted_sections:
            title = section.attrib.get('title', 'Unknown')
            content = filing.get_text_from_section(section)
            writer.writerow([title, content])
    return csv_file


# Function to download filings and extract specified sections
def process_filings_and_extract_sections(tickers, start_year, end_year, sections_to_extract):
    """Download 10-K filings per ticker and export the requested sections to CSV.

    For every ticker the module-level ``edgar_downloader`` fetches the 10-K
    filings into ``sec-edgar-filings/<ticker>/10-K/``. Each filing folder whose
    year (parsed from the folder name) lies in ``[start_year, end_year]`` is
    parsed with ``Filing`` and every requested section is written to
    ``extracted_sections_results/<ticker>_<year>_<section>.csv``. The download
    directory of a ticker is removed afterwards, whether or not processing
    succeeded, so a failed ticker does not leave stale filings behind.

    Args:
        tickers: Iterable of ticker symbols to process.
        start_year: First year (inclusive) of filings to keep.
        end_year: Last year (inclusive) of filings to keep.
        sections_to_extract: Iterable of section titles, e.g. ``["Item 1A"]``.

    Returns:
        A dict with the keys ``"processed"`` (tickers handled without a
        ticker-level error), ``"failed"`` (tickers whose download or directory
        scan raised) and ``"missing_sections"`` (ticker -> list of years for
        which a requested section was not found). The same information is
        printed as a summary.

    Errors are reported by printing and never propagate: a failure on a single
    file skips that file, a failure on a ticker moves on to the next ticker.
    """
    # Create a folder for saving extracted sections
    output_folder = "extracted_sections_results"
    os.makedirs(output_folder, exist_ok=True)

    # Track tickers successfully processed, failed, and missing sections
    tickers_processed = []
    tickers_failed = []
    missing_sections = {}

    for ticker in tickers:
        # Directory the downloader writes this ticker's 10-K filings into.
        # Defined before the try so the finally clause can always clean it up.
        filings_dir = f"sec-edgar-filings/{ticker}/10-K/"
        try:
            print(f"Processing filings for ticker: {ticker}")

            # Download 10-K filings for the given ticker
            edgar_downloader.get("10-K", ticker)

            # Years for which at least one requested section was not found
            missing_years = []

            # Process each filing folder in the directory
            for folder in os.listdir(filings_dir):
                folder_path = os.path.join(filings_dir, folder)
                if not os.path.isdir(folder_path):
                    continue

                year = _year_from_filing_folder(folder)
                if year is None:
                    print(f"Year not found in folder: {folder}")
                    continue
                if not (start_year <= int(year) <= end_year):
                    print(f"Skipping filing from year {year} (outside specified range).")
                    continue

                # Process each HTML or text file in the folder
                for file in os.listdir(folder_path):
                    if not file.endswith((".html", ".txt")):
                        continue
                    file_path = os.path.join(folder_path, file)
                    try:
                        # Filings are not guaranteed to be valid UTF-8; replace
                        # undecodable bytes instead of dropping the whole file
                        # on a UnicodeDecodeError.
                        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                            file_content = f.read()

                        filing = Filing(file_content)
                        filing.parse()

                        for section_title in sections_to_extract:
                            csv_file = _write_section_csv(filing, section_title, ticker, year, output_folder)
                            if csv_file is None:
                                missing_years.append(year)
                            else:
                                print(f"Saved '{section_title}' for {ticker} ({year}) to {csv_file}.")

                    except Exception as e:
                        # Best effort: one unreadable/unparseable file must not
                        # abort the remaining files of this ticker.
                        print(f"Error processing {file_path}: {e}")

            # Add ticker to processed list
            tickers_processed.append(ticker)

            # Store missing years for the ticker
            if missing_years:
                missing_sections[ticker] = missing_years

        except Exception as e:
            print(f"Failed to process filings for ticker {ticker}: {e}")
            tickers_failed.append(ticker)

        finally:
            # Clean up: delete the downloaded filing directory even when the
            # ticker failed part-way, so disk usage stays bounded on long runs.
            if os.path.isdir(filings_dir):
                shutil.rmtree(filings_dir, ignore_errors=True)
                print(f"Cleaned up files for {ticker}.")

    # Summary of processing
    print("\nProcessing Summary:")
    print(f"Tickers Processed: {tickers_processed}")
    print(f"Tickers Failed: {tickers_failed}")
    print("Missing Sections:")
    for ticker, years in missing_sections.items():
        print(f"{ticker}: {years}")

    return {
        "processed": tickers_processed,
        "failed": tickers_failed,
        "missing_sections": missing_sections,
    }

# Example usage
if __name__ == "__main__":
    # Tickers to process: the first 5000 entries of the 'Ticker' column
    # (replace with your own list of tickers)
    tickers = df['Ticker'].iloc[:5000].tolist()

    # Year range of filings to keep (both bounds inclusive)
    start_year, end_year = 1900, 2024

    # Section titles to extract (add or remove titles as needed)
    sections_to_extract = ["Item 1A"]

    # Download the filings and export the requested sections
    process_filings_and_extract_sections(tickers, start_year, end_year, sections_to_extract)


Processing filings for ticker: AE.2
Failed to process filings for ticker AE.2: Ticker 'AE.2' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
Processing filings for ticker: AMFD.
Failed to process filings for ticker AMFD.: Ticker 'AMFD.' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
Processing filings for ticker: ANTQ
Failed to process filings for ticker ANTQ: Ticker 'ANTQ' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
Processing filings for ticker: AIR
Saved 'Item 1A' for AIR (2020) to extracted_sections_results/AIR_2020_Item_1A.csv.




Saved 'Item 1A' for AIR (2016) to extracted_sections_results/AIR_2016_Item_1A.csv.
Saved 'Item 1A' for AIR (2009) to extracted_sections_results/AIR_2009_Item_1A.csv.
Saved 'Item 1A' for AIR (2010) to extracted_sections_results/AIR_2010_Item_1A.csv.
Saved 'Item 1A' for AIR (2012) to extracted_sections_results/AIR_2012_Item_1A.csv.
Saved 'Item 1A' for AIR (2021) to extracted_sections_results/AIR_2021_Item_1A.csv.
Saved 'Item 1A' for AIR (2008) to extracted_sections_results/AIR_2008_Item_1A.csv.
Saved 'Item 1A' for AIR (2018) to extracted_sections_results/AIR_2018_Item_1A.csv.
Saved 'Item 1A' for AIR (2014) to extracted_sections_results/AIR_2014_Item_1A.csv.
Saved 'Item 1A' for AIR (2015) to extracted_sections_results/AIR_2015_Item_1A.csv.
Saved 'Item 1A' for AIR (2023) to extracted_sections_results/AIR_2023_Item_1A.csv.
Saved 'Item 1A' for AIR (2024) to extracted_sections_results/AIR_2024_Item_1A.csv.
Saved 'Item 1A' for AIR (2019) to extracted_sections_results/AIR_2019_Item_1A.csv.
Save

In [None]:
import os
import csv
import pandas as pd
import sys

# Increase CSV field size limit to handle large fields.
# sys.maxsize does not fit into a C long on every platform (the call then
# raises OverflowError), so back off until the value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

# Folder where the extracted CSV files are stored
input_folder = "extracted_sections_results"

# Create an output folder to store the yearly CSVs
output_folder = "yearly_combined_results"
os.makedirs(output_folder, exist_ok=True)

# Combined text per year and ticker: {year: {ticker: text}}
yearly_data = {}

# Iterate over all CSV files in the input folder.
# Sorted so the concatenation order (and therefore the output) is reproducible.
for file in sorted(os.listdir(input_folder)):
    if not file.endswith(".csv"):
        continue

    # Extract ticker and year from the file name (e.g., AAPL_2008_Item_1A.csv)
    parts = file.split('_')
    if len(parts) < 2:
        print(f"Skipping {file}: name does not match <ticker>_<year>_<section>.csv.")
        continue
    ticker, year = parts[0], parts[1]

    # Read the section CSV. newline='' is required by the csv module so that
    # line breaks embedded in quoted fields are parsed correctly.
    with open(os.path.join(input_folder, file), 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # Content is in the second column; rows without one are ignored
        content = " ".join(row[1] for row in reader if len(row) > 1)

    # Append the combined content for the ticker and year
    ticker_texts = yearly_data.setdefault(year, {})
    ticker_texts[ticker] = ticker_texts.get(ticker, "") + content.strip() + " "

# Now create CSV files for each year.
# The loop variable is deliberately not called `tickers`, so the notebook-level
# ticker list of that name is not overwritten by this cell.
for year, ticker_texts in yearly_data.items():
    year_file_path = os.path.join(output_folder, f"{year}.csv")

    # One row per ticker; strip the trailing separator space
    rows = [[ticker, combined_content.strip()] for ticker, combined_content in ticker_texts.items()]

    # Write the combined data for each year into the CSV
    with open(year_file_path, mode='w', encoding='utf-8', newline='') as year_csv_file:
        writer = csv.writer(year_csv_file)
        writer.writerow(['Ticker', 'Extracted 1A'])  # Header
        writer.writerows(rows)

    print(f"Saved combined data for {year} to {year_file_path}.")

print("Processing complete. Combined CSVs saved for each year.")


In [None]:
import os
import zipfile

# Source folder holding the per-year combined CSVs
input_folder = "yearly_combined_results"

# Name of the archive to produce
output_zip = "combined_yearly_data.zip"

# Bundle every *.csv from the source folder into one deflate-compressed archive
with zipfile.ZipFile(output_zip, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for csv_name in filter(lambda name: name.endswith(".csv"), os.listdir(input_folder)):
        # Store each entry under its bare file name, without the folder prefix
        zipf.write(os.path.join(input_folder, csv_name), arcname=csv_name)

print(f"All CSV files have been zipped into {output_zip}.")


In [None]:
from google.colab import files
import shutil

# Archive produced by the zip step, relative to the working directory
zip_file_path = "combined_yearly_data.zip"

# Location the archive is moved to and downloaded from
download_target = '/content/combined_yearly_data.zip'

# Move the archive to the download location, then start the browser download
shutil.move(zip_file_path, download_target)
files.download(download_target)
