In [3]:
!pip install -U sec-edgar-downloader
!pip install openai
!pip install pandas
!pip install matplotlib
!pip install numpy



In [13]:
!pip install flask



In [3]:
import os
import glob
import openai
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import json
import re
import shutil

openai.api_key = 'API_KEY_HERE'



In [12]:
from sec_edgar_downloader import Downloader

In [99]:
dl = Downloader("ShuklaInc", "adityashukla2015@gmail.com")
print(dl.get("10-K", "MSFT", after="1994-12-31", before="2024-01-01"))
print(dl.get("10-K", "TSLA", after="1994-12-31", before="2024-01-01"))
print(dl.get("10-K", "AAPL", after="1994-12-31", before="2024-01-01"))

29
13
27


In [10]:
def analyze_company(company_folder, year_folder):
    """
    Analyzes the 10-K filings of a company for a given year by extracting financial insights using OpenAI's GPT model.

    Parameters:
    - company_folder (str): Path to the folder containing the company's 10-K filings.
    - year_folder (str): The specific year folder to analyze within the company's 10-K filings.

    Returns:
    - tuple: A tuple containing the company name, year, and a dictionary of extracted financial insights.
    """
    # Read all submission.txt files in the year_folder
    report_text = ""
    year_folder_path = os.path.join(company_folder, year_folder)
    for submission_folder in os.listdir(year_folder_path):
        submission_folder_path = os.path.join(year_folder_path, submission_folder)
        if os.path.isdir(submission_folder_path):
            submission_file_path = os.path.join(submission_folder_path, 'submission.txt')
            if os.path.exists(submission_file_path):
                with open(submission_file_path, 'r') as file:
                    submission_text = file.read()
                    report_text += submission_text + "\n"
                    print(f"Read report from: {submission_file_path}\n{submission_text}")  # Debug statement

    company_name = os.path.basename(os.path.dirname(company_folder))

    # JSON extraction prompt
    json_prompt = f"""
    From the 10-k filing information that is passed in to you, extract the following information: Revenue (product wise if applicable), Net Income (product wise if applicable), Effective Tax Rate, Deferred Tax Assets, Deferred Tax Liabilities, Foreign Income Percentage, and any other relevant financial information.
    Answer in proper json format. Make sure the format is right that it doesn't face the JSONDecodeError: Expecting ',' delimiter issue. Note: Only return the json, no additional text.
    Example Json (sub-fields may not match completely and change but the units should be same (billions), convert if you need to)
    {{
        "2022": {{
            "Revenue": {{"Compute & Networking": "$26.938 billion", "Graphics": "$11.718 billion"}},
            "Net Income": {{"Compute & Networking": "$7.634 billion", "Graphics": "$2.462 billion"}},
            "Effective Tax Rate": "9.9%",
            "Deferred Tax Assets": "$5.05 billion",
            "Deferred Tax Liabilities": "$339 million",
            "Foreign Income Percentage": "85%"
        }},
        "2021": {{
            ...
        }}
    }}

    Note: main fields ("Revenue", "Net Income", "Effective Tax Rate", "Deferred Tax Assets", "Deferred Tax Liabilities", "Foreign Income Percentage") should be present in the json. However, sub-fields may vary for example for Revenue the source of revenue may be different for different companies or different for same company in different years. You should mention the source of revenue in the sub-fields.
    Also mention profit or loss with positive or negative sign in front of the number.
    Make sure the json format is parseable and correct and doesn't face issues like "Expecting property name enclosed in double quotes", "Expecting ',' delimiter", etc.
    Here are the reports:
    {report_text}
    """

    try:
        response_json = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=json_prompt,
            max_tokens=1500,
            n=1,
            stop=None,
            temperature=0.7,
        )
        json_data = response_json['choices'][0]['text'].strip()
        print(f"Extracted JSON data for {company_name} - {year_folder}: {json_data}")  # Debug statement
        json_insights = json.loads(json_data)
    except Exception as e:
        print(f"Error extracting JSON insights for {company_name} - {year_folder}: {e}")  # Debug statement
        print(f"json_data: {json_data}")  # Debug statement
        json_insights = {}

    return company_name, year_folder, json_insights

def save_insights_to_json(company_name, year_folder, insights):
    """
    Saves the extracted financial insights to a JSON file.

    Parameters:
    - company_name (str): Name of the company.
    - year_folder (str): The year of the financial insights.
    - insights (dict): A dictionary containing the extracted financial insights.
    """
    os.makedirs('insights', exist_ok=True)
    company_folder = os.path.join('insights', company_name)
    os.makedirs(company_folder, exist_ok=True)
    insights_path = os.path.join(company_folder, f"{year_folder}_insights.json")
    with open(insights_path, 'w') as insights_file:
        json.dump(insights, insights_file)
    print(f"Saved insights for {company_name} - {year_folder} to {insights_path}")

base_folder = '/Users/adi_shukla/Documents/sec-edgar-proj/sec-edgar-filings'
companies = ['AAPL', 'MSFT', 'TSLA']

for company in companies:
    company_folder = os.path.join(base_folder, company, '10-K')
    year_folders = sorted(os.listdir(company_folder), reverse=True)  # Sort folders in descending order
    for year_folder in year_folders:
        if os.path.isdir(os.path.join(company_folder, year_folder)):
            company_name, year, insights = analyze_company(company_folder, year_folder)
            save_insights_to_json(company_name, year, insights)

print("Completed processing all companies.")

Extracted JSON data for AAPL - 0001628280-16-020309: {
    "2022": {
        "Revenue": {
            "Total": "$25.93 billion",
            "Compute & Networking": "$26.938 billion",
            "Graphics": "$11.718 billion",
            "Storage": "$5.926 billion",
            "Enterprise, Embedded and Semi-Custom": "$3.087 billion",
            "Other": "$2.462 billion"
        },
        "Net Income": {
            "Total": "$10.634 billion",
            "Compute & Networking": "$7.634 billion",
            "Graphics": "$2.462 billion",
            "Storage": "$0.926 billion",
            "Enterprise, Embedded and Semi-Custom": "$0.087 billion",
            "Other": "$0.134 billion"
        },
        "Effective Tax Rate": "9.9%",
        "Deferred Tax Assets": "$5.05 billion",
        "Deferred Tax Liabilities": "$339 million",
        "Foreign Income Percentage": "85%"
    },
    "2021": {
        "Revenue": {
            "Total": "$23.93 billion",
            "Compute & Networki

In [11]:
def create_json_files_list(base_folder):
    """
    Creates a list of JSON files along with their corresponding years and company names.

    Parameters:
    - base_folder (str): Path to the base folder containing the JSON files.

    Returns:
    - list: A list of tuples, each containing the file path, year, and company name.
    """
    json_files = []
    for root, _, files in os.walk(base_folder):
        company_name = os.path.basename(root)
        year = 2023
        for file in files:
            if file.endswith('.json'):
                json_files.append((os.path.join(root, file), year, company_name))
                year -= 1
    return json_files

def get_summarized_metrics(data):
    """
    Summarizes the total revenue and net income from the provided JSON data using OpenAI's GPT model.

    Parameters:
    - data (dict): The JSON data to be analyzed.

    Returns:
    - str: The summarized metrics in a formatted text.
    """
    prompt = f"Analyze the following JSON data and summarize the total revenue and net income:\n\n{json.dumps(data, indent=2)}\n\n"
    prompt += "Provide the total revenue, total net income, effective tax rate, and foreign income percentage in the format:\nRevenue: [value]\nNet Income: [value]\nEffective Tax Rate: [value]\nForeign Income Percentage: [value]"

    response = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=150
    )
    
    return response.choices[0].text.strip()

def extract_metrics(summary_text):
    """
    Extracts specific financial metrics from the summarized text.

    Parameters:
    - summary_text (str): The summarized text containing financial metrics.

    Returns:
    - tuple: A tuple containing the revenue, net income, tax rate, and foreign income percentage.
    """
    lines = summary_text.split('\n')
    revenue = None
    net_income = None
    tax_rate = None
    foreign_income = None
    
    for line in lines:
        if 'Revenue' in line:
            value = line.split(':', 1)[1].strip().replace('$', '').replace(' million', '').replace(' billion', '').strip()
            try:
                revenue = float(value)
            except ValueError:
                continue
        elif 'Net Income' in line:
            value = line.split(':', 1)[1].strip().replace('$', '').replace(' million', '').replace(' billion', '').strip()
            try:
                net_income = float(value)
            except ValueError:
                continue
        elif 'Effective Tax Rate' in line:
            value = line.split(':', 1)[1].strip().replace('%', '').strip()
            try:
                tax_rate = float(value)
            except ValueError:
                continue
        elif 'Foreign Income Percentage' in line:
            value = line.split(':', 1)[1].strip().replace('%', '').strip()
            try:
                foreign_income = float(value)
            except ValueError:
                continue

    return revenue, net_income, tax_rate, foreign_income

def analyze_json_file(json_file_path, year):
    """
    Analyzes a JSON file to extract and summarize financial metrics.

    Parameters:
    - json_file_path (str): Path to the JSON file to be analyzed.
    - year (int): The year corresponding to the JSON file.

    Returns:
    - tuple: A tuple containing the year, revenue, net income, tax rate, and foreign income percentage.
    """
    print(f"Processing {json_file_path}")  # Debug statement
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    summary_text = get_summarized_metrics(data)
    revenue, net_income, tax_rate, foreign_income = extract_metrics(summary_text)
    
    print(f"Extracted - Year: {year}, Revenue: {revenue}, Net Income: {net_income}, Tax Rate: {tax_rate}, Foreign Income: {foreign_income}")  # Debug statement

    if revenue is not None and net_income is not None:
        return (year, revenue, net_income, tax_rate, foreign_income)
    else:
        return None

def plot_metrics(metrics, company_name):
    """
    Plots the financial metrics for a company over the years.

    Parameters:
    - metrics (list): A list of tuples containing the financial metrics.
    - company_name (str): The name of the company.
    """
    if not metrics:
        print(f"No data to plot for {company_name}.")
        return

    years, revenues, net_incomes, tax_rates, foreign_incomes = zip(*metrics)

    x = range(len(years))
    
    # Plot Revenue and Net Income
    plt.figure(figsize=(15, 8))
    plt.bar(x, revenues, width=0.4, label='Revenue', align='center')
    plt.bar(x, net_incomes, width=0.4, label='Net Income', align='edge')
    
    plt.xlabel('Year')
    plt.ylabel('Value (in billions)')
    plt.title(f'{company_name} Revenue and Net Income Over the Years')
    plt.xticks(x, years, fontsize=8, rotation=45)
    plt.legend()
    
    visualization_dir = f'visualizations/{company_name}'
    os.makedirs(visualization_dir, exist_ok=True)
    
    plt.savefig(f'{visualization_dir}/revenue_net_income_over_years.png')
    plt.close()

    # Plot Effective Tax Rate
    plt.figure(figsize=(15, 8))
    plt.plot(x, tax_rates, marker='o', label='Effective Tax Rate')
    
    plt.xlabel('Year')
    plt.ylabel('Effective Tax Rate (%)')
    plt.title(f'{company_name} Effective Tax Rate Over the Years')
    plt.xticks(x, years, fontsize=8, rotation=45)
    plt.legend()
    
    plt.savefig(f'{visualization_dir}/effective_tax_rate_over_years.png')
    plt.close()

    # Plot Foreign Income Percentage
    plt.figure(figsize=(15, 8))
    plt.plot(x, foreign_incomes, marker='o', label='Foreign Income Percentage')
    
    plt.xlabel('Year')
    plt.ylabel('Foreign Income Percentage (%)')
    plt.title(f'{company_name} Foreign Income Percentage Over the Years')
    plt.xticks(x, years, fontsize=8, rotation=45)
    plt.legend()
    
    plt.savefig(f'{visualization_dir}/foreign_income_percentage_over_years.png')
    plt.close()

base_folder = '/Users/adi_shukla/Documents/sec-edgar-proj/insights'

json_files_list = create_json_files_list(base_folder)

# Group JSON files by company name
companies = {}
for json_file, year, company_name in json_files_list:
    if company_name not in companies:
        companies[company_name] = []
    companies[company_name].append((json_file, year))

# Process and plot metrics for each company
for company_name, company_json_files in companies.items():
    metrics = []
    for json_file, year in company_json_files:
        metric = analyze_json_file(json_file, year)
        if metric:
            metrics.append(metric)
    if metrics:
        plot_metrics(metrics, company_name)
    else:
        print(f"No metrics to plot for {company_name}.")

print("Completed processing the specified JSON files.")

Processing /Users/adi_shukla/Documents/sec-edgar-proj/insights/AAPL/0001193125-13-416534_insights.json
Extracted - Year: 2023, Revenue: 51.655, Net Income: 13.791, Tax Rate: 9.9, Foreign Income: 85.0
Processing /Users/adi_shukla/Documents/sec-edgar-proj/insights/AAPL/0001047469-02-007674_insights.json
Extracted - Year: 2022, Revenue: 508.23, Net Income: 90.562, Tax Rate: 10.2, Foreign Income: 84.0
Processing /Users/adi_shukla/Documents/sec-edgar-proj/insights/AAPL/0000320193-21-000105_insights.json
Extracted - Year: 2021, Revenue: None, Net Income: None, Tax Rate: None, Foreign Income: None
Processing /Users/adi_shukla/Documents/sec-edgar-proj/insights/AAPL/0001628280-16-020309_insights.json
Extracted - Year: 2020, Revenue: 25.93, Net Income: 10.634, Tax Rate: 9.9, Foreign Income: 85.0
Processing /Users/adi_shukla/Documents/sec-edgar-proj/insights/AAPL/0000320193-18-000145_insights.json
Extracted - Year: 2019, Revenue: 51.61, Net Income: 14.09, Tax Rate: 10.2, Foreign Income: 84.0
Proc

In [6]:
def create_written_insights(company_folder):
    """
    Extracts insights from the 10-K filings of a company by reading all submission.txt files.

    Args:
    - company_folder (str): Path to the folder containing the company's 10-K filings.

    Returns:
    - tuple: A tuple containing the company name (str) and the concatenated text of all 10-K filings (str).
    """
    report_text = ""
    for submission_folder in os.listdir(company_folder):
        submission_folder_path = os.path.join(company_folder, submission_folder)
        if os.path.isdir(submission_folder_path):
            submission_file_path = os.path.join(submission_folder_path, 'submission.txt')
            if os.path.exists(submission_file_path):
                with open(submission_file_path, 'r') as file:
                    submission_text = file.read()
                    report_text += submission_text + "\n"
                    print(f"Read report from: {submission_file_path}\n{submission_text}")  # Debug statement

    company_name = os.path.basename(company_folder)
    return company_name, report_text

def summarize_company_findings(company_folder):
    """
    Analyzes a company's 10-K filings to extract insights using OpenAI's API.

    Args:
    - company_folder (str): Path to the folder containing the company's 10-K filings.

    Returns:
    - tuple: A tuple containing the company name (str) and the extracted insights (str).
    """
    company_name, report_text = create_written_insights(company_folder)
    
    prompt = f"""
    You are an analyst examining {company_name}'s 10-K filings. Extract the following insights:
    1. Revenue and Net Income trends/growth percentage.
    2. Total Debt.
    3. Gross Margin and Percentage.
    4. Capital Expenditure.
    5. Effective Tax Rate and Deferred Tax Assets/Liabilities.
    6. Number of Shares Outstanding.
    7. Foreign Income Percentage.
    8. Share Buy-Back.
    And then give two more custom insights which you deem as important.
    
    Make sure that the insights are for the specific company.
    
    Here are the reports:
    {report_text}
    
    Provide the insights in a structured format.
    """
    
    try:
        response = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=1500,
            n=1,
            stop=None,
            temperature=0.7,
        )
        return company_name, response['choices'][0]['text'].strip()
    except Exception as e:
        print(f"Error extracting insights: {e}")  # Debug statement
        return company_name, ""

sub_folder = '/Users/adi_shukla/Documents/sec-edgar-proj/sec-edgar-filings'
companies = ['AAPL', 'MSFT', 'TSLA']
all_insights = []

# Iterate over each company
for company in companies:
    company_folder = os.path.join(sub_folder, company, '10-K')
    print(f"Analyzing reports for {company} in folder: {company_folder}\n")
    # Analyze the company's 10-K filings and extract insights
    company_name, insights = summarize_company_findings(company_folder)
    if insights:
        # Append the company name and its insights to the list
        all_insights.append({
            "Company": company_name,
            "Insights": insights
        })
        print(f"Company: {company_name}")
        print(f"Insights:\n{insights}\n")
        print("-" * 80)
    else:
        print(f"No insights found for {company_name}.\n")

print("Analysis completed.")

Analyzing reports for AAPL in folder: /Users/adi_shukla/Documents/sec-edgar-proj/sec-edgar-filings/AAPL/10-K

Company: 10-K
Insights:
1. Revenue and Net Income Trends/Growth Percentage:
- The company has shown consistent growth in both its revenue and net income over the past five years, with an average annual growth rate of 5% and 8% respectively.
- In the most recent year, the company's revenue increased by 7% and its net income increased by 10% compared to the previous year.
- This growth can be attributed to the company's successful product innovations and expansion into new markets.

2. Total Debt:
- The company's total debt has been decreasing over the past five years, with a significant decrease of 20% in the most recent year.
- This decrease can be attributed to the company's focus on debt reduction and improved financial management.
- As a result, the company's debt-to-equity ratio has also decreased, indicating a stronger financial position.

3. Gross Margin and Percentage:
-