### Import necessary libraries

In [71]:
import ast
import json
import os

import pandas as pd
from sec_api import ExtractorApi
from tqdm.auto import tqdm

### Extract relevant sections from SEC filings

More information: https://www.investor.gov/introduction-investing/general-resources/news-alerts/alerts-bulletins/investor-bulletins/how-read

The sections from 10-K documents that are most relevant:
- 1: Business
- 1A: Risk Factors
- 2: Properties
- 3: Legal Proceedings
- 5: Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities
- 6: Selected Financial Data
- 7: Management’s Discussion and Analysis of Financial Condition and Results of Operations
- 7A: Quantitative and Qualitative Disclosures about Market Risk
- 8: Financial Statements and Supplementary Data
- 10: Directors, Executive Officers and Corporate Governance
- 11: Executive Compensation
- 12: Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters
- 13: Certain Relationships and Related Transactions, and Director Independence

In [13]:
def extractSECData(companies, sections_to_extract, secAPIKey=None):
    '''
    Extracts specific sections from the 10-K document of each given company.

    Which filing (and therefore which financial year) is used is determined
    solely by the URL supplied for each company.

    Parameters:
    - companies (dict): A dictionary of company tickers mapping to the SEC 10-K file URL.
    - sections_to_extract (list): A list of all sections to extract from the 10-K.
    - secAPIKey (str, optional): The API Key for the "sec_api" library. When omitted,
      the key is read from the SEC_API_KEY environment variable.

    Returns:
    - results (dict): A nested dictionary {company: {section: content}} holding whatever
      ExtractorApi.get_section returns for return_type='text'.

    Raises:
    - ValueError: If no API key is passed and SEC_API_KEY is not set.
    '''

    # Credentials must not be committed with the notebook, so the key comes from the
    # caller or from the environment instead of a hard-coded default.
    api_key = secAPIKey or os.environ.get('SEC_API_KEY')
    if not api_key:
        raise ValueError(
            'No sec_api key available: pass secAPIKey or set the SEC_API_KEY environment variable.'
        )

    extractorApi = ExtractorApi(api_key)
    results = {}
    for company_name, url_10k in companies.items():
        print(f'Extracting 10-K information for {company_name}...')
        results[company_name] = {}
        # One API request per section; tqdm shows progress for the current company.
        for section in tqdm(sections_to_extract, desc=f'Extracting all sections from {company_name}\'s 10-K'):
            results[company_name][section] = extractorApi.get_section(url_10k, section=section, return_type='text')
        print('---------')
    return results

In [14]:
# Ticker -> URL of the 10-K filing whose sections will be extracted.
companies = dict(
    NVDA='https://www.sec.gov/ixviewer/ix.html?doc=/Archives/edgar/data/0001045810/000104581024000029/nvda-20240128.htm',
    AMD='https://www.sec.gov/ixviewer/ix.html?doc=/Archives/edgar/data/0000002488/000000248824000012/amd-20231230.htm',
    META='https://www.sec.gov/ixviewer/ix.html?doc=/Archives/edgar/data/0001326801/000132680124000012/meta-20231231.htm',
    MSFT='https://www.sec.gov/ixviewer/ix.html?doc=/Archives/edgar/data/0000789019/000095017023035122/msft-20230630.htm',
)

# 10-K item identifiers, matching the list in the markdown cell above.
sections_to_extract = [
    '1', '1A', '2', '3', '5', '6', '7',
    '7A', '8', '10', '11', '12', '13',
]

# Nested mapping {ticker: {section: content}} consumed by the save step below.
results = extractSECData(companies, sections_to_extract)

Extracting 10-K information for NVDA...


Extracting all sections from NVDA's 10-K: 100%|██████████| 13/13 [00:02<00:00,  5.88it/s]


---------
Extracting 10-K information for AMD...


Extracting all sections from AMD's 10-K: 100%|██████████| 13/13 [00:03<00:00,  4.26it/s]


---------
Extracting 10-K information for META...


Extracting all sections from META's 10-K: 100%|██████████| 13/13 [00:02<00:00,  6.02it/s]


---------
Extracting 10-K information for MSFT...


Extracting all sections from MSFT's 10-K: 100%|██████████| 13/13 [00:02<00:00,  4.68it/s]

---------





In [22]:
def save_sec_files(output_path, results):
    '''
    Serialise the extracted results to a JSON file and report the destination.

    Parameters:
    - output_path (str): The file the JSON document is written to (overwritten if present).
    - results (dict): The JSON-serialisable data to store.
    '''
    with open(output_path, mode='w') as json_file:
        # indent=4 keeps the stored file human-readable.
        json.dump(results, json_file, indent=4)

    print(f'Results saved to {output_path}')

In [23]:
# Persist the extracted 10-K sections next to the notebook.
save_sec_files(output_path='sec_results.json', results=results)

Results saved to sec_results.json


### News Scraper

In [4]:
# One-time setup, kept commented out (presumably already run once on this machine):
# !git clone https://github.com/ranahaani/GNews.git
# !pip install -q -r GNews/requirements.txt
# Switch the working directory into the GNews checkout. Later cells use paths relative
# to it (e.g. '../GoogleNewsData'), so this cell must run before them.
%cd GNews/

/Users/Dell/Documents/BNYM Capstone/datacollection/GNews


In [29]:
# NOTE(review): imported here rather than in the top import cell, presumably because
# `gnews` is resolved from the GNews checkout entered by the previous cell; confirm.
from gnews import GNews
# Module-level client; extractNewsData reads it as a global.
google_news = GNews(
    language='en',
    country='US',
    max_results=50,
    )
# Date window applied to the client; presumably (year, month, day) tuples spanning
# calendar year 2023. TODO confirm the expected format against the GNews docs.
google_news.start_date = (2023, 1, 1)
google_news.end_date = (2023, 12, 31)

In [30]:
def extractNewsData(companies, directoryPath):
    '''
    Extract Google News articles about companies and store them as one CSV per company.

    Uses the module-level `google_news` client, so the language, country, result limit
    and date range are whatever that client was configured with.

    Parameters:
    - companies (list): A list of company tickers for which information is needed to be extracted.
    - directoryPath (str): The path to the directory in which news data will be stored.
      It is created if it does not exist yet.

    Output:
    - One file '{directoryPath}/{company}_news_data.csv' per company with the columns
      title, text, publisher, authors and published_date. Articles that cannot be
      fetched or are missing a required field are skipped with a printed message.
    '''
    columns = ['title', 'text', 'publisher', 'authors', 'published_date']

    if directoryPath:
        os.makedirs(directoryPath, exist_ok=True)

    for company in companies:
        records = []
        jsonResponse = google_news.get_news(company)
        for items in jsonResponse or []:
            try:
                article = google_news.get_full_article(items['url'])
                if article is None:
                    # Nothing could be downloaded for this URL.
                    continue
                # Build the complete row first: a failure on any field must drop the
                # whole article, never leave the columns with different lengths.
                record = {
                    'title': article.title,
                    'text': article.text,
                    'publisher': items['publisher'],
                    'authors': article.authors,
                    'published_date': article.publish_date,
                }
            except Exception as exc:
                # Best-effort scraping: report and move on, but let KeyboardInterrupt
                # and SystemExit propagate (a bare `except` would swallow them).
                print(f'Skipping article for {company}: {type(exc).__name__}: {exc}')
                continue
            records.append(record)

        save_path = f'{directoryPath}/{company}_news_data.csv'
        # Passing `columns` keeps the header stable even when no article was collected.
        df = pd.DataFrame(records, columns=columns)
        df.to_csv(save_path, index=False)

In [67]:
# Tickers to search on Google News; `companies` is rebound here to a plain list.
companies = [
    'NVDA',
    'AMD',
    'TSM',
    'META',
    'MSFT',
]
extractNewsData(companies, '../GoogleNewsData')

In [64]:
def _parse_literal(value):
    '''Return the Python literal encoded in `value`, or None if it is not a parseable string.'''
    if not isinstance(value, str):
        return None
    try:
        # literal_eval only accepts literals, so scraped text can never execute code.
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return None


def _first_author(value):
    '''Reduce a stringified author list to its first entry ('' when there is none).'''
    parsed = _parse_literal(value)
    if isinstance(parsed, (list, tuple)):
        return parsed[0] if parsed else ''
    # Not a list literal: keep an already-processed name, map NaN/None to ''.
    return value if isinstance(value, str) else ''


def _publisher_title(value):
    '''Reduce a stringified publisher dict to its 'title' entry ('' when absent).'''
    parsed = _parse_literal(value)
    if isinstance(parsed, dict):
        return parsed.get('title', '')
    # Not a dict literal: keep an already-processed title, map NaN/None to ''.
    return value if isinstance(value, str) else ''


def _date_only(value):
    '''Keep the part of a timestamp string before the first space ('' for non-strings).'''
    return value.split(' ')[0] if isinstance(value, str) and value else ''


def process_dataframe(df):
    '''
    Clean a news DataFrame as loaded from one of the saved news CSV files.

    Parameters:
    - df (pd.DataFrame): A frame with the columns text, authors, published_date and publisher.

    Returns:
    - (pd.DataFrame): A new frame in which rows without text are removed, 'authors' holds
      the first author, 'published_date' holds the date part only and 'publisher' holds
      the publisher title. Missing values in these three columns become ''.

    The function tolerates frames it has already processed (and NaN cells), so running
    it again on its own output returns the same values instead of raising.
    '''
    # Remove rows where 'text' is null; reset_index returns a new frame, so the
    # caller's DataFrame is left untouched.
    cleaned = df.dropna(subset=['text']).reset_index(drop=True)

    cleaned['authors'] = cleaned['authors'].apply(_first_author)
    cleaned['published_date'] = cleaned['published_date'].apply(_date_only)
    cleaned['publisher'] = cleaned['publisher'].apply(_publisher_title)

    return cleaned

In [68]:
# Post-process the CSV written for each ticker by the news extraction step.
for company in companies:
    path = f'../GoogleNewsData/{company}_news_data.csv'
    df = pd.read_csv(path)
    pp_df = process_dataframe(df)
    # NOTE(review): the processed frame is written back to the same path it was read
    # from, so the raw scraped columns are not recoverable once this cell has run.
    pp_df.to_csv(path, index=False)