In [None]:
import json
import os
import re
import shutil

import pandas as pd
from anyascii import anyascii
from bs4 import BeautifulSoup
from pymongo import MongoClient
from sec_edgar_downloader import Downloader
from tqdm.notebook import tqdm


# Parameters

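- upl_text: int - the maximum allowed ratio between the position of 'Item 1B' and the position of 'Item 1A'. An 'Item 1A' occurrence is skipped when its position multiplied by `upl_text` is still smaller than the position of the next 'Item 1B' candidate.
- lrts: int - the lower bound on fragment size (in characters). Fragments smaller than this are considered irrelevant and are removed from the DB.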

(Default values were selected empirically)


In [None]:
# Empirically chosen thresholds (see the "Parameters" section)
upl_text = 10  # an 'Item 1A' match is skipped when its position * upl_text is still below the 'Item 1B' position
lrts = 50  # fragments whose stored size is below this value are deleted from the DB

# The sec_edgar_downloader library provides two versions of each document:
#   'latest' - the web-ready version, which retains the information better after parsing
#   'legacy' - the text-based version, which retains the document structure and readability better
targeted_document = 'latest'


In [None]:
# Open the connection to the local MongoDB server and select the
# database / collection in which the report fragments are stored
client = MongoClient(host='127.0.0.1', port=27017)
db = client['frtp']
collection = db['documents']


In [None]:
# Load every company ticker listed in the SEC EDGAR ticker file
with open('static/company_tickers.json') as j:
    company_tickers = json.load(j)
# Each entry of the mapping carries its symbol under the 'ticker' key
all_tickers = [entry['ticker'] for entry in company_tickers.values()]
print("Number of tickers available in SEC EDGAR:", len(all_tickers))
all_tickers[:5]


In [None]:
# Shared downloader instance; get_10k_reports_ticker_timeframe calls dl.get(...) on it
dl = Downloader()


In [None]:
def get_10k_reports_ticker_timeframe(ticker, start_date, end_date):
    """Download the 10-K filings of a ticker and dump the new ones as plain-text files.

    The filings are fetched with the notebook-level downloader ``dl`` and are
    expected under ``./sec-edgar-filings/<ticker>/10-K/<filing folder>/``.
    For every filing folder whose (ticker, year) pair is not yet present in
    ``collection``, the document selected by ``targeted_document`` is parsed
    with BeautifulSoup, transliterated to ASCII and written to
    ``data/<year>-<ticker>-report.txt``. The downloaded folders are removed
    once they have been processed.

    Args:
        ticker: Company ticker symbol.
        start_date: Passed to ``dl.get`` as ``after``.
        end_date: Passed to ``dl.get`` as ``before``.

    Returns:
        A ``(years, filenames)`` tuple of parallel lists: the year token taken
        from each processed filing folder name (presumably a two-digit year,
        judging by the "20{year}" message) and the text file written for it.
    """
    years = []
    filenames = []

    # Pick the document to parse according to the notebook-level setting.
    if targeted_document == 'latest':
        targeted_filename = 'filing-details.html'
    else:
        targeted_filename = 'full-submission.txt'

    ticker_dir = os.path.join(os.getcwd(), 'sec-edgar-filings', ticker)
    filings_dir = os.path.join(ticker_dir, '10-K')

    dl.get("10-K", ticker, after=start_date, before=end_date)
    try:
        for folder in os.listdir(filings_dir):
            folder_path = os.path.join(filings_dir, folder)
            # The year is the second dash-separated token of the folder name.
            year = folder.split('-')[1]
            filename = f'data/{year}-{ticker}-report.txt'

            # Skip filings whose fragments were already stored in a previous run.
            if collection.find_one({"year": str(year), "ticker": ticker}) is not None:
                print(f"The 10K report of {ticker} from 20{year} is already in the DB")
                shutil.rmtree(folder_path)
                continue

            try:
                # Open the document actually selected by `targeted_document`
                # (the HTML file used to be hard-coded here).
                with open(os.path.join(folder_path, targeted_filename), encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp)
            except FileNotFoundError:
                print(ticker, f"does not have 10K reports in {year}")
                shutil.rmtree(folder_path)
                continue
            except UnicodeDecodeError:
                print("File Error: "+filename)
                continue

            text = anyascii(soup.get_text(strip=True))

            # Make sure the output folder exists so that a missing 'data/'
            # directory is not mistaken for a missing report.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'w') as f:
                f.write(text)
            years.append(year)
            filenames.append(filename)

        # Everything has been processed: drop the whole download folder of the ticker.
        shutil.rmtree(ticker_dir)
    except FileNotFoundError:
        # Raised by os.listdir when nothing was downloaded for this ticker.
        print(ticker, " does not have 10K reports in the selected timeframe")
    return years, filenames


In [None]:
# Try the download step on a single ticker (TSLA, 2010-01-01 to 2015-01-01) and show what it returned
years, filenames = get_10k_reports_ticker_timeframe('TSLA', '2010-01-01', '2015-01-01')
years, filenames


In [None]:
def store_risk_factors_fragments_from_file(filename, ticker, year):
    """Extract the 'Item 1A' .. 'Item 1B' fragments of a report and store them in the DB.

    The text file is scanned for every occurrence of 'Item 1A' and 'Item 1B'.
    The two position lists are walked in parallel: an 'Item 1A' position is
    skipped while ``position * upl_text`` is still below the current 'Item 1B'
    position, and an 'Item 1B' position is skipped when it lies before the
    current 'Item 1A' position. Every remaining pair delimits one fragment.

    A fragment is inserted into ``collection`` unless a document with the same
    ticker, year and size already exists. The text file is deleted afterwards.

    Args:
        filename: Path of the plain-text report to read (removed at the end).
        ticker: Ticker stored with every fragment.
        year: Year stored with every fragment.

    Returns:
        The list of ids of the documents inserted by this call.
    """
    with open(filename) as report:
        text = report.read()

    starts = [match.start() for match in re.finditer('Item 1A', text)]
    ends = [match.start() for match in re.finditer('Item 1B', text)]

    # Pair up the start / end markers with a two-pointer walk.
    spans = []
    i = j = 0
    while i < len(starts) and j < len(ends):
        start, end = starts[i], ends[j]
        if start * upl_text < end:
            # This 'Item 1A' is too far ahead of the 'Item 1B' candidate: try the next one.
            i += 1
        elif start > end:
            # This 'Item 1B' precedes the 'Item 1A' candidate: try the next one.
            j += 1
        else:
            spans.append((start, end))
            i += 1
            j += 1

    ids_list = []
    for start, end in spans:
        fragment = text[start:end]
        size = len(fragment)
        # A fragment of the same size for this ticker / year counts as already stored.
        if collection.find_one({'ticker': ticker, 'year': year, 'size': size}):
            continue
        inserted = collection.insert_one({
            'ticker': ticker,
            'year': year,
            'start_index': start,
            'end_index': end,
            'size': size,
            'text': fragment
        })
        ids_list.append(inserted.inserted_id)

    os.remove(filename)
    return ids_list


In [None]:
import sys

# Redirect everything printed by the following cells to a log file.
# `orig_stdout` and `f` are kept at notebook level so that stdout can be
# restored and the log file closed once the gathering loop is done.
os.makedirs('logs', exist_ok=True)  # open() below fails if the folder is missing
orig_stdout = sys.stdout
f = open('logs/text_data_gathering.log', 'w')
sys.stdout = f

In [None]:
# Ticker at which the gathering loop resumes (inclusive); set to None to process every ticker
last_ticker_checked = 'NM-PG'

In [None]:
# Resume from the last checked ticker (inclusive), or start from the first one.
if last_ticker_checked is None:
    remaining_tickers = all_tickers
else:
    remaining_index = all_tickers.index(last_ticker_checked)
    remaining_tickers = all_tickers[remaining_index:]

for ticker in tqdm(remaining_tickers):
    years, filenames = get_10k_reports_ticker_timeframe(ticker, '2008-01-01', '2022-01-01')
    # Remove fragments too small to be relevant from the DB
    collection.delete_many({'size': {'$lt': lrts}})
    # Several filings of one year share the same output file. Deduplicate on the
    # filename while keeping every file paired with its own year: going through
    # set() alone would reorder `filenames` and break the alignment with `years`.
    year_by_filename = dict(zip(filenames, years))
    for filename, year in year_by_filename.items():
        store_risk_factors_fragments_from_file(filename, ticker, year)


In [None]:
# Point stdout back to the stream saved in `orig_stdout` and close the log file
sys.stdout = orig_stdout
f.close()
