In [59]:
import datetime
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('all')
from bs4 import BeautifulSoup
import re
import pysentiment2
import math
import os
from google.cloud import storage
YEARS = range(2008, 2024)
tables = pd.read_html("https://en.wikipedia.org/wiki/S%26P_100")
TICKERS = tables[2]['Symbol']

In [60]:
def read_10K(file_path, form):
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_10k = file.read()
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    document = {}

    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        #if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    matches = regex.finditer(document[form])
    matches = regex.finditer(document[form])
    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

    pos_dat.set_index('item', inplace=True)

    item_7_raw = document[form][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
    item_7_content = BeautifulSoup(item_7_raw, 'lxml')
    #print(item_1a_content.prettify()[0:1000])
    return item_7_content.get_text("\n\n")

In [61]:
def sentiment(text):
    s = SentimentIntensityAnalyzer()
    return s.polarity_scores(text)

In [62]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [63]:
def list_subdirectories(bucket_name, parent_directory):
    """Lists 'subdirectories' within a specified parent directory in the bucket."""
    storage_client = storage.Client()
    if not parent_directory.endswith('/'):
        parent_directory += '/'

    bucket = storage_client.bucket(bucket_name)

    iterator = bucket.list_blobs(prefix=parent_directory, delimiter='/')
    subdirectories = set()

    for page in iterator.pages:
        subdirectories.update(page.prefixes)

    return subdirectories

In [64]:
def file_dl(ticker, year, filing='10-K'):
    paths = []
    counter = 1
    year = str(year)
    year_new = year[2:4]
    path = f'sec-edgar-filings/{ticker}/{filing}/'
    if len(ticker) == 4:
        dirs = list_subdirectories('portfolio-rl-edgar', path)
        for i in dirs:
            if i[39:41] == str(year_new):
                #print(i)
                download_blob("portfolio-rl-edgar", i+"full-submission.txt", f"{ticker}-{year_new}-{filing}-{counter}.txt")
                counter += 1
                paths.append(f"{ticker}-{year_new}-{filing}-{counter-1}.txt")
    if len(ticker) == 3:
        dirs = list_subdirectories('portfolio-rl-edgar', path)
        for i in dirs:
            if i[38:40] == str(year_new):
                #print(i)
                download_blob("portfolio-rl-edgar", i+"full-submission.txt", f"{ticker}-{year_new}-{filing}-{counter}.txt")
                counter += 1
                paths.append(f"{ticker}-{year_new}-{filing}-{counter-1}.txt")
    if len(ticker) == 2:
        dirs = list_subdirectories('portfolio-rl-edgar', path)
        for i in dirs:
            if i[37:39] == str(year_new):
                #print(i)
                download_blob("portfolio-rl-edgar", i+"full-submission.txt", f"{ticker}-{year_new}-{filing}-{counter}.txt")
                counter += 1
                paths.append(f"{ticker}-{year_new}-{filing}-{counter-1}.txt")
    if len(ticker) == 1:
        dirs = list_subdirectories('portfolio-rl-edgar', path)
        for i in dirs:
            if i[36:38] == str(year_new):
                #print(i)
                download_blob("portfolio-rl-edgar", i+"full-submission.txt", f"{ticker}-{year_new}-{filing}-{counter}.txt")
                counter += 1
                paths.append(f"{ticker}-{year_new}-{filing}-{counter-1}.txt")
        
    #11th and 12th characters are the year, 39:41 or 38:40
    
    #return f"{ticker}-{year_new}-{filing}-{counter-1}.txt"
    return paths


In [65]:
def read_10Q(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
            raw_10k = file.read()
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    document = {}

    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        #if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|2|3|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|2|3|7A|7|8))')

    matches = regex.finditer(document[form])
    matches = regex.finditer(document[form])
    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

    pos_dat.set_index('item', inplace=True)

    #pos_dat

    item_3_raw = document[form][pos_dat['start'].loc['item3']:pos_dat['start'].loc['item1a']]
    item_3_content = BeautifulSoup(item_3_raw, 'lxml')
    #print(item_7_content.prettify()[0:1000])
    return item_3_content.get_text("\n\n")

In [71]:
frame = pd.DataFrame(columns=['Ticker', 'Form', 'Date', 'Positive', "Negative", "Neutral", "Score"])

for ticker in TICKERS:
    for year in YEARS:
        for form in ["10-K", "10-Q"]:
            if form == "10-K":
                arr = []
                arr.append(ticker)
                arr.append(form)
                path = file_dl(ticker, year, form)
                tpath = path[0]
                with open(f'{tpath}') as f:
                    for line in f:
                        if line.startswith('FILED AS OF DATE'):
                            new_line = re.sub(r'\s+', '', line.split(" ")[3])
                            arr.append(pd.to_datetime(new_line.split(":")[1]).date())
                text = read_10K(f"./{tpath}", form)
                s_out = sentiment(text)
                arr.append(s_out['pos'])
                arr.append(s_out['neg'])
                arr.append(s_out['neu'])
                #print("pos: ", s_out['pos'])
                #print("neg: ", s_out['neg'])
                #print("neu: ", s_out['neu'])
                try:
                    arr.append(math.tanh(math.log(s_out['pos']/s_out['neg'])/s_out['neu']))
                except:
                    arr.append(0.0)
                #print(arr)
                frame.loc[len(frame)]=(arr)
                
            else:
                path = file_dl(ticker, year, form)
                print(path)
                for i in path:
                    arr = []
                    arr.append(ticker)
                    arr.append(form)
                    with open(f'{i}') as f:
                        for line in f:
                            if line.startswith('FILED AS OF DATE'):
                                new_line = re.sub(r'\s+', '', line.split(" ")[3])
                                arr.append(pd.to_datetime(new_line.split(":")[1]).date())
                    text = read_10Q(f"./{i}")
                    s_out = sentiment(text)
                    arr.append(s_out['pos'])
                    arr.append(s_out['neg'])
                    arr.append(s_out['neu'])
                    #print("pos: ", s_out['pos'])
                    #print("neg: ", s_out['neg'])
                    #print("neu: ", s_out['neu'])
                    try:
                        arr.append(math.tanh(math.log(s_out['pos']/s_out['neg'])/s_out['neu']))
                    except:
                        arr.append(0.0)
                    #print(arr)
                    frame.loc[len(frame)]=(arr)
frame.head()


            

['AAPL-08-10-Q-1.txt', 'AAPL-08-10-Q-2.txt', 'AAPL-08-10-Q-3.txt']
['AAPL-09-10-Q-1.txt', 'AAPL-09-10-Q-2.txt', 'AAPL-09-10-Q-3.txt']
['AAPL-10-10-Q-1.txt', 'AAPL-10-10-Q-2.txt', 'AAPL-10-10-Q-3.txt']
['AAPL-11-10-Q-1.txt', 'AAPL-11-10-Q-2.txt', 'AAPL-11-10-Q-3.txt']
['AAPL-12-10-Q-1.txt', 'AAPL-12-10-Q-2.txt', 'AAPL-12-10-Q-3.txt']


KeyboardInterrupt: 

In [73]:
frame

Unnamed: 0,Ticker,Form,Date,Positive,Negative,Neutral,Score
0,AAPL,10-K,2008-11-05,0.107,0.059,0.834,0.613041
1,AAPL,10-Q,2008-05-01,0.079,0.108,0.813,-0.366699
2,AAPL,10-Q,2008-02-01,0.077,0.107,0.816,-0.382696
3,AAPL,10-Q,2008-07-23,0.079,0.111,0.81,-0.396808
4,AAPL,10-K,2009-10-27,0.124,0.063,0.813,0.68203
5,AAPL,10-Q,2009-07-22,0.0,0.0,0.0,0.0
6,AAPL,10-Q,2009-01-23,0.0,0.0,0.0,0.0
7,AAPL,10-Q,2009-04-23,0.0,0.0,0.0,0.0
8,AAPL,10-K,2010-10-27,0.125,0.063,0.813,0.687278
9,AAPL,10-Q,2010-01-25,0.0,0.0,0.0,0.0
