In [None]:
import pandas as pd
from sec_edgar_downloader import Downloader
from bs4 import BeautifulSoup
import re
import os
from anyascii import anyascii
from bertopic import BERTopic
from nltk.tokenize import sent_tokenize
import nltk
import numpy as np
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import json

### Get list of tickers from SEC

In [None]:
# Read the ticker mapping file and collect the 'ticker' field of every entry.
with open('company_tickers.json') as tickers_file:
    company_tickers = json.load(tickers_file)
all_tickers = [entry['ticker'] for entry in company_tickers.values()]
print(len(all_tickers))
all_tickers[:5]

In [None]:
# Shared downloader instance; get_10k_reports_ticker_timeframe calls dl.get(...) on it.
dl = Downloader()

In [None]:
# Identifiers iterated by the multi-ticker cell further down.
# The last entry looks like a CIK rather than a ticker symbol -- TODO confirm.
tickers = ['AAPL',"MSFT","V",'0000102909']
# Default single ticker.
ticker = 'AAPL'
# NOTE(review): this f-string is evaluated once, here, with ticker == 'AAPL';
# the resulting path does not follow later changes to `ticker`.
submissions_folder=f'/sec-edgar-filings/{ticker}/10-K/'
# pad_sentences repeats each delimiter (ngram_length - 1) times.
ngram_length = 3

In [None]:
def get_10k_reports_ticker_timeframe(ticker, start_date ,end_date,local=False):
    """Download 10-K filings for one ticker and return their plain text.

    Parameters
    ----------
    ticker : str
        Identifier passed to ``dl.get`` and used to locate the download folder.
    start_date, end_date : str
        Passed to ``dl.get`` as ``after`` / ``before``.
    local : bool, default False
        If True, each report is written to ``data/<yy>-<ticker>-report.txt`` and
        the returned frame has a ``filename`` column. If False, nothing is
        written and the frame has a ``text`` column holding the report text.

    Returns
    -------
    pandas.DataFrame
        One row per filing folder with columns ``ticker``, ``year`` and either
        ``filename`` or ``text``. Empty (but with those columns) when no filing
        folder is found.
    """
    dl.get("10-K", ticker, after=start_date, before=end_date)

    # Build the folder from the *argument*. The module-level submissions_folder is
    # formatted once with the global ticker, so relying on it made every call read
    # the same company's filings regardless of the ticker requested.
    # Assumes the downloader stores filings under
    # <cwd>/sec-edgar-filings/<ticker>/10-K/<folder>/ -- TODO confirm for CIK input.
    filings_dir = os.path.join(os.getcwd(), 'sec-edgar-filings', str(ticker), '10-K')
    # Sorted so the row order does not depend on the file system's listing order.
    folders = sorted(os.listdir(filings_dir)) if os.path.isdir(filings_dir) else []

    if local:
        # The reports are written below; make sure the target directory exists.
        os.makedirs('data', exist_ok=True)

    records = []
    for folder in folders:
        details_path = os.path.join(filings_dir, folder, 'filing-details.html')
        folder_parts = folder.split('-')
        # Skip stray entries (e.g. hidden files) that are not filing folders.
        if len(folder_parts) < 2 or not os.path.isfile(details_path):
            continue
        # Second dash-separated field of the folder name is used as a 2-digit year.
        year = folder_parts[1]

        with open(details_path, encoding='utf-8') as fp:
            # NOTE(review): no parser is named, so bs4 picks whichever is installed.
            soup = BeautifulSoup(fp)
        text = anyascii(soup.get_text(strip=True))

        # '20' prefix assumes filings from the 2000s.
        record = {'ticker': ticker, 'year': '20' + year}
        if local:
            filename = f'data/{year}-{ticker}-report.txt'
            with open(filename, 'w+') as f:
                f.write(text)
            record['filename'] = filename
        else:
            record['text'] = text
        records.append(record)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    columns = ['ticker', 'year', 'filename' if local else 'text']
    return pd.DataFrame(records, columns=columns)

In [None]:
# The function handles ONE ticker per call (the value is passed to dl.get and
# interpolated into file names), so pass the configured ticker rather than the
# whole all_tickers list.
reports = get_10k_reports_ticker_timeframe(ticker, '2015-01-01', '2021-01-01', True)

In [None]:
def get_risk_factors_fragments_from_file(filename, ticker ,year ,local=False):
    """Extract the text spans between 'Item 1A' and 'Item 1B' markers in a file.

    Parameters
    ----------
    filename : str
        Path of the text file to scan.
    ticker, year :
        Copied verbatim into every returned row.
    local : bool, default False
        If True, return the fragments as a DataFrame and write nothing.
        If False, write each fragment to ``<filename>_(start, end).txt`` and
        return None.

    Returns
    -------
    pandas.DataFrame or None
        With ``local=True``: one row per fragment with columns ``ticker``,
        ``year``, ``start_index``, ``end_index``, ``size`` and ``text``.
    """
    with open(filename) as f:
        text = f.read()

    pos_1a = [m.start() for m in re.finditer('Item 1A', text)]
    pos_1b = [m.start() for m in re.finditer('Item 1B', text)]

    # Walk both marker lists in parallel, pairing each start with the next end.
    fragments = {}
    index_pos1a = 0
    index_pos1b = 0
    while index_pos1a < len(pos_1a) and index_pos1b < len(pos_1b):
        pos1a = pos_1a[index_pos1a]
        pos1b = pos_1b[index_pos1b]
        if pos1a * 10 < pos1b:
            # Start marker lies far before this end marker; try the next start.
            # Presumably filters table-of-contents hits -- TODO confirm.
            index_pos1a += 1
            continue
        if pos1a > pos1b:
            # End marker precedes the start marker; try the next end.
            index_pos1b += 1
            continue
        fragments[(pos1a,pos1b)] = text[pos1a:pos1b]
        index_pos1a += 1
        index_pos1b += 1

    if not local:
        for span, fragment in fragments.items():
            t_filename = filename +'_'+ str(span)+'.txt'
            with open(t_filename,'w+') as out:
                out.write(fragment)
        return None

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    records = [
        {
            'ticker': ticker,
            'year': year,
            'start_index': start,
            'end_index': end,
            'size': len(fragment),
            'text': fragment,
        }
        for (start, end), fragment in fragments.items()
    ]
    return pd.DataFrame(
        records,
        columns=['ticker', 'year', 'start_index', 'end_index', 'size', 'text'],
    )

In [None]:
# Collect one fragment frame per report, then concatenate once.
# (DataFrame.append was removed in pandas 2.0.)
fragment_frames = [
    get_risk_factors_fragments_from_file(row['filename'], row['ticker'], row['year'], local=True)
    for _, row in reports.iterrows()
]
# pd.concat raises on an empty list, so fall back to an empty frame.
fragments_df = pd.concat(fragment_frames, ignore_index=True) if fragment_frames else pd.DataFrame()

In [None]:
# Same aggregation as above, repeated for every configured ticker.
# Loop names differ from the config-cell `ticker` / earlier `reports` so that
# re-running cells out of order does not silently change those globals.
fragment_frames = []
for current_ticker in tickers:
    ticker_reports = get_10k_reports_ticker_timeframe(current_ticker, '2015-01-01', '2021-01-01', True)
    for _, row in ticker_reports.iterrows():
        fragment_frames.append(
            get_risk_factors_fragments_from_file(row['filename'], row['ticker'], row['year'], local=True)
        )
# Concatenate once (DataFrame.append was removed in pandas 2.0); pd.concat
# raises on an empty list, so fall back to an empty frame.
fragments_df = pd.concat(fragment_frames, ignore_index=True) if fragment_frames else pd.DataFrame()

In [None]:
# Keep only fragments longer than 25 characters. The .copy() makes the result an
# independent frame, so the later `clean_text` column assignment does not act on
# a slice of the unfiltered frame (SettingWithCopyWarning).
fragments_df = fragments_df[fragments_df['size'] > 25].copy()
fragments_df

### Cleaning the data

In [None]:
# fragments_df.text = fragments_df.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# fragments_df.text = fragments_df.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
# fragments_df.text = fragments_df.apply(lambda row: " ".join(re.sub("[^a-zA-Z0-9]+", " ", row.text).split()), 1)

The data-cleaning code below is adapted from: https://github.com/etattershall/burst-detection/blob/master/Detecting%20Bursty%20Terms%20in%20Computer%20Science.ipynb

In [None]:
# Regex building blocks used by split_into_sentences.
alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# (a warning today, an error in a future Python). The pattern is unchanged.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
htmltags = '<[^>]+>'
htmlspecial = '&#?[xX]?[a-zA-Z0-9]{2,8};'

# Marker tokens inserted by pad_sentences.
start_delimiter = 'documentstart'
sent_delimiter = 'sentenceboundary'
end_delimiter = 'documentend'

delimiters = [start_delimiter, sent_delimiter, end_delimiter]

# Instantiate the WordNet lemmatiser (applied word by word in cleaning_pipeline)
wnl = WordNetLemmatizer()

# Create a tokeniser: the analyzer callable of a CountVectorizer, which
# cleaning_pipeline uses to split the padded text into tokens
count = CountVectorizer(strip_accents='ascii', min_df=1)
tokeniser = count.build_analyzer()

def normalise_acronymns(text):
    '''
    Remove the period that follows a stand-alone single letter, so that
    acronyms such as "U.S.A." become "USA".
    Adapted from the method found at https://stackoverflow.com/a/40197005

    A letter counts as stand-alone when it is not preceded by a word character.
    The character class is letters only: the previous "[A-Z, a-z]" also matched
    a comma or a space, which deleted genuine sentence-ending periods written
    as "...) . Next".
    '''
    return re.sub(r'(?<!\w)([A-Za-z])\.', r'\1', text)

def normalise_decimals(text):
    '''
    Replace every period that sits directly between two digits with POINT.

    Lookarounds are used so the digits themselves are not consumed by a match;
    otherwise chained numbers such as "1.2.3" keep their second period
    ("1POINT2.3") and are later split as if it ended a sentence.
    '''
    return re.sub(r'(?<=[0-9])\.(?=[0-9])', 'POINT', text)

def split_into_sentences(text):
    '''
    Sentence splitter adapted from https://stackoverflow.com/a/31505798

    Takes a string and returns a list of stripped sentence strings. Periods that
    belong to titles, initials, acronyms, company suffixes and web addresses are
    removed first; the remaining ".", "?" and "!" are treated as sentence ends.
    Pieces without two consecutive alphanumeric characters are dropped.

    The order of the substitutions matters and is kept as in the original.
    '''
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)

    # my addition
    # NOTE(review): the "<prd>" placeholders inserted above also match htmltags,
    # so they are replaced by a space here instead of being restored to ".".
    text = re.sub(htmltags, " ", text)
    text = re.sub(htmlspecial, " ", text)

    # str.replace is a no-op when the needle is absent, so no membership test
    # is needed before these replacements.
    text = text.replace("FactorsThe", "Factors The")
    text = text.replace("Ph.D.","PhD")

    # r"\s": the non-raw "\s" is an invalid escape sequence.
    text = re.sub(r"\s" + alphabets + "[.] "," \\1",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1\\2\\3",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1\\2",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1 \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1",text)
    text = re.sub(" " + alphabets + "[.]"," \\1",text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”","”.")
    text = text.replace(".\"","\".")
    text = text.replace("!\"","\"!")
    text = text.replace("?\"","\"?")

    for terminator in ".?!":
        text = text.replace(terminator, "<stop>")

    sentences = [s.strip() for s in text.split("<stop>")]

    # we require that there be two alphanumeric characters in a row
    return [s for s in sentences if re.search("[A-Za-z0-9][A-Za-z0-9]", s)]

def pad_sentences(sentences):
    '''
    Join a list of sentences into one delimiter-padded string.

    The result starts with the start delimiter, ends with the end delimiter and
    has the sentence delimiter between consecutive sentences. Each delimiter is
    repeated (ngram_length - 1) times.
    '''
    repeats = ngram_length - 1
    head = (start_delimiter + ' ') * repeats
    separator = ' ' + (sent_delimiter + ' ') * repeats
    tail = (' ' + end_delimiter) * repeats
    return head + separator.join(sentences) + tail
    
def cleaning_pipeline(row):
    '''
    Clean row['text'] and store the result in row['clean_text'].

    The text is split into cleaned, lower-cased sentences by
    cleaning_pipeline_sentences (previously duplicated here line for line),
    padded with delimiters, tokenised and lemmatised word by word.

    Takes a row with a 'text' entry (as passed by DataFrame.apply with axis=1)
    and returns the same row with 'clean_text' set to the space-joined lemmas.
    '''
    clean_sentences = cleaning_pipeline_sentences(row['text'])

    # pad sentences with delimiters
    text = pad_sentences(clean_sentences)

    # Lemmatise word by word
    row['clean_text'] = ' '.join(wnl.lemmatize(word) for word in tokeniser(text))
    return row

def cleaning_pipeline_sentences(text):
    '''
    Takes a string and returns a list of cleaned sentences: lower-cased, with
    hyphens and slashes turned into spaces and all other punctuation removed.
    No lemmatisation is done here.
    '''
    normalised = normalise_decimals(normalise_acronymns(text))

    def _clean(sentence):
        # Hyphens and slashes separate words, so they become spaces first
        spaced = re.sub(r'[-/]', ' ', sentence)
        # Everything else that is neither a word character nor whitespace goes
        return re.sub(r'[^\w\s]','',spaced).lower()

    return [_clean(sentence) for sentence in split_into_sentences(normalised)]

In [None]:
# Show the fragments before cleaning.
fragments_df

In [None]:
# assign() returns a new frame, so the (previously filtered) frame is not
# mutated in place; the empty column guarantees 'clean_text' exists before
# cleaning_pipeline fills it row by row.
fragments_df = fragments_df.assign(clean_text='').apply(cleaning_pipeline, axis=1)
fragments_df['clean_text'].values[0]

In [None]:
# Show the fragments again, now with the clean_text column.
fragments_df

Text summarization with TextRank, following: https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/

In [None]:
# Flat list of cleaned sentences across all fragments, in fragment order.
sentences = [
    sentence
    for fragment_text in fragments_df['text']
    for sentence in cleaning_pipeline_sentences(fragment_text)
]

In [None]:
# Peek at the first few cleaned sentences.
sentences[:5]

In [None]:
# Load the embedding file into a dict: first token of each line is the word,
# the remaining tokens are its float32 coefficients.
word_embeddings = {}
# `with` closes the file even if a line fails to parse.
with open('glove.6B/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        word_embeddings[word] = np.asarray(values[1:], dtype='float32')

In [None]:
# One 300-d vector per sentence: the sum of its word vectors divided by
# (word count + 0.001). Unknown words contribute a zero vector.
sentence_vectors = []
for sentence in sentences:
    tokens = sentence.split()
    if tokens:
        v = sum([word_embeddings.get(w, np.zeros((300,))) for w in tokens])/(len(tokens)+0.001)
    else:
        # Covers empty AND whitespace-only sentences. Testing len(sentence)
        # alone let a whitespace-only string through, where sum([]) yields the
        # scalar 0 and the later .reshape(1,300) fails.
        v = np.zeros((300,))
    sentence_vectors.append(v)

In [None]:
# Square all-zero matrix with one row and one column per sentence.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [None]:
# Pairwise cosine similarity between all sentence vectors in ONE call instead
# of one cosine_similarity call per (i, j) pair. The diagonal is zeroed to
# match the original `i != j` condition.
if sentence_vectors:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(sim_mat, 0.0)
else:
    # Nothing to compare; keep an empty matrix.
    sim_mat = np.zeros((0, 0))

In [None]:
# Build a graph from the similarity matrix and run PageRank on it.
# `scores` is indexed by sentence position in the ranking cell that follows.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
# (score, sentence) pairs, highest score first.
scored_sentences = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
ranked_sentences = sorted(scored_sentences, reverse=True)

In [None]:
# Take the (up to) 100 best-ranked sentences. Slicing instead of range(100)
# avoids an IndexError when fewer than 100 sentences exist.
ranked_sentences_list = [sentence for _, sentence in ranked_sentences[:100]]
set_ranked_sentences = set(ranked_sentences_list)
# Print each distinct sentence once, in rank order. dict.fromkeys de-duplicates
# while keeping first-seen order, whereas iterating the set printed them in an
# arbitrary order that changes between runs.
for sentence in dict.fromkeys(ranked_sentences_list):
    print(sentence)
    print()

### BERTopic

In [None]:
# One pandas Timestamp per fragment, built from its 'year' value.
dates = [pd.Timestamp(year_value) for year_value in fragments_df['year']]
dates

In [None]:
# Label for each fragment: ticker-year-start_index-end_index.
titles = [
    row['ticker'] + '-' + row['year'] + '-' + str(row['start_index']) + '-' + str(row['end_index'])
    for _, row in fragments_df.iterrows()
]
titles

In [None]:
# NOTE(review): no visible cell creates a 'summary' column, so on a fresh
# Restart & Run All `fragments_df.summary` raises AttributeError. Use 'summary'
# when it exists and otherwise fall back to the raw fragment text.
# TODO confirm which column the topic model is meant to see.
text_column = 'summary' if 'summary' in fragments_df.columns else 'text'
data = fragments_df[text_column].tolist()

In [None]:
# Topic model with verbose output enabled; fitted in the next cell.
topic_model = BERTopic(verbose=True)

In [None]:
# Fit the model on the documents; keeps the two values fit_transform returns.
# presumably `topics` holds one topic id per document -- see BERTopic docs.
topics,probs = topic_model.fit_transform(data)

In [None]:
# Summary table returned by the fitted model's get_topic_info().
freq = topic_model.get_topic_info()
freq

In [None]:
# Terms the analysis is interested in. NOTE(review): these were previously
# passed as the second positional argument of topics_over_time, which is not a
# keyword filter; they are kept here for reference only.
terms_of_interest = ['COVID','Pandemic','ARM','Lockdown','Supply','Mac','Windows']
# Pass the per-document topic assignments from fit_transform, by keyword so the
# call does not depend on the positional order of `topics` and `timestamps`.
topics_over_time = topic_model.topics_over_time(docs=data, topics=topics, timestamps=dates)
topics_over_time