In [14]:
import itertools

import calcbench as cb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tqdm
import tqdm.notebook
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from matplotlib import colors
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    """TF-IDF vectorizer whose tokens are post-processed by ``number_normalizer``."""

    def build_tokenizer(self):
        """Return a callable that tokenizes a document and normalizes the tokens.

        The parent class's tokenizer does the splitting; its output is passed
        through ``number_normalizer`` and collected into a list.
        """
        base_tokenizer = super().build_tokenizer()

        def tokenize_and_normalize(doc):
            return list(number_normalizer(base_tokenizer(doc)))

        return tokenize_and_normalize

In [3]:
def number_normalizer(tokens):
    """ Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A generator yielding "#NUMBER" for every token whose first character
        is a digit and the token itself otherwise.  Empty tokens are yielded
        unchanged.
    """
    # token[:1] is "" for an empty token (and "".isdigit() is False), so an
    # empty string passes through instead of raising IndexError as token[0] would.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)

In [4]:
def pairwise(iterable):
    """Pair each element with its successor: s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    current, following = itertools.tee(iterable, 2)
    # Advance the second copy by one element; the default makes this a no-op
    # when the input is empty.
    next(following, None)
    return zip(current, following)

In [18]:
# Look up the tickers for the DJIA index, then keep only the first two of them.
djia_tickers = cb.tickers(index='DJIA')
tickers = djia_tickers[:2]

In [20]:
document_section = "ManagementsDiscussionAndAnalysis" # See the full list @ https://www.calcbench.com/disclosure_list
first_year = 2008
end_year = 2019
# Rows are tickers, columns are fiscal years (newest first).  Each cell receives
# the cosine distance between that year's section and the previous available
# annual filing's section.  dtype=float keeps the frame numeric (NaN where no
# pair exists) instead of defaulting to object columns.
diffs = pd.DataFrame(index=tickers, columns=range(end_year, first_year - 1, -1), dtype=float)
for ticker in tqdm.notebook.tqdm(tickers):
    # Keep only the disclosures whose fiscal_period is 'Y'.
    ten_K_sections = (d for d in cb.document_search(company_identifiers=[ticker],
                                                    document_name=document_section,
                                                    all_history=True) if d['fiscal_period'] == 'Y')
    sorted_disclosures = sorted(ten_K_sections, key=lambda d: d['fiscal_year'])
    # Materialize the pairs so the inner progress bar knows its total.
    year_pairs = list(pairwise(sorted_disclosures))
    for last_year, this_year in tqdm.notebook.tqdm(year_pairs):
        fiscal_year = this_year['fiscal_year']
        if fiscal_year not in diffs.columns:
            # The search above is not restricted to first_year..end_year, so a
            # filing can fall outside the table; skip it before fetching its
            # contents rather than failing on a missing column.
            continue
        text_last_year = BeautifulSoup(last_year.get_contents(), 'html.parser').text
        text_this_year = BeautifulSoup(this_year.get_contents(), 'html.parser').text
        # A fresh vectorizer per pair: the vocabulary and IDF weights are fitted
        # on just these two documents.
        vectorizer = NumberNormalizingVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text_this_year, text_last_year])
        # scipy's cosine() expects 1-D vectors, so flatten each sparse row into
        # a plain 1-D ndarray (todense() would hand it a 2-D np.matrix).
        distance = cosine(X[0].toarray().ravel(), X[1].toarray().ravel())
        # One .loc assignment instead of chained indexing (diffs[col][row] = ...),
        # which may write to a temporary and leave diffs unchanged.
        diffs.loc[ticker, fiscal_year] = distance

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))





In [23]:
# Collect every disclosure the search returns for 'msft' in the current
# document_section (full history, no fiscal-period filter) into the list `d`.
d = [
    disclosure
    for disclosure in cb.document_search(
        company_identifiers=['msft'],
        document_name=document_section,
        all_history=True,
    )
]

In [21]:
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return one CSS ``background-color`` declaration per value in ``s``.

    Adapted from
    https://stackoverflow.com/questions/38931566/pandas-style-background-gradient-both-rows-and-columns

    Args:
        s: object exposing ``.values`` (the values to colour).
        m: lower bound of the colour scale before padding.
        M: upper bound of the colour scale before padding.
        cmap: name of the Matplotlib colormap to sample.
        low: fraction of ``M - m`` by which the scale is extended below ``m``.
        high: fraction of ``M - m`` by which the scale is extended above ``M``.

    Returns:
        A list of strings of the form ``'background-color: #rrggbb'``, one per
        value, in the same order as ``s.values``.
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
    colormap = plt.get_cmap(cmap)
    return ['background-color: %s' % colors.rgb2hex(rgba) for rgba in colormap(normed)]

def highlight_largest_diffs(diffs):
    """Return a styled view of ``diffs`` with larger values shaded darker red.

    Rows are ordered by their row sum, largest first.  Missing values are
    replaced by 0 and every value is rounded to three decimals before styling.
    The colour scale spans the global minimum and maximum of the resulting
    frame, with the upper end padded (``high=2.5``) so the largest values do
    not reach the darkest end of the colormap.

    Args:
        diffs: DataFrame of distances (convertible to float).

    Returns:
        The ``Styler`` produced by applying ``background_gradient`` to the
        filled, rounded frame.
    """
    # Convert explicitly: a frame created without data has object columns, and
    # relying on fillna() to downcast them implicitly leaves round() and the
    # colour normalisation operating on object values.
    numeric = diffs.astype(float)
    row_order = numeric.sum(axis=1).sort_values(ascending=False).index
    filled_df = numeric.loc[row_order].fillna(0).round(3)
    return filled_df.style.apply(
        background_gradient,
        cmap='Reds',
        m=filled_df.min().min(),
        M=filled_df.max().max(),
        low=0,
        high=2.5,
    )

## Highlight Risk Factors with Greatest Change
### Brightest cells are those documents which changed the most vis-a-vis the previous period.

In [22]:
# Last expression of the cell: the styled table returned here is the cell's output.
highlight_largest_diffs(diffs)

Unnamed: 0,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008
AXP,0.004,0.004,0.004,0.004,0.006,0.022,0.015,0.006,0.003,0.004,0.502,0.515
MMM,0.006,0.003,0.01,0.003,0.002,0.002,0.003,0.004,0.003,0.004,0.003,0.007


In [12]:
# Write relative to the working directory instead of a machine-specific
# absolute path, so the cell runs on any machine and no local user directory
# is embedded in the notebook.
diffs.to_excel("md_a.xlsx")

In [8]:
# NOTE(review): this cell rebinds the notebook-level names `document_section`
# and `ticker` that earlier cells also use; re-running those cells afterwards
# will pick up these values.
document_section = "Management's Discussion And Analysis"
ticker = "CVX"
year = 2012
previous_year = 2011


def first_section_contents(fiscal_year):
    """Return the contents of the first search hit for `ticker` / `document_section` in `fiscal_year`."""
    hits = cb.document_search(company_identifiers=[ticker], document_name=document_section, year=fiscal_year)
    return next(hits).get_contents()


doc = first_section_contents(year)
previous_doc = first_section_contents(previous_year)
# Render the diff of the two documents as HTML in the cell output.
display(HTML(cb.html_diff(doc, previous_doc)))

## Review Changes
#### The .607 distance between JNJ's 2015 and 2016 risk factors indicates a substantial change.  We verify the change on Calcbench's [disclosure page](https://www.calcbench.com/query/footnotes?pg_classificationMethod=tickers&pg_tickers=JNJ&doc_searchingBy=footnoteType&doc_footnoteType=1110&doc_selectedDisclosure=b-648365_section111&pc_year=2016&pc_periodType=Annual&pc_useFiscalPeriod=false&pc_rangeOption=Single%20Period&pc_dateRange=%5Bobject%20Object%5D).
![Diff](https://dl.dropboxusercontent.com/s/vjd382gr4vvhvuh/diff.png?raw=1)