In [1]:
import calcbench as cb
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from scipy.spatial.distance import cosine
from IPython.core.display import display, HTML
import sklearn
import itertools
from tqdm import tqdm_notebook
from matplotlib import colors
import matplotlib.pyplot as plt

In [2]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))

In [3]:
def number_normalizer(tokens):
    """ Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.
    """

    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

In [4]:
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

In [9]:
tickers = """ABCB
ABTX
ACBI
AGM
AMTD
ANCX
AROW
ASB
ATAX
BAC
BANC
BANF
BANR
BBT
BCBP
BCML
BDGE
BFIN
BFST
BGCP
BHB
BHBK
BHLB
BKU
BMRC
BMTC
BNCL
BOCH
BOFI
BOH
BPFH
BPRN
BRKL
BSRR
BUSE
BWB
BXS
BY
C
CAC
CADE
CARO
CASH
CATC
CATY
CBSH
CBTX
CBU
CCBG
CCNE
CFFI
CFG
CFR
CHCO
CHFC
CIT
CIVB
CLBK
CMA
CNBKA
CNOB
COBZ
COLB
COWN
CPF
CSFL
CSTR
CTBI
CUBI
CVBF
CVCY
CZNC
DCOM
EBSB
EFSC
EGBN
EQBK
EQH
ESQ
ETFC
EVR
EWBC
FBC
FBIZ
FBK
FBMS
FBNC
FBNK
FCB
FCBC
FCF
FCNCA
FDEF
FFBC
FFIC
FFIN
FFNW
FFWM
FHB
FHN
FIBK
FISI
FITB
FLIC
FMAO
FMBH
FMBI
FMNB
FNB
FNLC
FRBK
FRC
FRME
FSB
FSBW
FULT
GABC
GBCI
GBNK
GCAP
GHL
GNBC
GNTY
GS
GSBC
GWB
HAFC
HBAN
HBCP
HBMD
HBNC
HFWA
HIFS
HLI
HMST
HOMB
HONE
HOPE
HTBI
HTBK
HTH
HTLF
HWC
IBCP
IBKC
IBKR
IBOC
IBTX
INBK
INDB
INTL
ISBC
ITG
JPM
KEY
KRNY
LBAI
LBC
LEVL
LION
LKFN
LOB
LPLA
LTS
LTXB
MBFI
MBIN
MBTF
MBWM
MC
MCB
MCBC
MOFG
MS
MSBI
MSL
MTB
MTG
NBHC
NBN
NBTB
NCBS
NCOM
NFBK
NMIH
NRIM
NWBI
NYCB
OBNK
OCFC
OCN
OLBK
ONB
OPB
OPY
ORIT
OSBC
OZK
PACW
PB
PBCT
PCSB
PEBO
PFBC
PFS
PFSI
PGC
PHH
PJC
PJT
PNC
PNFP
PPBI
PRK
PUB
PWOD
QCRH
RBB
RBCAA
RDN
RF
RILY
RJF
RNST
RVSB
SASR
SBBX
SBCF
SBNY
SBSI
SBT
SCHW
SF
SFBS
SFNC
SFST
SHBI
SIEB
SIVB
SMBC
SMBK
SMMF
SNFCA
SNV
SONA
SRCE
SSB
STBA
STBZ
STI
STL
STXB
SYBT
TBBK
TBK
TBNK
TCBI
TCBK
TCF
THFF
TMP
TOWN
TREE
TRMK
TRST
TSBK
TSC
UBNK
UBSH
UBSI
UCBI
UCFC
UMBF
UMPQ
USB
UVSP
VBTX
VIRT
VLY
VOYA
WABC
WAFD
WAL
WASH
WBS
WD
WFC
WNEB
WSBC
WSBF
WSFS
WTBA
WTFC
ZION
""".splitlines()

In [10]:
document_section = "Legal Proceedings"
first_year = 2008
end_year = 2018
diffs = pd.DataFrame(index=tickers, columns=range(end_year, first_year, -1))
for ticker in tqdm_notebook(tickers):
    ten_K_sections = (d for d in cb.document_search(company_identifiers=[ticker], 
                                                document_name=document_section, 
                                                all_history=True) if d['fiscal_period'] == 'Y')
    sorted_disclosures = sorted(ten_K_sections, key=lambda d: d['fiscal_year'])
    year_pairs = pairwise(sorted_disclosures)
    for last_year, this_year in year_pairs:
        text_last_year = BeautifulSoup(last_year.get_contents(), 'html.parser').text
        text_this_year = BeautifulSoup(this_year.get_contents(), 'html.parser').text
        vectorizer = NumberNormalizingVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text_this_year, text_last_year])
        distance = cosine(X[0].todense(), X[1].todense())
        diffs[this_year['fiscal_year']][ticker] = distance

HBox(children=(IntProgress(value=0, max=294), HTML(value='')))

HTTPError: 404 Client Error: Missing blob text for 216615 for url: https://www.calcbench.com/query/disclosureBySECLink?blobid=216615_section130&secid=216615

In [7]:
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    # from https://stackoverflow.com/questions/38931566/pandas-style-background-gradient-both-rows-and-columns
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

def highlight_largest_diffs(diffs):
    filled_df = diffs.loc[diffs.sum(axis=1).sort_values(ascending=False).index].fillna(0).round(3)
    return filled_df.style.apply(background_gradient, cmap='Reds', m=filled_df.min().min(), M=filled_df.max().max(), low=0, high=2.5)

## Hightlight Risk Factors with Greatest Change
### Brightest cells are those documents which changed the most vis-a-vis the previous period.

In [8]:
highlight_largest_diffs(diffs)

Unnamed: 0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009
XOM,0.327,0.547,0.243,0.449,0.312,0.339,0.46,0.426,0.0,0
CAT,0.0,0.0,0.0,0.886,0.285,0.416,0.659,0.329,0.11,0
PG,0.0,0.0,0.539,0.539,0.207,0.533,0.494,0.057,0.0,0
HD,0.0,0.35,0.148,0.165,0.184,0.656,0.393,0.111,0.201,0
DWDP,0.2,0.405,0.283,0.065,0.069,0.123,0.382,0.067,0.363,0
VZ,0.017,0.231,0.168,0.49,0.183,0.133,0.255,0.101,0.257,0
JNJ,0.0,0.0,0.0,0.0,0.326,0.194,0.028,0.535,0.591,0
CSCO,0.0,0.163,0.403,0.112,0.411,0.224,0.07,0.25,0.0,0
UTX,0.258,0.246,0.117,0.142,0.148,0.137,0.143,0.273,0.167,0
CVX,0.187,0.064,0.262,0.079,0.125,0.147,0.176,0.334,0.192,0


In [6]:
document_section = 'Risk Factors'
ticker = "JNJ"
year = 2015
previous_year = 2016
doc = next(cb.document_search(company_identifiers=[ticker], document_name=document_section, year=year)).get_contents()
previous_doc = next(cb.document_search(company_identifiers=[ticker], document_name=document_section, year=previous_year)).get_contents()
display(HTML(cb.html_diff(doc, previous_doc)))

## Review Changes
#### The .607 distance between JNJ's 2015 and 2016 risk factors indicates a substantial change.  We verify the change on Calcbench's [disclosure page](https://www.calcbench.com/query/footnotes?pg_classificationMethod=tickers&pg_tickers=JNJ&doc_searchingBy=footnoteType&doc_footnoteType=1110&doc_selectedDisclosure=b-648365_section111&pc_year=2016&pc_periodType=Annual&pc_useFiscalPeriod=false&pc_rangeOption=Single%20Period&pc_dateRange=%5Bobject%20Object%5D).
![Diff](https://dl.dropboxusercontent.com/s/vjd382gr4vvhvuh/diff.png?raw=1)