# 10-K/Q Text Change Detection
### [The Code](https://github.com/calcbench/notebooks/blob/master/risk_factor_similarity_and_diffing-tf-idf.ipynb)

## Goal
Reduce the amount of time analysts spend reading 10-K/Qs by highlighting the sections which change the most between periods.

## Hypothesis
The [cosine distance](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.cosine.html) between [Term Frequency - Inverse Document Frequency (TF-IDF)](http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting) vectors of 10-K sections is a useful proxy for semantic change in 10-K sections across time.


## Procedure
1. Use the [Calcbench Python API Client](https://github.com/calcbench/python_api_client) to download document section contents from Calcbench

2. Tokenize the sections
3. Build TF-IDF matrices
4. Compute the cosine distance between each section and the same section from the previous filing/period
5. Render the matrix of distances with the largest distances highlighted.
6. Review large changes by "diffing" documents with distance above a certain threshold.


In [1]:
import itertools

import calcbench as cb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from matplotlib import colors
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    """TF-IDF vectorizer whose tokens are passed through ``number_normalizer``."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its list of normalized tokens.

        The parent class's tokenizer splits the document; its output is then
        run through ``number_normalizer`` and materialized as a list.
        """
        base_tokenizer = super(NumberNormalizingVectorizer, self).build_tokenizer()

        def tokenize_and_normalize(doc):
            return list(number_normalizer(base_tokenizer(doc)))

        return tokenize_and_normalize

In [3]:
def number_normalizer(tokens):
    """Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A lazy generator yielding ``"#NUMBER"`` for every token whose first
        character is a digit and the token itself otherwise.  Empty tokens
        are passed through unchanged.
    """
    # token[:1] instead of token[0]: slicing an empty string gives "" (whose
    # isdigit() is False) rather than raising IndexError.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)

In [4]:
def pairwise(iterable):
    """Yield overlapping neighbours: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Inputs with fewer than two items produce no pairs.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # Advance the second copy by one item so it runs one step ahead.
    next(trailing, None)
    return zip(leading, trailing)

In [6]:
# NOTE(review): _rig_for_testing is a private helper of the API client;
# presumably it redirects requests to the given test host - confirm before
# relying on the results below.
cb.api_client._rig_for_testing('calcbenchtest.cloudapp.net')

In [7]:
# First row of the 2017 auditor-name / filer-category data for the whole universe.
accession_data = cb.standardized_data(
    entire_universe=True,
    year=2017,
    metrics=['auditor_name', 'filer_category'],
).iloc[0]
# Index labels whose auditor name contains "KPMG" (case-insensitive, missing
# names treated as no match) and whose filer category is
# 'Large Accelerated Filer'.
kpmg_large_accelerated = accession_data.auditor_name.loc[
    accession_data.auditor_name.str.contains('KPMG', case=False).fillna(False)
    & (accession_data.filer_category == 'Large Accelerated Filer')
].index

In [8]:
# Document section to compare year over year, and the companies to cover.
document_section = "Management's Discussion And Analysis"
tickers = kpmg_large_accelerated
first_year = 2008
end_year = 2017
# Descending fiscal years. range() stops before first_year, so both the
# columns and the filing filter below run from end_year down to first_year + 1.
# NOTE(review): because first_year filings are filtered out, the earliest
# column never has a prior filing to compare against - confirm this is intended.
all_years = range(end_year, first_year, -1)
# One row per ticker, one column per fiscal year; cells stay NaN until filled.
diffs = pd.DataFrame(index=tickers, columns=all_years)
for ticker in tqdm_notebook(tickers):
    # Keep only full-year 10-K filings of this section inside the year window.
    ten_K_sections = (
        d
        for d in cb.document_search(company_identifiers=[ticker],
                                    document_name=document_section,
                                    all_history=True)
        if d['fiscal_period'] == 'Y'
        and d['fiscal_year'] in all_years
        and d['document_type'] == '10-K'
    )
    sorted_disclosures = sorted(ten_K_sections, key=lambda d: d['fiscal_year'])
    # Compare each filing with the one that precedes it in fiscal-year order.
    year_pairs = pairwise(sorted_disclosures)
    for last_year, this_year in year_pairs:
        text_last_year = BeautifulSoup(last_year.get_contents(), 'html.parser').text
        text_this_year = BeautifulSoup(this_year.get_contents(), 'html.parser').text
        # The vectorizer is fitted on just this pair of documents.
        vectorizer = NumberNormalizingVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text_this_year, text_last_year])
        # scipy's cosine() is documented for 1-D vectors, so flatten each
        # sparse row to a 1-D ndarray instead of passing a 2-D np.matrix
        # from .todense().
        distance = cosine(X[0].toarray().ravel(), X[1].toarray().ravel())
        # A single .loc assignment replaces the chained diffs[year][ticker],
        # which may write to a temporary copy and is a silent no-op under
        # pandas Copy-on-Write.
        diffs.loc[ticker, this_year['fiscal_year']] = distance

HBox(children=(IntProgress(value=0, max=420), HTML(value='')))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):





In [10]:
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return one CSS ``background-color`` declaration per value in ``s``.

    Unlike a per-column gradient, the colour scale is fixed by the caller
    through ``m`` and ``M``, so the same value gets the same colour wherever
    this function is applied.

    Args:
        s: Object exposing a ``.values`` array of numbers (presumably the
            pandas Series handed over by ``Styler.apply`` - confirm).
        m: Value mapped to the low end of the colour scale.
        M: Value mapped to the high end of the colour scale.
        cmap: Name of the Matplotlib colormap to use.
        low: Fraction of ``M - m`` by which the scale is extended below ``m``.
        high: Fraction of ``M - m`` by which the scale is extended above ``M``.

    Returns:
        A list of ``'background-color: #rrggbb'`` strings, one per value.
    """
    # from https://stackoverflow.com/questions/38931566/pandas-style-background-gradient-both-rows-and-columns
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap is used because plt.cm.get_cmap was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    colormap = plt.get_cmap(cmap)
    c = [colors.rgb2hex(x) for x in colormap(normed)]
    return ['background-color: %s' % color for color in c]

def highlight_largest_diffs(diffs):
    """Return a Styler for ``diffs`` with a shared red gradient as cell background.

    Rows are ordered by their row sum, largest first, and missing values are
    replaced by 0 before styling.  The colour scale spans the minimum and
    maximum of the whole filled table, extended upwards by 2.5 times that span.
    """
    row_totals = diffs.sum(axis=1)
    ranked_index = row_totals.sort_values(ascending=False).index
    filled_df = diffs.loc[ranked_index].fillna(0)
    # Table-wide extremes, so every column shares one colour scale.
    lowest = filled_df.min().min()
    highest = filled_df.max().max()
    return filled_df.style.apply(
        background_gradient,
        cmap='Reds',
        m=lowest,
        M=highest,
        low=0,
        high=2.5,
    )

In [11]:
# Display the year-over-year distances, rows with the largest total change first.
highlight_largest_diffs(diffs)

Unnamed: 0_level_0,2017,2016,2015,2014,2013,2012,2011,2010,2009
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CE,0.00416782,0.890281,0.0,0.0,0.0,0.0,0.899309,0.00589972,0
EMR,0.00543985,0.762232,0.0084045,0.120113,0.0707926,0.137097,0.0495006,0.0597947,0
ESE,0.00849642,0.00963403,0.00510631,0.00957066,0.0108589,0.384235,0.368059,0.0,0
RLI,0.00197691,0.00176903,0.00110681,0.0012436,0.560637,0.0627403,0.00508433,0.0,0
HMN,0.470414,0.00231983,0.00242369,0.00271039,0.00128432,0.00153935,0.0,0.0,0
CQP,0.0106561,0.0125933,0.126166,0.0117709,0.0251088,0.0372246,0.0,0.0,0
LNG,0.00695398,0.0111564,0.142014,0.0117331,0.0270915,0.0,0.0,0.0,0
AIR,0.00928359,0.00832658,0.0299143,0.0268738,0.0962007,0.00916955,0.0,0.0,0
LVLT,0.0768378,0.0395567,0.0106749,0.0152169,0.00683862,0.0102787,0.0143186,0.0,0
TWNK,0.00477476,0.166258,0.0,0.0,0.0,0.0,0.0,0.0,0
