In [None]:
# Substantially copied by andrew@calcbench.com
# from http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Lars Buitinck
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# License: BSD 3 clause

In [None]:
import calcbench as cb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import numpy as np
from  time import time
import sklearn
from sklearn import metrics

In [None]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    """TfidfVectorizer variant whose tokens are passed through number_normalizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its list of normalized tokens.

        The tokenizer built by the parent class is applied first; its output
        is then fed through ``number_normalizer`` and materialized as a list.
        """
        base_tokenizer = super().build_tokenizer()

        def tokenize_and_normalize(doc):
            return list(number_normalizer(base_tokenizer(doc)))

        return tokenize_and_normalize

In [None]:
def number_normalizer(tokens):
    """Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to normalize.

    Returns
    -------
    generator of str
        ``"#NUMBER"`` for every token whose first character is a digit, the
        token itself otherwise.  Empty tokens are passed through unchanged.
    """
    # Slice instead of indexing: token[:1] is "" for an empty token, so an
    # empty string is treated as non-numeric rather than raising IndexError.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)

In [None]:
# Four industries to compare, keyed by a readable name.  The number stored
# under the 'cik' key is what gets passed to cb.tickers(SIC_codes=...) when
# the ticker lists are built, i.e. it is used as an SIC code.
# NOTE(review): this comment used to read "The four biggest CIKs" -- confirm
# what "biggest" was meant to measure.
industries = {
    'pharmaceutical': {'cik': 2834},
    'REIT': {'cik': 6798},
    'software': {'cik': 7372},
    'oil': {'cik': 1311},
}

In [None]:
# Gather the tickers of every industry into one flat list and remember, per
# ticker, the industry code it was looked up under.  Despite its name,
# CIK_map stores the value held under 'cik' in `industries`, which is sent
# to cb.tickers as an SIC code.
all_tickers = []
CIK_map = {}
for industry_stuff in industries.values():
    sic_code = industry_stuff['cik']
    industry_tickers = cb.tickers(SIC_codes=[sic_code])
    all_tickers.extend(industry_tickers)
    # Every ticker returned for this lookup maps to the same code.
    CIK_map.update(dict.fromkeys(industry_tickers, sic_code))

In [None]:
# Search Calcbench for the 2016 'Risk Factors' document of every collected
# ticker and materialize the results so they can be iterated more than once.
risk_factor_search = cb.document_search(
    document_name='Risk Factors',
    year=2016,
    company_identifiers=all_tickers,
)
docs = list(risk_factor_search)

# Reduce each document to its plain text, in the same order as `docs`.
# The contents are parsed with BeautifulSoup's lxml parser (presumably they
# are HTML -- confirm against the Calcbench client).
doc_contents = []
for document in docs:
    markup = document.get_contents()
    doc_contents.append(BeautifulSoup(markup, 'lxml').text)

In [None]:
# Summarize the corpus: number of documents fetched and number of industries
# they were requested for.
print("%d documents" % len(docs))
print("%d categories" % len(industries))
print()

# Reference label for each document: the industry code recorded for its ticker.
labels = [CIK_map[d['ticker']] for d in docs]
# Cluster into as many groups as there are distinct labels actually present.
true_k = np.unique(labels).shape[0]

# Seed shared by every stochastic step below (k-means initialization and the
# silhouette sub-sampling) so that re-running the cell on the same documents
# reproduces the same clusters and scores.
RANDOM_SEED = 42

# #############################################################################
# Turn the documents into a sparse term-weight matrix

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
max_features = 1000 # From the original
use_idf = False
vectorizer = NumberNormalizingVectorizer(max_df=0.5,
                                         max_features=max_features,
                                         min_df=2,
                                         stop_words='english',
                                         use_idf=use_idf)

X = vectorizer.fit_transform(doc_contents)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

# #############################################################################
# Do the actual clustering


km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=True, random_state=RANDOM_SEED)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# #############################################################################
# Compare the clusters found with the industry labels

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette is computed on a random sample of at most 1000 documents;
# seeding the sampler keeps the reported value stable between runs.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000,
                                 random_state=RANDOM_SEED))

print()

# #############################################################################
# Show the highest-weighted terms of each cluster centroid

print("Top terms per cluster:")

# For every centroid, feature indices ordered from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [None]:
# For every document build a tuple of
#   (row index in X, norm of its offset from its own cluster's centroid,
#    the cluster label assigned to it).
doc_distances = []
for doc_index in range(X.shape[0]):
    assigned_cluster = km.labels_[doc_index]
    offset_from_centroid = X[doc_index] - km.cluster_centers_[assigned_cluster]
    doc_distances.append(
        (doc_index, np.linalg.norm(offset_from_centroid), assigned_cluster)
    )

In [None]:
# Display the documents ordered from farthest to nearest relative to their
# own cluster centroid; element 1 of each tuple is that distance.
sorted(doc_distances, key=lambda entry: entry[1], reverse=True)

In [None]:
# Per-feature offset between one document's row in X and one cluster centroid.
# NOTE(review): both indices are hard-coded, presumably read off the output of
# one particular run -- confirm that document 511 still belongs to cluster 2
# for the data and clustering currently loaded.
inspected_doc_index = 511
inspected_cluster = 2
distances = X[inspected_doc_index] - km.cluster_centers_[inspected_cluster]