In [None]:
# Substantailly copied by andrew@calcbench.com  
# from http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Lars Buitinck
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# License: BSD 3 clause

In [23]:
import calcbench as cb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

In [140]:
#http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [6]:
class NumberNormalizingVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))

In [8]:
def number_normalizer(tokens):
    """ Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.
    """

    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

In [13]:
#The four biggest CIKs.
industries = {
    'pharmaceutical' : {'cik' : 2834},               
    'REIT' : {'cik' : 6798},    
    'software' : {'cik' : 7372}, 
    'oil' : {'cik' : 1311}
}

In [19]:
all_tickers = []
CIK_map = {}
for industry, industry_stuff in industries.items():
    cik = industry_stuff['cik']
    tickers = cb.tickers(SIC_codes=[cik])
    all_tickers.extend(tickers)
    CIK_map.update({ticker : cik for ticker in tickers})

In [24]:
docs = list(cb.document_search(document_name='Risk Factors', year=2016, company_identifiers=all_tickers))
doc_contents = [BeautifulSoup(d.get_contents(), 'lxml').text for d in docs]

In [29]:
print("%d documents" % len(docs))
print("%d categories" % len(industries.keys()))
print()

labels = [CIK_map[d['ticker']] for d in docs]
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
max_features = 1000 # From the original
use_idf = False
vectorizer = NumberNormalizingVectorizer(max_df=0.5, max_features=max_features,
                                 min_df=2, stop_words='english',
                                 use_idf=use_idf)

X = vectorizer.fit_transform(doc_contents)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

# #############################################################################
# Do the actual clustering


km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,                
            verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

1094 documents
4 categories

Extracting features from the training dataset using a sparse vectorizer
done in 15.517489s
n_samples: 1094, n_features: 1000

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)
Initialization complete
Iteration  0, inertia 959.739
Iteration  1, inertia 530.678
Iteration  2, inertia 522.164
Iteration  3, inertia 521.471
Iteration  4, inertia 520.565
Iteration  5, inertia 504.685
Iteration  6, inertia 504.679
Converged at iteration 6: center shift 0.000000e+00 within tolerance 8.123865e-08
done in 1.847s

Homogeneity: 0.794
Completeness: 0.778
V-measure: 0.786
Adjusted Rand-Index: 0.757
Silhouette Coefficient: 0.318

Top terms per cluster:
Cluster 0: gas oil reserves drilling wells fracturing exploration hydraulic proved emissions
Cluster 1: clinical candidates trials fda patent drug patents manufac