In [1]:
import topycal
import os
import glob
import random

INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    futures = []
    if isinstance(num_threads, str):
        num_threads = int(num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for elem in iterator:
            futures.append(executor.submit(fn, elem))
    results = []
    for x in as_completed(futures):
        results.append(x.result())
    return results


In [3]:
def get_file_list(limit=500, shuffle=True):
    files = glob.glob("{}/afi*.txt".format(INSTR_PATH))
    if shuffle:
        random.shuffle(files)
    if limit:
        return files[0:limit]
    else:
        return files
    #data = myfile.read()
    
def read_file(fname):
    with open(fname, errors='replace') as fd:
        return fd.read()

In [4]:
file_list = get_file_list(limit=None)


In [5]:
import re
import os

def load_file(fname):       
    with open(fname, 'r') as myfile:
        contents = re.sub(r'[\t\n\r\x0b\x0c]',' ', myfile.read())
        return (os.path.basename(fname),re.sub("\s+",' ', contents))

def load_corpus(file_list):
    return {f[0]:f[1] for f in do_fn_on_iter(load_file, file_list)}    

In [6]:
corpus_dict = load_corpus(file_list)

In [7]:
corpus_list = list(corpus_dict.values())

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.90, min_df=5, stop_words='english',token_pattern = r'\b[a-zA-Z]{3,}\b')
dtm_tf = tf_vectorizer.fit_transform(corpus_list)

LDA, 20 topics

In [9]:
from sklearn.decomposition import LatentDirichletAllocation 

lda_model = LatentDirichletAllocation(n_components=20, max_iter=20)
lda_model.fit_transform(dtm_tf)



array([[  1.33511351e-05,   1.33511352e-05,   3.11962881e-01, ...,
          1.64455629e-01,   1.33511351e-05,   1.33511351e-05],
       [  1.75174302e-06,   1.75174302e-06,   3.33694840e-02, ...,
          1.75174303e-06,   1.69700233e-02,   1.75174302e-06],
       [  3.54358616e-05,   3.54358617e-05,   4.11749021e-01, ...,
          1.40909003e-01,   3.54358617e-05,   3.54358616e-05],
       ..., 
       [  3.64777126e-06,   9.99930692e-01,   3.64777127e-06, ...,
          3.64777125e-06,   3.64777126e-06,   3.64777127e-06],
       [  1.46149035e-02,   5.80620124e-02,   3.50680327e-06, ...,
          3.50680328e-06,   3.50680326e-06,   3.50680327e-06],
       [  8.34724560e-05,   8.34724557e-05,   3.54593832e-02, ...,
          8.34724559e-05,   8.34724559e-05,   8.34724561e-05]])

In [None]:
#from sklearn.manifold import TSNE
#tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=0.99, init='pca')

In [16]:
feature_names = tf_vectorizer.get_feature_names()

In [22]:
#print(feature_names)

In [10]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

Display LDA output

In [11]:
vis_data = pyLDAvis.sklearn.prepare(lda_model,dtm_tf,tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [12]:
pyLDAvis.display(vis_data)

In [21]:
from sklearn.decomposition import TruncatedSVD

In [25]:
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(dtm_tf)

In [24]:
dtm_tf.shape

(1141, 22329)

In [26]:
svd_tfidf.shape

(1141, 50)

In [27]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="tf-idf clustering of the news",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [29]:
import pandas as pd
tfidf_df = pd.DataFrame(svd_tfidf, columns=['x', 'y'])
#tfidf_df['description'] = data['description']
#tfidf_df['category'] = data['category']

ValueError: Shape of passed values is (50, 1141), indices imply (2, 1141)