In [1]:
import topycal
import os
import glob
import random

# Directory expected to hold the AFI publication text files (afi*.txt);
# resolved against the working directory at the time this cell runs.
INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    """Apply ``fn`` to every element of ``iterator`` using a thread pool.

    Args:
        fn: Callable taking a single element.
        iterator: Iterable of elements to process.
        num_threads: Maximum number of worker threads; a numeric string is
            accepted and converted with ``int``.

    Returns:
        list: The results of ``fn``, in the same order as the input elements.

    Raises:
        Exception: If any call to ``fn`` raised, the exception of the earliest
            failing element is re-raised once all submitted calls have finished.
    """
    if isinstance(num_threads, str):
        num_threads = int(num_threads)
    # Leaving the ``with`` block waits for every submitted call to finish.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(fn, elem) for elem in iterator]
    # Collect in submission order (not completion order) so the output is
    # deterministic and lines up with the input.
    return [future.result() for future in futures]


## Get a list of all the AFI pubs to analyze

In [3]:
def get_file_list(limit=500, shuffle=True):
    """Return paths of the AFI text files found under ``INSTR_PATH``.

    Args:
        limit: Maximum number of paths to return. Any falsy value
            (``None`` or ``0``) returns every match.
        shuffle: When true, randomize the order with ``random.shuffle``
            before ``limit`` is applied.

    Returns:
        list: Paths matching ``afi*.txt`` inside ``INSTR_PATH``.
    """
    # Escape the directory so glob metacharacters in the path (e.g. "[")
    # are taken literally; only the file-name part is a pattern.
    pattern = os.path.join(glob.escape(INSTR_PATH), "afi*.txt")
    # glob yields directory-listing order, which varies by platform and
    # filesystem; sorting makes shuffle=False stable and lets a seeded
    # shuffle be reproduced.
    files = sorted(glob.glob(pattern))
    if shuffle:
        random.shuffle(files)
    if limit:
        return files[:limit]
    return files
    
def read_file(fname):
    """Return the full text content of ``fname``.

    The file is opened in text mode with the default encoding; bytes that
    cannot be decoded are substituted (``errors='replace'``) instead of
    raising.
    """
    with open(fname, errors='replace') as handle:
        text = handle.read()
    return text

In [4]:
# limit=None disables truncation, so every afi*.txt file under INSTR_PATH is used.
# shuffle keeps its default (True), so the list order differs from run to run.
file_list = get_file_list(limit=None)


In [5]:
import re
import os

def load_file(fname):
    """Read ``fname`` and return ``(basename, text)`` with whitespace collapsed.

    Args:
        fname: Path of the text file to read.

    Returns:
        tuple: ``(os.path.basename(fname), text)`` where every run of
        whitespace in the file content has been replaced by a single space.
    """
    # errors='replace' keeps one badly encoded file from aborting the whole
    # corpus load with UnicodeDecodeError.
    with open(fname, 'r', errors='replace') as myfile:
        raw = myfile.read()
    # One pass is enough: \s already matches tab, newline, CR, VT and FF, so
    # replacing those characters with spaces first gives the same result.
    return (os.path.basename(fname), re.sub(r'\s+', ' ', raw))

def load_corpus(file_list):
    """Load every path in ``file_list`` concurrently into a dict.

    Returns:
        dict: File basename mapped to the text returned by ``load_file``
        for that file.
    """
    name_text_pairs = do_fn_on_iter(load_file, file_list)
    return dict(name_text_pairs)

## Load our corpus (all the AFI pubs) as both a list and a dict

In [6]:
# Maps each file's basename to its whitespace-normalized text (see load_file).
corpus_dict = load_corpus(file_list)

In [7]:
# Document texts only, in the dict's insertion order; this list is what the
# vectorizer is fitted on below.
corpus_list = list(corpus_dict.values())

## Text -> Features 

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts. Tokens are runs of three or more ASCII letters;
# English stop words are dropped, as are terms appearing in more than 90% of
# the documents (max_df) or in fewer than 5 documents (min_df).
tf_vectorizer = CountVectorizer(
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    stop_words='english',
    min_df=5,
    max_df=0.90,
)
# Document-term matrix built from the corpus texts.
dtm_tf = tf_vectorizer.fit_transform(corpus_list)

LDA, 20 topics

In [9]:
from sklearn.decomposition import LatentDirichletAllocation 

# Fix the seed: LDA starts from a random initialization, so without
# random_state the learned topics (and the topic numbers shown in the
# visualization further down) change on every run.
lda_model = LatentDirichletAllocation(n_components=20, max_iter=20, random_state=0)
# Fit the model; the displayed value is the document-topic matrix.
lda_model.fit_transform(dtm_tf)



array([[  1.51727261e-01,   6.68985834e-06,   6.68985836e-06, ...,
          3.55738224e-03,   6.68985830e-06,   6.68985827e-06],
       [  8.17260559e-06,   8.17260561e-06,   1.62833588e-02, ...,
          1.68891169e-01,   8.17260561e-06,   8.17260560e-06],
       [  7.03432769e-06,   7.03432771e-06,   2.24063682e-02, ...,
          7.03432763e-06,   3.02229924e-02,   7.03432772e-06],
       ..., 
       [  1.54559508e-05,   1.54559508e-05,   3.62457590e-02, ...,
          1.54559507e-05,   7.25804621e-01,   1.54559507e-05],
       [  2.95159390e-05,   2.95159390e-05,   2.95159391e-05, ...,
          2.95159389e-05,   2.95159391e-05,   2.95159392e-05],
       [  3.16455701e-05,   3.16455700e-05,   3.16455705e-05, ...,
          3.16455701e-05,   3.16455702e-05,   3.16455718e-05]])

In [11]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back for older installs.
# list(...) keeps feature_names a plain list in both cases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    feature_names = tf_vectorizer.get_feature_names()

## What do we mean by features? Words: this is a simple bag-of-words model, so each feature is one vocabulary word

In [29]:
# Print vocabulary entries 100-199 to eyeball what the tokenizer produced.
print(feature_names[100:200])

['abuses', 'abusing', 'abusive', 'abw', 'aca', 'acad', 'academia', 'academic', 'academically', 'academics', 'academies', 'academy', 'acars', 'acas', 'acat', 'acaws', 'acbrn', 'acbt', 'acc', 'acca', 'accelerate', 'accelerated', 'accelerates', 'accelerating', 'acceleration', 'accelerations', 'accent', 'accept', 'acceptability', 'acceptable', 'acceptance', 'accepted', 'accepting', 'accepts', 'acces', 'access', 'accessed', 'accesses', 'accessibility', 'accessible', 'accessing', 'accession', 'accessioned', 'accessioning', 'accessions', 'accessories', 'accessory', 'acci', 'accident', 'accidental', 'accidentally', 'accidents', 'acclimated', 'accom', 'accommodate', 'accommodated', 'accommodates', 'accommodating', 'accommodation', 'accommodations', 'accompanied', 'accompanies', 'accompany', 'accompanying', 'accomplish', 'accomplished', 'accomplishes', 'accomplishing', 'accomplishment', 'accomplishments', 'accor', 'accord', 'accordancewith', 'accorded', 'according', 'accordingly', 'accordingto',

In [13]:
import pyLDAvis
import pyLDAvis.sklearn
# Turn on pyLDAvis's notebook integration so visualizations render inline.
pyLDAvis.enable_notebook()

Display LDA output

In [14]:
# Build the pyLDAvis data from the fitted LDA model, the document-term matrix
# and the vectorizer (presumably used for the vocabulary).
vis_data = pyLDAvis.sklearn.prepare(lda_model,dtm_tf,tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


## We can visualize our topic model. Remember that these topic numbers are arbitrary and do not correspond to the AFI document series numbers

In [64]:
# Render the prepared topic-model visualization as this cell's output.
pyLDAvis.display(vis_data)

## We now have a sense that our feature extraction is working fairly well. We could go back and tweak a few things, such as adding lemmatization to potentially improve performance, but essentially we are on target.