In [None]:
# Change the working directory to the data folder so the relative '*.docx' glob used later finds the files.
# NOTE(review): hard-coded absolute path into one user's home directory — adjust before running elsewhere.
cd "/home/kesj/work/nov2014hackday/data"

In [None]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile


"""
Module that extracts text from an MS Word XML document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

# Clark-notation namespace prefix shared by the WordprocessingML element tags below.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'  # tag of a paragraph element
TEXT = WORD_NAMESPACE + 't'  # tag of a text element


def get_docx_text(path):
    """
    Extract the plain text of a .docx file.

    Reads ``word/document.xml`` from the archive at ``path``, concatenates
    the text nodes found inside each paragraph element and joins the
    paragraphs that contain text with a blank line between them.
    Non-ASCII characters are dropped.

    Parameters
    ----------
    path : str or file-like object
        Anything accepted by ``zipfile.ZipFile``.

    Returns
    -------
    ASCII-only text (a text string, not bytes) with paragraphs separated by
    a blank line; an empty string when no paragraph contains text.

    Raises
    ------
    KeyError
        If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        # encode() drops non-ASCII characters; decode() turns the result back
        # into text so that ''.join works on both Python 2 and Python 3.
        texts = [node.text.encode('ascii', 'ignore').decode('ascii')
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)


In [None]:
def lexical_diversity(text):
    """
    Return the average number of occurrences per distinct item of ``text``.

    Computed as ``len(text) / len(set(text))``.  ``text`` is any sized
    collection of hashable items, e.g. a list of tokens or a string.
    An empty ``text`` yields ``0.0`` instead of raising ZeroDivisionError.
    """
    if len(text) == 0:
        return 0.0
    return float(len(text)) / len(set(text))

from nltk import FreqDist

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import glob

In [None]:
# Load every .docx file in the working directory and collect its non-empty lines.
all_lines = []

# Sorted so the order of all_lines does not depend on directory listing order.
fnames = sorted(glob.glob('*.docx'))
# A parenthesised single-argument print runs on both Python 2 and Python 3.
print("Going to process {0} files".format(len(fnames)))
for f in fnames:
    txt = get_docx_text(f)
    # Paragraphs come back separated by blank lines; keep only lines with content.
    lines = [line for line in txt.splitlines() if line]
    all_lines.extend(lines)

len(all_lines)


In [None]:
# Show the first ten extracted lines.
all_lines[:10]

In [None]:
from nltk.stem import SnowballStemmer
from nltk.tokenize.punkt import PunktWordTokenizer

# Shared word tokenizer and English Snowball stemmer used by tokenize() and by later cells.
punkt_tokenizer = PunktWordTokenizer()
stemmer = SnowballStemmer('english')


def tokenize(x):
    """Tokenize ``x`` with ``punkt_tokenizer`` and return the list of stemmed tokens."""
    stems = []
    for token in punkt_tokenizer.tokenize(x):
        stems.append(stemmer.stem(token))
    return stems

# Earlier experiment, left disabled:
#bow_matrix = CountVectorizer(tokenizer=tokenize).fit_transform(texts)
#normalized = TfidfTransformer().fit_transform(bow_matrix)

# Alias handed to the ``tokenizer=`` argument of the TfidfVectorizer cells.
tokenizer = tokenize

from sklearn.feature_extraction.text import _check_stop_list
# Copy the stop-word collection returned for 'english' into a plain set.
# NOTE(review): _check_stop_list is a private (underscore-prefixed) scikit-learn helper — it may change between releases.
stoplist = set(_check_stop_list('english'))

In [None]:
from collections import Counter
from itertools import chain

documents = all_lines

# Characters kept during normalisation: lowercase ASCII letters, space and '|'.
# Built once instead of once per character.
allowed_chars = set('qwertyuiopasdfghjklz xcvbnm|')

# Lower-case each document, drop every character outside allowed_chars, split on
# whitespace, discard stop words, then stem what remains.
texts = [[stemmer.stem(word)
          for word in ''.join(ch for ch in document.lower() if ch in allowed_chars).split()
          if word not in stoplist]
         for document in documents]

# Flatten into a single token list.  (sum(texts) without a list start value
# raises TypeError because it begins from the integer 0.)
all_tokens = list(chain.from_iterable(texts))

# Remove rare words: tokens whose total count is below 1% of the number of
# documents.  Counter yields every count in one pass instead of calling
# list.count() once per distinct word.
token_counts = Counter(all_tokens)
min_count = 0.01 * len(texts)
tokens_once = set(word for word, count in token_counts.items() if count < min_count)

# Re-join the surviving stems; documents with five or fewer stems are skipped.
my_text = [' '.join(word for word in text if word not in tokens_once)
           for text in texts if len(text) > 5]


In [None]:
# Show the first ten cleaned documents.
my_text[:10]


In [None]:
# Shape of my_text as NumPy sees it, i.e. how many cleaned documents were kept.
np.shape(my_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer using the stemming tokenizer; max_df/min_df are given as fractions (0.9 and 0.01).
# NOTE(review): ngram_range starts at 0 — a 0-gram is presumably an empty token, so (1, 4) is probably what was meant. Confirm.
vectorizer = TfidfVectorizer(tokenizer = tokenizer,max_df=.9, min_df=.01, stop_words=stoplist,
                             strip_accents="ascii", ngram_range=(0,4))

In [None]:
# Fit the vectorizer on the cleaned documents (not the raw all_lines) and show the resulting matrix shape.
X_trans = vectorizer.fit_transform(my_text)#all_lines)
X_trans.shape

# cluster the bullets

In [None]:
from sklearn import cluster
import pandas as pd
def clusterBullets(X,nclusters=14,random_state=44,show_plot=True):
    """
    Fit a KMeans model on ``X`` and optionally plot the cluster sizes.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix to cluster; passed straight to ``KMeans.fit``.
    nclusters : int
        Number of clusters (``n_clusters`` of ``KMeans``).
    random_state : int
        Seed passed to ``KMeans``.
    show_plot : bool
        When true, draw a bar chart of the number of observations per cluster.

    Returns
    -------
    The fitted ``cluster.KMeans`` estimator.
    """
    clstr = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    # Fit on the argument.  Fitting the notebook-global X_trans here would
    # ignore whatever matrix the caller passed in.
    clstr.fit(X)
    if show_plot:
        # Imported locally: ``%matplotlib inline`` on its own does not put
        # xlabel/ylabel/show into the notebook namespace.
        import matplotlib.pyplot as plt
        obs_per_cluster = pd.Series(clstr.labels_).value_counts()
        ax = obs_per_cluster.plot(kind="bar",color='cadetblue')
        ax.set_ylabel('Number of Observations')
        ax.set_xlabel('Cluster ID')
        plt.show()
    return clstr

In [None]:
# Run clusterBullets on X_trans with 16 clusters and keep the fitted model in cc.
cc = clusterBullets(X_trans,16)

# Determine what each group represents

In [None]:

from sklearn.ensemble import RandomForestClassifier
def plotGroupDetails(X,cluster,terms,nestimators=100,ntop=7,random_state=None):
    """
    For every cluster, plot the terms that best separate it from the rest.

    A one-vs-rest random forest is trained per cluster on the dense form of
    ``X``; the ``ntop`` features with the largest importances are drawn as a
    bar chart titled with the cluster id and its number of observations.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; must provide ``toarray()``.
    cluster : fitted clustering estimator
        Must expose ``n_clusters`` and ``labels_``.
    terms : sequence of str
        Feature names, one per column of ``X``.
    nestimators : int
        Number of trees in each random forest.
    ntop : int
        Number of highest-importance terms plotted per cluster.
    random_state : int or None
        Seed forwarded to ``RandomForestClassifier``; the default ``None``
        leaves the forests unseeded.
    """
    # Imported locally: bare xticks/title/show only exist in a pylab
    # namespace, which ``%matplotlib inline`` does not create.
    import matplotlib.pyplot as plt

    nclusters = cluster.n_clusters
    X_dense = X.toarray()
    y = np.asarray(cluster.labels_)
    terms = list(terms)
    for k in range(nclusters):
        # Boolean membership mask -> 0/1 target.  Using ``nestimators`` as the
        # positive-class marker would collide with real cluster ids whenever
        # nestimators < nclusters.
        in_cluster = (y == k)
        y_binary = in_cluster.astype(int)
        Forest = RandomForestClassifier(n_estimators=nestimators,
                                        random_state=random_state)
        Forest.fit(X_dense, y_binary)
        importances = np.asarray(Forest.feature_importances_)
        # Rank with argsort rather than Series.sort(), which current pandas
        # releases no longer provide.
        top = np.argsort(importances)[::-1][:ntop]
        temp = pd.Series(importances[top], index=[terms[i] for i in top])
        temp.plot(kind="bar", figsize=(8,3), fontsize=14, grid=False, alpha=0.7, linewidth=0.0)
        plt.xticks(rotation=50, ha="right")
        # Count from the mask so a cluster without members reports n = 0
        # instead of raising KeyError.
        plt.title("Group %d  (n = %d)" % (k, in_cluster.sum()), fontsize=20)
        plt.show()

In [None]:
# Feature names of the fitted vectorizer, used to label the importance plots for the clusters in cc.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn releases — confirm the installed version.
terms = vectorizer.get_feature_names() 
plotGroupDetails(X_trans,cc,terms)

In [None]:
# Same settings as the first vectorizer except for ngram_range, which here covers 2- through 10-grams.
vectorizerLong = TfidfVectorizer(tokenizer = tokenizer,max_df=.9, min_df=.01, stop_words=stoplist,
                             strip_accents="ascii", ngram_range=(2,10))

In [None]:
# Fit the 2-10-gram vectorizer on the cleaned documents and show the matrix shape.
X_transLong = vectorizerLong.fit_transform(my_text)
X_transLong.shape

In [None]:
# Display every feature name kept by vectorizerLong (the full list, not a sample).
vectorizerLong.get_feature_names()

In [None]:
# Re-run clusterBullets with X_transLong and 8 clusters; rebinds cc, replacing the earlier 16-cluster model.
cc = clusterBullets(X_transLong,8)


In [None]:
# Rebind terms to vectorizerLong's feature names and plot the per-cluster importances for X_transLong.
terms = vectorizerLong.get_feature_names() 
plotGroupDetails(X_transLong,cc,terms)

In [None]:
# Third variant of the vectorizer: identical settings, but ngram_range covers 3- through 10-grams.
vectorizerLongB = TfidfVectorizer(tokenizer = tokenizer,max_df=.9, min_df=.01, stop_words=stoplist,
                             strip_accents="ascii", ngram_range=(3,10))

In [None]:
# Fit the 3-10-gram vectorizer on the cleaned documents and show the matrix shape.
X_transLongB = vectorizerLongB.fit_transform(my_text)
X_transLongB.shape



In [None]:
# Display every feature name kept by vectorizerLongB (the full list, not a sample).
vectorizerLongB.get_feature_names()

In [None]:
# Add stop words
# Extends the 'english' stop list with extra terms, then rebuilds `vectorizer`
# without the custom stemming tokenizer used by the earlier vectorizers.
more_stop_words = """agent dimont fatalities insured s ph phs rd started claim noticed 
                     n e w south north west east insd insureds went got stated said know 
                     t policy nan ni took policyholder unsure states like""".split()
stop_words = _check_stop_list('english') | set(more_stop_words)
stop_words = set(stop_words)
# Take "fire" back out of the stop list; remove() raises KeyError if it is not present.
stop_words.remove("fire")
# min_df is the integer 50 here, whereas the earlier vectorizers passed the fraction 0.01.
# NOTE(review): ngram_range starts at 0 again — confirm that (1, 2) was intended.
vectorizer = TfidfVectorizer(max_df=.9, min_df=50, stop_words=stop_words,
                             strip_accents="ascii", ngram_range=(0,2))
# NOTE(review): X is not assigned in any cell visible above (it only appears as a
# function parameter name), so this line raises NameError on a fresh kernel unless
# X is defined elsewhere. Confirm which corpus (my_text or all_lines) was intended.
# This also rebinds the earlier `vectorizer` and `X_trans`.
X_trans = vectorizer.fit_transform(X)
X_trans.shape