In [1]:
import pickle

import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# Read the pickled corpus that is handed to the count vectorizer.
# NOTE: unpickling can execute code stored in the file, so only load a
# '../data/test_corpus' that comes from a trusted source.
with open('../data/test_corpus', 'rb') as fp:
    lemmatized_corpus = pickle.load(fp)

In [6]:
# Raw term counts over a vocabulary capped at 400 features: terms present in
# more than 95% of the documents or in fewer than 2 documents are dropped,
# together with scikit-learn's built-in English stop words.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=400,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(lemmatized_corpus)

In [9]:
# Fit a 14-topic LDA model on the term-count matrix with online learning;
# random_state is fixed so repeated runs give the same topics.
# NOTE(review): newer scikit-learn releases call `n_topics` `n_components`;
# confirm against the installed version before upgrading.
lda_model = LatentDirichletAllocation(
    n_topics=14,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=1,
).fit(tf)

# Build the pyLDAvis data for the fitted model (R=20) and render it inline.
to_disp = pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer, R=20)
pyLDAvis.display(to_disp)

In [10]:
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw documents into lists of lemmas.

    Each document is split into sentences, tokenized with
    ``wordpunct_tokenize``, part-of-speech tagged, optionally lower-cased and
    stripped, filtered against stopwords and punctuation, and finally
    lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    stopwords : collection of str or None
        Tokens to discard. ``None`` selects NLTK's English stopword list.
        An explicitly empty collection disables stopword filtering.
    punct : collection of str or None
        Characters regarded as punctuation; a token consisting only of these
        characters is discarded. ``None`` selects ``string.punctuation``.
    lower : bool
        Lower-case every token before filtering.
    strip : bool
        Strip surrounding whitespace, underscores and asterisks from tokens.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip

        # Test against None instead of relying on truthiness: with
        # ``stopwords or ...`` an intentionally empty collection was silently
        # replaced by the defaults.
        if stopwords is None:
            stopwords = set(sw.words('english'))
        if punct is None:
            punct = set(string.punctuation)

        self.stopwords = stopwords
        self.punct = punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into a space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemmas of ``document`` that survive filtering.

        Part-of-speech tagging runs on the tokens as they appear in the text;
        lower-casing and stripping are applied afterwards, token by token.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                if self.lower:
                    token = token.lower()
                if self.strip:
                    token = token.strip().strip('_').strip('*')

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue. all() is True for
                # an empty string, so tokens emptied by stripping are dropped
                # here as well.
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        The first character of the tag selects noun, verb, adverb or
        adjective; any other tag is treated as a noun.
        """
        wordnet_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, wordnet_pos)

In [38]:
# Read the pickled corpus that the NLTK preprocessing cells work on.
# NOTE(review): this is the same file an earlier cell loads as
# `lemmatized_corpus`; confirm which description of its contents is right.
# NOTE: unpickling can execute code stored in the file; trusted files only.
with open('../data/test_corpus', 'rb') as fp:
    raw_corpus = pickle.load(fp)

In [39]:
from unidecode import unidecode
# Clean every raw document before it reaches the NLTK preprocessor.
# Python 2 only: relies on the two-argument str.translate and on str.decode.
to_preproc = []
for doc in raw_corpus:
    # Delete punctuation characters, then digits.
    no_punct = doc.translate(None, string.punctuation)
    no_digits = no_punct.translate(None, string.digits)
    # Turn newlines into spaces and collapse whitespace runs to one space.
    single_spaced = ' '.join(no_digits.replace('\n', ' ').split())
    # Decode to unicode, transliterate with unidecode, decode the result again.
    ascii_text = unidecode(single_spaced.decode('utf-8'))
    to_preproc.append(ascii_text.decode('utf-8'))

In [82]:
# Run the cleaned documents through the NLTK preprocessor with its defaults;
# the result holds one list of lemmas per document.
preproc = NLTKPreprocessor()
NLTKtokenized = preproc.fit(to_preproc).transform(to_preproc)

In [49]:
# Number of tokens the preprocessor produced for the first document.
len(NLTKtokenized[0])

599

In [66]:
# Inspect the tokens of the first preprocessed document.
NLTKtokenized[0]

[u'form',
 u'state',
 u'colorado',
 u'de',
 u'et',
 u'oil',
 u'gas',
 u'conservation',
 u'commission',
 u'lincoln',
 u'street',
 u'suite',
 u'denver',
 u'colorado',
 u'phone',
 u'fax',
 u'document',
 u'number',
 u'well',
 u'abandonment',
 u'report',
 u'form',
 u'submit',
 u'intent',
 u'abandon',
 u'whenever',
 u'abandonment',
 u'plan',
 u'borehole',
 u'afterthe',
 u'abandonment',
 u'complete',
 u'form',
 u'shall',
 u'submit',
 u'subsequent',
 u'report',
 u'actual',
 u'work',
 u'complete',
 u'approved',
 u'intent',
 u'shall',
 u'valid',
 u'six',
 u'month',
 u'afterthe',
 u'approval',
 u'date',
 u'period',
 u'new',
 u'intent',
 u'require',
 u'attachment',
 u'require',
 u'intent',
 u'abandon',
 u'wellbore',
 u'diagram',
 u'current',
 u'configuration',
 u'propose',
 u'configuration',
 u'plug',
 u'set',
 u'subsequent',
 u'report',
 u'abandonment',
 u'shall',
 u'indicate',
 u'actual',
 u'work',
 u'complete',
 u'attachment',
 u'require',
 u'subsequent',
 u'report',
 u'wellbore',
 u'diagram',


In [59]:
# Glue the first document's tokens back into one space-separated string and
# report its length in characters.
NLTKsentence = ' '.join(NLTKtokenized[0])
len(NLTKsentence)

3829

In [60]:
# Show the first document's tokens joined into one space-separated string.
NLTKsentence

u'form state colorado de et oil gas conservation commission lincoln street suite denver colorado phone fax document number well abandonment report form submit intent abandon whenever abandonment plan borehole afterthe abandonment complete form shall submit subsequent report actual work complete approved intent shall valid six month afterthe approval date period new intent require attachment require intent abandon wellbore diagram current configuration propose configuration plug set subsequent report abandonment shall indicate actual work complete attachment require subsequent report wellbore diagram show plug set case remain hole job summary plug contractor use include wireline cement third party verification log may run abandonment date received ogcc operator number kerrmcgee oil gas onshore lp contact name cheryl light name operator phone address p box fax city denver state co zip email cheryllghtanadarkocom intent hour notice require name hickey mike tel email mikehickeystatecous co

In [67]:
from itertools import product
from gensim.parsing.preprocessing import STOPWORDS

# Every two-letter lowercase combination, 'aa' through 'zz'.
stop_two_letters = [first + second
                    for first in string.ascii_lowercase
                    for second in string.ascii_lowercase]

# Hand-picked tokens to drop in addition to the generic stop words.
stop_specific = [
    'wattenberg', 'yes', 'na', '----', '4n', 'n2', 'acre', "'s", 'pm', '--',
    'number', "''", 'ii', 'iii', 'um', 'mu', 'mm', 'mum', 'nwse', 'swne',
    'lease', 'rule', 'drilling', 'permit', 'application', 'form',
    'felfwl', 'fnlfsl', 'fnl', 'fsl', 'page', 'file', 'survey',
]

# gensim's STOPWORDS plus single letters, punctuation characters, digits,
# the hand-picked tokens and all two-letter combinations.
stoplist = STOPWORDS.union(
    string.ascii_lowercase,
    string.punctuation,
    string.digits,
    stop_specific,
    stop_two_letters,
)

In [55]:
# Keep only the first document's tokens that are absent from `stoplist`,
# then count what is left.
NLTKtokenized_sw = [
    token
    for token in NLTKtokenized[0]
    if token not in stoplist
]
len(NLTKtokenized_sw)

482

In [65]:
# Inspect the first document's tokens that survived the `stoplist` filter.
NLTKtokenized_sw

[u'state',
 u'colorado',
 u'oil',
 u'gas',
 u'conservation',
 u'commission',
 u'lincoln',
 u'street',
 u'suite',
 u'denver',
 u'colorado',
 u'phone',
 u'fax',
 u'document',
 u'abandonment',
 u'report',
 u'submit',
 u'intent',
 u'abandon',
 u'abandonment',
 u'plan',
 u'borehole',
 u'afterthe',
 u'abandonment',
 u'complete',
 u'shall',
 u'submit',
 u'subsequent',
 u'report',
 u'actual',
 u'work',
 u'complete',
 u'approved',
 u'intent',
 u'shall',
 u'valid',
 u'month',
 u'afterthe',
 u'approval',
 u'date',
 u'period',
 u'new',
 u'intent',
 u'require',
 u'attachment',
 u'require',
 u'intent',
 u'abandon',
 u'wellbore',
 u'diagram',
 u'current',
 u'configuration',
 u'propose',
 u'configuration',
 u'plug',
 u'set',
 u'subsequent',
 u'report',
 u'abandonment',
 u'shall',
 u'indicate',
 u'actual',
 u'work',
 u'complete',
 u'attachment',
 u'require',
 u'subsequent',
 u'report',
 u'wellbore',
 u'diagram',
 u'plug',
 u'set',
 u'case',
 u'remain',
 u'hole',
 u'job',
 u'summary',
 u'plug',
 u'contr

In [64]:
# Join the stoplist-filtered tokens into one space-separated string and
# report its length in characters.
NLTKsentence_sw = ' '.join(NLTKtokenized_sw)
len(NLTKsentence_sw)

3304

In [62]:
# Show the stoplist-filtered tokens joined into one string.
NLTKsentence_sw

u'state colorado oil gas conservation commission lincoln street suite denver colorado phone fax document abandonment report submit intent abandon abandonment plan borehole afterthe abandonment complete shall submit subsequent report actual work complete approved intent shall valid month afterthe approval date period new intent require attachment require intent abandon wellbore diagram current configuration propose configuration plug set subsequent report abandonment shall indicate actual work complete attachment require subsequent report wellbore diagram plug set case remain hole job summary plug contractor use include wireline cement party verification log run abandonment date received ogcc operator kerrmcgee oil gas onshore contact cheryl light operator phone address box fax city denver state zip email cheryllghtanadarkocom intent hour notice require hickey mike tel email mikehickeystatecous cogcc contact api john henry stolz unit location qterr section township range meridian county

In [68]:
# Every two-letter lowercase combination, 'aa' through 'zz'.
stop_two_letters = [first + second
                    for first in string.ascii_lowercase
                    for second in string.ascii_lowercase]

# Hand-picked tokens to drop.
stop_specific = [
    'wattenberg', 'yes', 'na', '----', '4n', 'n2', 'acre', "'s", 'pm', '--',
    'number', "''", 'ii', 'iii', 'um', 'mu', 'mm', 'mum', 'nwse', 'swne',
    'lease', 'rule', 'drilling', 'permit', 'application', 'form',
    'felfwl', 'fnlfsl', 'fnl', 'fsl', 'page', 'file', 'survey',
]

# Stop list built WITHOUT gensim's STOPWORDS: single letters, punctuation
# characters, digits, the hand-picked tokens and all two-letter combinations.
stoplist = set(string.ascii_lowercase).union(
    string.punctuation,
    string.digits,
    stop_specific,
    stop_two_letters,
)

In [69]:
# Filter the first document's tokens against the current `stoplist`,
# then count what is left.
NLTKtokenized_sw1 = [
    token
    for token in NLTKtokenized[0]
    if token not in stoplist
]
len(NLTKtokenized_sw1)

530

In [70]:
# Inspect the first document's tokens that survived the `stoplist` filter.
NLTKtokenized_sw1

[u'state',
 u'colorado',
 u'oil',
 u'gas',
 u'conservation',
 u'commission',
 u'lincoln',
 u'street',
 u'suite',
 u'denver',
 u'colorado',
 u'phone',
 u'fax',
 u'document',
 u'well',
 u'abandonment',
 u'report',
 u'submit',
 u'intent',
 u'abandon',
 u'whenever',
 u'abandonment',
 u'plan',
 u'borehole',
 u'afterthe',
 u'abandonment',
 u'complete',
 u'shall',
 u'submit',
 u'subsequent',
 u'report',
 u'actual',
 u'work',
 u'complete',
 u'approved',
 u'intent',
 u'shall',
 u'valid',
 u'six',
 u'month',
 u'afterthe',
 u'approval',
 u'date',
 u'period',
 u'new',
 u'intent',
 u'require',
 u'attachment',
 u'require',
 u'intent',
 u'abandon',
 u'wellbore',
 u'diagram',
 u'current',
 u'configuration',
 u'propose',
 u'configuration',
 u'plug',
 u'set',
 u'subsequent',
 u'report',
 u'abandonment',
 u'shall',
 u'indicate',
 u'actual',
 u'work',
 u'complete',
 u'attachment',
 u'require',
 u'subsequent',
 u'report',
 u'wellbore',
 u'diagram',
 u'show',
 u'plug',
 u'set',
 u'case',
 u'remain',
 u'hole

In [71]:
# Display the contents of the stop list.
stoplist

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "'s",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '--',
 '----',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '4n',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'aa',
 'ab',
 'ac',
 'acre',
 'ad',
 'ae',
 'af',
 'ag',
 'ah',
 'ai',
 'aj',
 'ak',
 'al',
 'am',
 'an',
 'ao',
 'ap',
 'application',
 'aq',
 'ar',
 'as',
 'at',
 'au',
 'av',
 'aw',
 'ax',
 'ay',
 'az',
 'b',
 'ba',
 'bb',
 'bc',
 'bd',
 'be',
 'bf',
 'bg',
 'bh',
 'bi',
 'bj',
 'bk',
 'bl',
 'bm',
 'bn',
 'bo',
 'bp',
 'bq',
 'br',
 'bs',
 'bt',
 'bu',
 'bv',
 'bw',
 'bx',
 'by',
 'bz',
 'c',
 'ca',
 'cb',
 'cc',
 'cd',
 'ce',
 'cf',
 'cg',
 'ch',
 'ci',
 'cj',
 'ck',
 'cl',
 'cm',
 'cn',
 'co',
 'cp',
 'cq',
 'cr',
 'cs',
 'ct',
 'cu',
 'cv',
 'cw',
 'cx',
 'cy',
 'cz',
 'd',
 'da',
 'db',
 'dc',
 'dd',
 'de',
 'df',
 'dg',
 'dh',
 'di',
 'dj',
 'dk',
 'dl',
 'dm',
 'dn',
 'do',
 'dp',
 'dq',
 'dr',
 'drilling',


In [75]:
# Size of the stopword collection held by the preprocessor instance.
len(preproc.stopwords)

153

In [77]:
# Display the stopwords held by the preprocessor instance.
preproc.stopwords

{u'a',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'ain',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u'aren',
 u'as',
 u'at',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u'can',
 u'couldn',
 u'd',
 u'did',
 u'didn',
 u'do',
 u'does',
 u'doesn',
 u'doing',
 u'don',
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u'hadn',
 u'has',
 u'hasn',
 u'have',
 u'haven',
 u'having',
 u'he',
 u'her',
 u'here',
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 u'i',
 u'if',
 u'in',
 u'into',
 u'is',
 u'isn',
 u'it',
 u'its',
 u'itself',
 u'just',
 u'll',
 u'm',
 u'ma',
 u'me',
 u'mightn',
 u'more',
 u'most',
 u'mustn',
 u'my',
 u'myself',
 u'needn',
 u'no',
 u'nor',
 u'not',
 u'now',
 u'o',
 u'of',
 u'off',
 u'on',
 u'once',
 u'only',
 u'or',
 u'other',
 u'our',
 u'ours',
 u'ourselves',
 u'out',
 u'over',
 u'own',
 u're',
 u's',
 u'same',
 u'shan',
 u'she',
 u'shoul

In [76]:
# Size of gensim's STOPWORDS collection, for comparison.
len(STOPWORDS)

337

In [78]:
# Display gensim's STOPWORDS.
STOPWORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

def identity(arg):
    """Return ``arg`` unchanged.

    Serves as the vectorizer's tokenizer: the documents reaching it are
    already lists of tokens, so no further splitting must happen. A named
    function is used rather than a lambda so the pipeline stays picklable.
    """
    return arg


# NLTKPreprocessor.transform yields one list of lemmas per document. The
# vectorizer's default tokenizer applies a regular expression to each
# document and therefore needs strings, so it is replaced by `identity`.
model = Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                   preprocessor=None,
                                   lowercase=False)),
])

TypeError: 'instancemethod' object has no attribute '__getitem__'