## Notebook Goal:  
Use existing NLP and LDA methods to perform topic modeling on docket texts. Three hyperparameters are considered:
1. whether to remove organizations from the docket texts, so that organizations themselves do not become topics.
2. whether to remove names from the docket texts, so that names themselves do not become topics.
3. the number of topics: one of [2, 3, 5, 10]

Will then perform visualizations and model summary output on every permutation/iteration.

In [1]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#visualization libraries
import pyLDAvis
import pyLDAvis.gensim



In [3]:
import os
java_path = 'C:/Program Files/Java/jdk-10.0.1/bin/java.exe'
os.environ['JAVAHOME'] = java_path

import pandas as pd
import numpy as np
import codecs
import itertools as it
from bs4 import BeautifulSoup
import warnings
import pickle
from collections import Counter
import re

In [4]:
#import corpus/docket texts from html to pandas DataFrame
def grab_docket_train():
    """Load the docket-text table of every ``.html`` file in ``docket_texts/train/``.

    For each file the case ID is cut out of the ``<h3>`` markup (the text
    between ``'CASE #:'`` and ``'</h3>'``), the first HTML table containing a
    ``'Docket Text'`` cell is selected, its first row is promoted to the
    header, ``'#'`` is coerced to a number, ``'Date Filed'`` is parsed as a
    datetime, and a ``'Case ID'`` column is added.

    Returns:
        pandas.DataFrame: all docket tables stacked in file-listing order,
        with ``'Case ID'`` moved to the first column. The per-table row
        index is kept as-is (it is not reset).

    Raises:
        FileNotFoundError: if the folder contains no ``.html`` file.
        ValueError: if a file has no table containing ``'Docket Text'``.
    """
    from io import StringIO  # local import: lets read_html take the markup as a file-like object

    train_dir = 'docket_texts/train/'
    #get all .html files in the folder (all docket files are in .html)
    files = [os.path.join(train_dir, file)
             for file in os.listdir(train_dir)
             if file.endswith('.html')]
    if not files:
        raise FileNotFoundError('no .html docket files found in {}'.format(train_dir))

    frames = []
    for path in files: #gather all docket texts
        # context manager so the handle is closed even if parsing fails
        with codecs.open(path, 'r', 'utf-8') as handle:
            content = handle.read()

        #use beautiful soup to get the case ID
        soup = BeautifulSoup(content, 'lxml')
        case_id = str(soup.find_all('h3'))
        bookmark1 = case_id.find('CASE #:') + len('CASE #:')
        bookmark2 = case_id.find('</h3>')
        case_id = case_id[bookmark1:bookmark2]

        #use pandas to grab tables in the html files
        docket_tables = pd.read_html(StringIO(content))

        #the docket text table is usually the fourth table, but not always,
        #so pick the first table that actually contains a 'Docket Text' cell
        docket_table = None
        for table in docket_tables:
            if table.isin(['Docket Text']).sum().sum() != 0:
                docket_table = table
                break
        if docket_table is None:
            raise ValueError("no table containing 'Docket Text' found in {}".format(path))

        #promote the first row to the header; copy so the column assignments
        #below write to an independent frame instead of a slice
        new_header = docket_table.iloc[0]
        docket_table = docket_table.iloc[1:].copy()
        docket_table.columns = new_header

        docket_table['#'] = pd.to_numeric(docket_table['#'],
                                          downcast = 'signed', errors = 'coerce')
        docket_table['Date Filed'] = pd.to_datetime(docket_table['Date Filed'])
        docket_table['Case ID'] = case_id

        frames.append(docket_table)

    #a single concat instead of growing the frame file by file
    df_docket_texts = pd.concat(frames)

    #reorder a column: move the last one ('Case ID') to the front
    cols = list(df_docket_texts.columns)
    df_docket_texts = df_docket_texts[[cols[-1]] + cols[:-1]]

    print('current docket text table size/shape: {}'.format(df_docket_texts.shape))
    return df_docket_texts

### Pull from dir .html files

In [5]:
%%time
# Build the docket DataFrame, keep the raw docket texts as a list,
# and preview the first five entries.
df = grab_docket_train()
docket_original = list(df['Docket Text'])
for preview_idx in range(5):
    preview_text = docket_original[preview_idx]
    print('docket text {}'.format(preview_idx))
    print(preview_text, '\n')

  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)


current docket text table size/shape: (3244, 4)
docket text 0
COMPLAINT against Cardiogenics Holdings, Inc. filing fee $ 400, receipt number 0207-8445206 Was the Disclosure Statement on Civil Cover Sheet completed -YES,, filed by LG Capital Funding, LLC. (Steinmetz, Michael) (Additional attachment(s) added on 3/11/2016: # 1 Civil Cover Sheet, # 2 Proposed Summons) (Bowens, Priscilla). (Entered: 03/10/2016) 

docket text 1
Case assigned to Judge Ann M Donnelly and Magistrate Judge Vera M. Scanlon. Please download and review the Individual Practices of the assigned Judges, located on our website. Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such. (Bowens, Priscilla) (Entered: 03/11/2016) 

docket text 2
Summons Issued as to Cardiogenics Holdings, Inc.. (Bowens, Priscilla) (Entered: 03/11/2016) 

docket text 3
NOTICE - emailed attorney regarding missing second page of the civil cover sheet. (Bowens, Priscilla) (Entered: 03/11/2

### Use Stanford NER to identify Names and Entities

In [6]:
%%time
# NOTE(review): both paths are absolute and machine-specific; adjust them for
# the local Stanford NER install before running this cell.
path_to_model = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz'
path_to_jar = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\stanford-ner.jar'
tagger = StanfordNERTagger(path_to_model, path_to_jar = path_to_jar)

# One row per docket text:
#   [original text, ORGANIZATION tokens, PERSON tokens,
#    text with entities masked as -ORG-/-NAME-, text with entities dropped]
# NOTE(review): the tagger is called once per docket text and the recorded
# run took 1h 44min; batching with tagger.tag_sents may be faster, but confirm
# the labels stay the same before switching.
output = []
#length = 100
length = len(docket_original)
for raw_text in docket_original[:length]:
    org_tokens, name_tokens = [], []
    masked_tokens, stripped_tokens = [], []

    tagged_tokens = tagger.tag(nltk.tokenize.word_tokenize(raw_text))
    for token, entity in tagged_tokens:
        if entity == 'ORGANIZATION':
            org_tokens.append(token)
            masked_tokens.append('-ORG-')
        elif entity == 'PERSON':
            name_tokens.append(token)
            masked_tokens.append('-NAME-')
        else:
            # ordinary token: kept in both the masked and the stripped text
            masked_tokens.append(token)
            stripped_tokens.append(token)

    joined_parts = [' '.join(part) for part in
                    (org_tokens, name_tokens, masked_tokens, stripped_tokens)]
    output.append([raw_text] + joined_parts)
    

Wall time: 1h 44min 25s


In [7]:
# One row per docket text; the columns follow the order in which each row
# of `output` was assembled.
NER_df = pd.DataFrame(
    output,
    columns = [
        'Original Docket Text',
        'Organization Portion',
        'Name Portion',
        'Identifying Org and Name',
        'Stripped Org and Name',
    ],
)

### To re-build new_df, start here

In [8]:
new_df = NER_df.copy()

In [9]:
print(new_df.head())
docket_text_list = list(new_df['Stripped Org and Name'])

                                Original Docket Text  \
0  COMPLAINT against Cardiogenics Holdings, Inc. ...   
1  Case assigned to Judge Ann M Donnelly and Magi...   
2  Summons Issued as to Cardiogenics Holdings, In...   
3  NOTICE - emailed attorney regarding missing se...   
4  In accordance with Rule 73 of the Federal Rule...   

                              Organization Portion  \
0  Cardiogenics Holdings , Inc. LG Capital Funding   
1      Individual Practices of the assigned Judges   
2                            Cardiogenics Holdings   
3                                                    
4                                                    

                                      Name Portion  \
0             ( Steinmetz Michael Bowens Priscilla   
1  Ann M Donnelly Vera M. Scanlon Bowens Priscilla   
2                                 Bowens Priscilla   
3                                 Bowens Priscilla   
4                                 Bowens Priscilla   

             

In [11]:
def text_preprocess1(text):
    """Strip bracketed asides and punctuation noise from one docket text.

    Steps, in order:
      1. ``'(s)'`` is rewritten to ``'s'`` (``attachment(s)`` -> ``attachments``).
         This has to run first: once parentheses are stripped the pattern can
         never match again.
      2. Everything between an opening ``(`` / ``[`` and the next closing
         ``)`` / ``]`` is removed, brackets included.
      3. The remaining single-character clean-ups are applied in a fixed
         order: ``-``, ``*``, ``<`` and the backslash are deleted;
         ``(``, ``)``, ``/`` and ``&`` become a space; ``'s`` becomes ``s``.

    Args:
        text (str): raw docket text.

    Returns:
        str: the cleaned text. Case and whitespace runs are left untouched.
    """
    # plural marker first, otherwise the bracket removal below swallows it
    text = text.replace('(s)', 's')
    # raw string: avoids invalid-escape warnings for \( \[ \) \]
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Order matters for the two-character "'s" pattern, so keep the sequence
    # fixed. The former replace('', '') step was a no-op and is dropped.
    replacements = [
        ('-', ''),
        ('(', ' '),
        (')', ' '),
        ("'s", 's'),
        ('*', ''),
        ('<', ''),
        ('/', ' '),
        ('\\', ''),
        ('&', ' '),
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    return text

def text_preprocess2(text):
    """Return ``text`` with every period character removed."""
    pieces = text.split('.')
    return ''.join(pieces)

# Filled on the first call to remove_stop(); the original rebuilt the whole
# stop-word set from the NLTK corpus for every single word.
_ENGLISH_STOPWORDS = None

def remove_stop(sentence):
    """Drop English stop words from a whitespace-separated sentence.

    Args:
        sentence (str): text to filter. Matching is exact and case-sensitive
            against ``stopwords.words('english')``.

    Returns:
        str: the surviving words joined by single spaces (``''`` when nothing
        survives or the input is empty).
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return ' '.join(word for word in sentence.split()
                    if word not in _ENGLISH_STOPWORDS)

# Read the keyword file (no header row), name its column 'keywords',
# and keep the values as a plain Python list.
keywords = pd.read_csv('docket_texts/keywords.csv', header = None)
keywords.columns = ['keywords']
keyword_list = list(keywords['keywords'])

In [84]:
# Show one entry before and after cleaning. Each text goes through
# text_preprocess1, is lower-cased, then goes through text_preprocess2.
# NOTE: this rebinds docket_text_list, so re-running the cell cleans the
# already-cleaned texts again.
print(docket_text_list[1])
docket_text_list = [text_preprocess2(text_preprocess1(raw_text).lower())
                    for raw_text in docket_text_list]
print(docket_text_list[1])
print(len(docket_text_list))

case assigned to judge and magistrate judge . please download and review the , located on our website . attorneys are responsible for providing courtesy copies to judges where their individual practices require such .  
case assigned to judge and magistrate judge  please download and review the , located on our website  attorneys are responsible for providing courtesy copies to judges where their individual practices require such   
3244


In [13]:
class Splitter(object):
    """Split a text into sentences, then each sentence into word tokens."""

    def __init__(self):
        # sentence splitter loaded from the NLTK data files
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        # word-level tokenizer applied to each sentence
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """Return one token list per sentence of ``text``.

        Stop words are removed from each sentence (via ``remove_stop``)
        before it is tokenized.
        """
        tokenized_sentences = []
        for sentence in self.splitter.tokenize(text):
            without_stop_words = remove_stop(sentence)
            tokenized_sentences.append(self.tokenizer.tokenize(without_stop_words))
        return tokenized_sentences


class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized sentences and lemmatize each word using its tag."""

    def __init__(self):
        # Own the lemmatizer, so pos_tag() no longer depends on a
        # module-level `lemmatizer` that is only created in a later cell.
        self.lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(self,treebank_tag):
        """
        Map a Treebank POS tag to the matching WordNet POS constant.

        Tags starting with J, V, N, R map to ADJ, VERB, NOUN, ADV.
        Anything else (including an empty tag) falls back to NOUN.
        """
        by_first_letter = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        # noun is the fallback because it is also the lemmatizer's default pos
        return by_first_letter.get(treebank_tag[:1], wordnet.NOUN)

    def pos_tag(self,tokens):
        """
        Tag and lemmatize a list of tokenized sentences.

        Args:
            tokens: list of sentences, each a list of word strings.

        Returns:
            One list per sentence of ``(word, lemma, [pos_tag])`` tuples,
            e.g. ``('complaint', 'complaint', ['NN'])``.
        """
        # find the pos tag of each token: [('What', 'WP'), ('can', 'MD'), ...]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokens]

        # lemmatize each word with the wordnet pos derived from its tag
        return [
            [(word, self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)), [tag])
             for (word, tag) in tagged_sentence]
            for tagged_sentence in tagged_sentences
        ]

In [85]:
%%time
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

# For every docket text: split it into tokenized sentences, then POS-tag
# and lemmatize them. Result nesting: docket -> sentence -> (word, lemma, [tag]).
lemma_docket_text_list = [
    lemmatization_using_pos_tagger.pos_tag(splitter.split(docket_text))
    for docket_text in docket_text_list
]

Wall time: 1min 4s


In [86]:
print(len(lemma_docket_text_list)) #docket text document level
print(len(lemma_docket_text_list[0])) #docket text sentence level
print(len(lemma_docket_text_list[0][0])) #docket text word level
print(lemma_docket_text_list[0][0][0]) #docket text token level
print(lemma_docket_text_list[0][0][0][0]) #docket text tuple level

3244
1
35
('complaint', 'complaint', ['NN'])
complaint


In [87]:
#lets do a collection of what we have:
#maps each POS tag to the distinct lemmas seen with it, in first-seen order
collection = {}
for lemma_pos_token in lemma_docket_text_list:
    for sentence in lemma_pos_token:
        for token in sentence:
            # token is (word, lemma, [pos_tag])
            lemma, pos = token[1], token[2][0]
            # setdefault replaces the list(collection.keys()) scan per token
            lemmas_for_pos = collection.setdefault(pos, [])
            if lemma not in lemmas_for_pos:
                lemmas_for_pos.append(lemma)

In [88]:
pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in collection.items()])).to_csv('NLP_pos.csv', index = False)

In [18]:
%%time
# POS tags whose tokens are dropped entirely
remove_pos = ["``", "NNPS", "NNP", "CD", '#', '$', "''", ",", "0", ":"]
# Lemmas that are intentionally left out of the corpus.
# Fix: "matter", "agreement", "proceeding" were missing their commas, so Python
# concatenated them into the single bogus entry "matteragreementproceeding".
# All three words also appear elsewhere in the list, so filtering is unchanged.
remove_word = ["'s", "judge", "party", "defendant", "ex", "plantiff", "shall", "date", "b", "exhibit", "pennsylvania", "sign_judge",
               "Inc..", "inc..", "llc", "'", "[_]", "action", "clerk", "july", "kw", "regard", "sac", "attachment", "c.d", "cal", "case", "cd", "l.p.",
               "claim", "copy", "court", "direct", "form", "hereby", "magistrate", "p.c", "pl", "plaintiff", "regard", "sign", "time", "mr.",
               "docket", "follow", "set", "matter", "agreement", "proceeding", "cotton", "january", "february", "march", "april", "may", "june",
               "july", "august", "september", "october", "november", "december",
               "agreement", "v.", "place_vault", "modify", "fund", "associated", "provide", "material", "amount", "accordingly", "additional",
               "second", "esq", "transmission", "g.c.", "seal", "review", "honor", "submit", "counsel", "witness", "civ", "first", "ltd..", "enter",
               "stay", "forth", "matter", "whether", "class", "master", "information", "statement", "submission", "related", "see", "make", "paper",
               "brookfield", "designate", "remain", "reportertranscriber", "submit", "include", "mail", "fact", "refer", "take", "pursuant", "amount",
               "behalf", "I.p..", "must", "attorney",
               'abovecapitoned', 'attach', 'add', 'concern', 'chamber', 'close', 'district', 'damage', 'later',
               'relate', 'return', 'require', 'restriction', 'respect', 'ny', 'seek', 'write', 'expert', 'transcript',
               'day', 'h.o', 'damage', 'pre', 'proceeding', 'present', 'page', 'pending', 'p.m.', 'frcp', 'g.c.', 'record', 'r.']

# set copies for constant-time membership tests in the loop below
remove_pos_lookup = set(remove_pos)
remove_word_lookup = set(remove_word)

#rebuild corpus
docket_texts_output = [] #ultimate output after cleaning

for lemma_pos_token in lemma_docket_text_list:
    docket_text_output = []
    for sentence in lemma_pos_token:
        # token is (word, lemma, [pos_tag]); keep the lemma unless its
        # tag or the lemma itself is filtered out
        kept_lemmas = [token[1] for token in sentence
                       if token[2][0] not in remove_pos_lookup
                       and token[1] not in remove_word_lookup]
        docket_text_output.append(' '.join(kept_lemmas))
    docket_texts_output.append(docket_text_output)
print(docket_texts_output[:10])

[['complaint file fee receipt number disclosure civil cover sheet complete yes file .', 'civil cover sheet propose summons .'], ['assign .', 'please download locate website .', 'responsible courtesy individual practice .'], ['summons issue'], ['notice email miss civil cover sheet .'], ['accordance rule federal rule civil procedure local rule notify consent united state available conduct civil trial order entry final judgment .', 'notice blank consent fill file electronically wish consent .', 'also access link http www.uscourts.govuscourtsformsandfeesformsao085.pdf .', 'withhold consent without adverse substantive consequence .', 'file consent unless consent .'], ['open filing check quality control .', 'correction .'], ['notice appearance'], ['backend note .', 'document complaint file .', ''], ['ta letter .', 'document complaint file .', ''], ['c notice conversion .', 'document complaint file .', '']]
Wall time: 201 ms


In [102]:
new_df['Removed unnecessary POS & vocab'] = pd.Series(docket_texts_output)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab,DT Topics
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",[complaint file fee receipt number disclosure ...,"[Service of Process, Complaints]"
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,"[assign ., please download locate website ., r...",[]
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",[summons issue],[Service of Process]
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,[notice email miss civil cover sheet .],[Notices]
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,[accordance rule federal rule civil procedure ...,"[Motions, Notices]"


#### Decision tree to identify keywords and topics based on Chris' feedback

In [103]:
# Each column of the CSV is a topic and its cells are that topic's keywords.
# Cells are cast to str and lower-cased, so missing cells become the string
# 'nan'; those are filtered out when the per-topic lists are built.
manual_topics_df = pd.read_csv('mannual_topics.csv')
manual_topics_df = manual_topics_df.apply(lambda x: x.astype(str).str.lower())
manual_topics_dict = {
    topic: [keyword for keyword in topic_keywords if keyword != 'nan']
    for topic, topic_keywords in manual_topics_df.to_dict('list').items()
}

In [104]:
def mannual_topic_assignment(a_list):
    """Return the manual topics whose keywords occur in the given sentences.

    ``a_list`` is a list of sentence strings. A topic matches when at least
    one of its keywords in ``manual_topics_dict`` equals a whitespace-separated
    word of the joined sentences. Topics are returned in dictionary order.
    """
    words = set(' '.join(a_list).split())
    return [topic
            for topic, topic_keywords in manual_topics_dict.items()
            if words.intersection(topic_keywords)]

In [106]:
# Docket texts that receive at least one manual topic are blanked out (so
# they do not reach the phrase/LDA steps); the others keep their sentences
# and get an empty topic list.
docket_texts_output_DT = []
topics_DT = []

for text in docket_texts_output:
    topic = mannual_topic_assignment(text)
    topics_DT.append(topic)
    docket_texts_output_DT.append([] if topic else text)

In [107]:
print(topics_DT[:5])
print(docket_texts_output_DT[:5])

[['Service of Process', 'Complaints'], [], ['Service of Process'], ['Notices'], ['Motions', 'Notices']]
[[], ['assign .', 'please download locate website .', 'responsible courtesy individual practice .'], [], [], []]


In [108]:
new_df['DT Topics'] = pd.Series(topics_DT)
new_df['Removed unnecessary POS & vocab DT'] = pd.Series(docket_texts_output_DT)
print(new_df.head())

                                Original Docket Text  \
0  COMPLAINT against Cardiogenics Holdings, Inc. ...   
1  Case assigned to Judge Ann M Donnelly and Magi...   
2  Summons Issued as to Cardiogenics Holdings, In...   
3  NOTICE - emailed attorney regarding missing se...   
4  In accordance with Rule 73 of the Federal Rule...   

                              Organization Portion  \
0  Cardiogenics Holdings , Inc. LG Capital Funding   
1      Individual Practices of the assigned Judges   
2                            Cardiogenics Holdings   
3                                                    
4                                                    

                                      Name Portion  \
0             ( Steinmetz Michael Bowens Priscilla   
1  Ann M Donnelly Vera M. Scanlon Bowens Priscilla   
2                                 Bowens Priscilla   
3                                 Bowens Priscilla   
4                                 Bowens Priscilla   

             

In [110]:
#print some examples: for the first ten rows, show the assigned topics and the
#text before/after the manual-topic step (rows without a topic print only the index)
for i in range(10):
    print(i)
    dt_topics = new_df['DT Topics'].iloc[i]
    if dt_topics == []:
        continue
    print(dt_topics)
    print(new_df['Removed unnecessary POS & vocab'].iloc[i])
    print(new_df['Removed unnecessary POS & vocab DT'].iloc[i])

0
['Service of Process', 'Complaints']
['complaint file fee receipt number disclosure civil cover sheet complete yes file .', 'civil cover sheet propose summons .']
[]
1
2
['Service of Process']
['summons issue']
[]
3
['Notices']
['notice email miss civil cover sheet .']
[]
4
['Motions', 'Notices']
['accordance rule federal rule civil procedure local rule notify consent united state available conduct civil trial order entry final judgment .', 'notice blank consent fill file electronically wish consent .', 'also access link http www.uscourts.govuscourtsformsandfeesformsao085.pdf .', 'withhold consent without adverse substantive consequence .', 'file consent unless consent .']
[]
5
6
['Notices']
['notice appearance']
[]
7
['Complaints']
['backend note .', 'document complaint file .', '']
[]
8
['Complaints']
['ta letter .', 'document complaint file .', '']
[]
9
['Notices', 'Complaints']
['c notice conversion .', 'document complaint file .', '']
[]


In [111]:
unigram_sentences_filepath = 'docket_texts/train/DT/unigram_nltk_noorgnoname.txt'

In [113]:
%%time
# turn the lemmatized corpus into unigram sentences: one sentence per line,
# with the docket boundaries flattened away
with codecs.open(unigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for sentence in it.chain.from_iterable(docket_texts_output_DT):
        f.write(sentence + '\n')

Wall time: 0 ns


In [114]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [115]:
#let's do some comparision between the original text and unigram sentences, shouldn't be that different.
print('Original text:')
print(new_df['Removed unnecessary POS & vocab'].iloc[0])
print(new_df['Removed unnecessary POS & vocab DT'].iloc[0])
#print(df['Docket Text'].iloc[1])

print('\nUnigram_sentence:')
for unigram_sentence in it.islice(unigram_sentences, 0, 10):
    print(' '.join(unigram_sentence))
    print('')

Original text:
['complaint file fee receipt number disclosure civil cover sheet complete yes file .', 'civil cover sheet propose summons .']
[]

Unigram_sentence:
assign .

please download locate website .

responsible courtesy individual practice .

open filing check quality control .

correction .

propose schedule order

letter propose schedule order

order respond letter .

order .

order respond letter motion .



In [116]:
bigram_model_filepath = 'docket_texts/train/DT/bigram_model_noorgnoname' 

In [117]:
%%time

# Fit a gensim Phrases model on the unigram sentences and
# store our bigram model at bigram_model_filepath
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk if we don't want to run this again
#bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 15.6 ms


In [118]:
bigram_sentences_filepath = 'docket_texts/train/DT/bigram_sentences_noorgnoname.txt'

In [119]:
%%time

# apply the bigram model, and write it to file (one phrased sentence per line)
with codecs.open(bigram_sentences_filepath, 'w', encoding = 'utf_8') as out_file:
    for tokens in unigram_sentences:
        phrased_tokens = bigram_model[tokens]
        out_file.write(' '.join(phrased_tokens) + '\n')

Wall time: 69.1 ms




In [120]:
bigram_sentences = LineSentence(bigram_sentences_filepath)
print('unigram length = {}, bigram length = {}'.format(len(list(unigram_sentences)), len(list(bigram_sentences))))

unigram length = 1206, bigram length = 1206


In [121]:
#original v. unigram v. bigram. Some phrases should be combined already
start = 0
finish = 10
print('Original text:')
print(new_df['Removed unnecessary POS & vocab'].iloc[0])
print(new_df['Removed unnecessary POS & vocab'].iloc[1])

print('\nUnigram sentence:')
for unigram_sentence in it.islice(unigram_sentences, 0, 10):
    print(' '.join(unigram_sentence))
print('\nBigram sentence:')
for bigram_sentence in it.islice(bigram_sentences, start, finish):
    print(' '.join(bigram_sentence))

Original text:
['complaint file fee receipt number disclosure civil cover sheet complete yes file .', 'civil cover sheet propose summons .']
['assign .', 'please download locate website .', 'responsible courtesy individual practice .']

Unigram sentence:
assign .
please download locate website .
responsible courtesy individual practice .
open filing check quality control .
correction .
propose schedule order
letter propose schedule order
order respond letter .
order .
order respond letter motion .

Bigram sentence:
assign .
please_download locate_website .
responsible_courtesy individual_practice .
open_filing check_quality control .
correction .
propose schedule order
letter propose schedule order
order respond letter .
order .
order respond letter motion .


In [122]:
trigram_model_filepath = 'docket_texts/train/DT/trigram_model_nonamenoorg'

In [123]:
%%time

# again, using Phrases to attach more words to phrases already formed:
# this second model is fitted on the bigram sentences and saved to
# trigram_model_filepath
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

# load the finished model from disk
#trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 53 ms


In [124]:
trigram_sentences_filepath = 'docket_texts/train/DT/trigram_sentences_nonamenoorg.txt'

In [125]:
%%time

# apply the trigram model to every bigram sentence and write one result per line
with codecs.open(trigram_sentences_filepath, 'w', encoding = 'utf_8') as out_file:
    for tokens in bigram_sentences:
        phrased_tokens = trigram_model[tokens]
        out_file.write(' '.join(phrased_tokens) + '\n')

Wall time: 37.7 ms




In [126]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [127]:
# Side-by-side check: the first four cleaned docket texts, followed by the
# same slice of the unigram, bigram and trigram sentence files.
start = 0
finish = 15
print('Original text:')
for row in range(4):
    print(new_df['Removed unnecessary POS & vocab'].iloc[row],'\n')

ngram_views = (
    ('\nUNIGRAM Sentence:', unigram_sentences),
    ('\nBIGRAM Sentence:', bigram_sentences),
    ('\nTRIGRAM Sentence:', trigram_sentences),
)
for heading, sentences in ngram_views:
    print(heading)
    for tokens in it.islice(sentences, start, finish):
        print(' '.join(tokens))

Original text:
['complaint file fee receipt number disclosure civil cover sheet complete yes file .', 'civil cover sheet propose summons .'] 

['assign .', 'please download locate website .', 'responsible courtesy individual practice .'] 

['summons issue'] 

['notice email miss civil cover sheet .'] 


UNIGRAM Sentence:
assign .
please download locate website .
responsible courtesy individual practice .
open filing check quality control .
correction .
propose schedule order
letter propose schedule order
order respond letter .
order .
order respond letter motion .
order .
letter status
reassign .
longer assign .
please download locate website .

BIGRAM Sentence:
assign .
please_download locate_website .
responsible_courtesy individual_practice .
open_filing check_quality control .
correction .
propose schedule order
letter propose schedule order
order respond letter .
order .
order respond letter motion .
order .
letter status
reassign .
longer assign .
please_download locate_website .

In [128]:
def trigram_transform(texts):
    """Apply the bigram and trigram phrase models to each sentence of a docket.

    ``texts`` is a list of sentence strings. Every sentence is split on
    whitespace, passed through ``bigram_model`` and then ``trigram_model``,
    and any resulting token listed in the drop list below is removed. The
    surviving tokens of each sentence are re-joined with single spaces.

    Returns a list with one processed string per input sentence.
    """
    # phrases that are removed after phrase detection
    dropped_phrases = frozenset([
        'calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold', 'court_reportertranscriber_abovecaptioned_matter',
        'redaction_calendar_day', 'rule_statement', 'obtain_pacer', 'may_obtain_pacer', 'reportertranscriber_abovecaptioned_matter',
        'redact_transcript_deadline', 'send_chamber', "official_transcript_notice_give", "notice_intent_request", "proceed_hold",
        "fee_receipt_number", "civil_procedure", "pursuant_frcp", "official_transcript_conference",
        "purchase_reportertranscriber_deadline_release", "et_al", "mail_chamber", "transcript_restriction", "redaction_transcript",
        "transcript_view_public_terminal", "transcript_make_remotely", "associated_et_al", "electronically_available_public_without",
        "genesys_id", "release_transcript_restriction", "adar_bay", "redaction_request_due", "new_york", "official_transcript_conference",
        "transcript_make_remotely", "transcript_proceeding_conference_hold", "redaction_transcript",
        'affidavit_jr._c.p.a', 'corporate_parent', 'certain_underwriter', 'federal_rule_civil_procedure', 'redaction_request',
        'official_transcript', 'rule_disclosure', 'rule_corporate_disclosure', 'place_vault', 'public_without_redaction_calendar',
        'purchase_deadline_release_transcript', 'transcript_proceeding_hold', 'transcript_remotely_electronically_available',
    ])

    def _phrase_sentence(sentence):
        # unigrams -> bigram phrases -> trigram phrases -> drop unwanted phrases
        with_bigrams = bigram_model[sentence.split()]
        with_trigrams = trigram_model[with_bigrams]
        return ' '.join(phrase for phrase in with_trigrams
                        if phrase not in dropped_phrases)

    return [_phrase_sentence(sentence) for sentence in texts]

In [129]:
new_df['Apply Trigram Phrase Model'] = new_df['Removed unnecessary POS & vocab DT'].apply(trigram_transform)



In [131]:
new_df['Apply Trigram Phrase Model'].iloc[1]

['assign .',
 'please_download_locate_website .',
 'responsible_courtesy_individual_practice .']

In [132]:
#write trigram to file
trigram_dockets_filepath = 'docket_texts/train/DT/trigram_transformed_dockets_noorgnoname.txt'

In [133]:
# Write one line per docket text: its phrase-transformed sentences joined by
# spaces. Iterate over the column's values directly; the former
# new_df[...][i] lookup was label-based and only matched row order while
# new_df kept a default 0..n-1 index.
with codecs.open(trigram_dockets_filepath, 'w', encoding= 'utf_8') as f:
    for docket_sentences in new_df['Apply Trigram Phrase Model']:
        f.write(' '.join(docket_sentences) + '\n')

In [134]:
trigram_dictionary_filepath = 'docket_texts/train/DT/trigram_dict_noorgnoname.dict'

In [135]:
%%time

#some dictionary hyperparameters (passed to filter_extremes below):
no_below = 10 #reference is 10
no_above = 0.4 #reference is 0.4

# stream the docket-level trigram file, one docket text per line
trigram_reviews = LineSentence(trigram_dockets_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below = no_below, no_above = no_above) #this step is questionable. May need to change the parameters
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
#trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 15.6 ms


In [136]:
trigram_bow_filepath = 'docket_texts/train/DT/trigram_bow_corpus_noorgnoname.mm'

In [137]:
def trigram_bow_generator(filepath):
    """
    Lazily yield one bag-of-words vector per line of ``filepath``.

    Each line is read through ``LineSentence`` and converted with
    ``trigram_dictionary.doc2bow``.
    """
    yield from (trigram_dictionary.doc2bow(line_tokens)
                for line_tokens in LineSentence(filepath))

In [138]:
%%time

# generate bag-of-words representations for
# all reviews and save them as a matrix
# NOTE(review): the corpus is built from trigram_sentences_filepath (one
# sentence per line), while trigram_dictionary was learned from
# trigram_dockets_filepath (one docket text per line, with the
# trigram_transform drop list applied). Confirm this is intentional;
# otherwise pass trigram_dockets_filepath here.
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_sentences_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
print(trigram_bow_corpus)

MmCorpus(1206 documents, 39 features, 1012 non-zero entries)
Wall time: 47 ms


In [139]:
def explore_topic(model, topic_number, topn = 10):
    """
    Print and return the leading terms of one topic.

    Calls `model.show_topic(topic_number, topn = topn)`, rounds every weight
    to three decimals, prints a two-column table (term, frequency) and
    returns the same rows as a list of `(term, rounded_weight)` tuples.
    """
    header = '{:20} {}'.format('term', 'frequency')
    print(header + '\n')

    rounded_terms = [
        (term, round(weight, 3))
        for term, weight in model.show_topic(topic_number, topn = topn)
    ]
    for term, weight in rounded_terms:
        print('{:20} {:.3f}'.format(term, weight))

    return rounded_terms

In [140]:
def topic_modeling_pipeline(num_topics, model_file_path, trigram_bow_corpus, trigram_dictionary, export = False,
                            workers = 4, random_state = None):
    """
    Train an LDA model, print its topics and prepare the pyLDAvis data.

    Parameters
    ----------
    num_topics : int
        Number of topics to fit.
    model_file_path : str
        Path the trained model is saved to. It is also the prefix of the
        optional topics CSV (`model_file_path + 'topics.csv'`) and of the
        pickled pyLDAvis data (`model_file_path + '_ldavis'`).
    trigram_bow_corpus
        Bag-of-words corpus handed to LdaMulticore and to pyLDAvis.
    trigram_dictionary
        Dictionary used as `id2word` for the model and for pyLDAvis.
    export : bool, default False
        When True, write each topic's (term, frequency) rows to CSV,
        one column per topic.
    workers : int, default 4
        Forwarded to LdaMulticore; the default keeps the value this
        function previously hard-coded.
    random_state : optional, default None
        Forwarded to LdaMulticore to seed training. None keeps the
        library's unseeded default.
        NOTE(review): multi-worker training may still vary between runs
        even with a seed -- confirm against the gensim documentation.

    Returns
    -------
    (LDAvis_prepared, lda)
        The prepared pyLDAvis data and the trained model.
    """
    with warnings.catch_warnings():
        # training is run with all warnings silenced
        warnings.simplefilter('ignore')

        lda = LdaMulticore(trigram_bow_corpus, num_topics = num_topics, id2word = trigram_dictionary,
                           workers = workers, random_state = random_state)

        lda.save(model_file_path)

    # print each topic's make-up (1-based in the heading, 0-based in the model)
    # and collect the rows for the optional CSV export
    topic_dict = {}
    for i in range(num_topics):
        print("\n Topic {}'s make-up:".format(i + 1))
        topic_dict[i] = explore_topic(lda, topic_number = i)

    if export:
        pd.DataFrame(topic_dict).to_csv(model_file_path + 'topics.csv', index = False)

    LDAvis_data_filepath = model_file_path + '_ldavis'

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

    # persist the prepared data so the visualization can be re-displayed later
    # without recomputing it; the in-memory object is returned directly
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    return LDAvis_prepared, lda

In [141]:
def lda_description(docket_text, lda, trigram_dictionary, topic_names, min_topic_freq = 0.05):
    '''
    Describe one docket by its most strongly related topics.

    Steps:
    1) split every string in `docket_text` on whitespace and pool the tokens,
    2) turn the pooled tokens into a bag-of-words with `trigram_dictionary`,
    3) look the bag-of-words up in `lda` to get (topic number, frequency) pairs,
    4) rank the pairs from highest to lowest frequency and keep them until the
       first one that falls below `min_topic_freq`.

    Returns a list of `(topic_names[topic_number], frequency rounded to 5
    decimals)` tuples, strongest topic first.
    '''
    # pool the tokens of every sentence into one flat list
    tokens = [token for sentence in docket_text for token in sentence.split()]

    # bag-of-words -> per-topic frequencies, strongest first
    bow = trigram_dictionary.doc2bow(tokens)
    ranked_topics = sorted(lda[bow], key = lambda pair: pair[1], reverse = True)

    output = []
    for topic_number, freq in ranked_topics:
        # the list is sorted, so everything after the first miss is also below the cut-off
        if freq < min_topic_freq:
            break
        output.append((topic_names[topic_number], round(freq, 5)))
    return output

### Providing topic visualizations, topic constituents for each topic, and classifying each docket text. For some reason, when I use a pipeline function it doesn't work with pyLDAvis.display
### 2 Topics:

In [158]:
# Fit, summarize and visualize the 2-topic model, then label every docket with it.
num_topics = 2
column_name = str(num_topics) + '-topic Model Classification'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.253
letter_address       0.070
hold                 0.056
letter               0.049
minute_entry_hold    0.047
propose              0.035
discovery_hear_hold  0.029
loan                 0.028
discovery            0.028
status_report        0.026

 Topic 2's make-up:
term                 frequency

order                0.167
rule                 0.117
motion               0.089
discovery            0.063
letter_address       0.049
schedule             0.047
report               0.035
letter               0.027
hold                 0.026
deposition           0.026


### 3 Topics:

In [159]:
# Fit, summarize and visualize the 3-topic model, then label every docket with it.
num_topics = 3
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

rule                 0.133
letter_address       0.122
order                0.117
letter               0.079
hold                 0.068
report               0.051
schedule             0.036
motion               0.033
status_report        0.032
endorsement          0.030

 Topic 2's make-up:
term                 frequency

minute_entry_hold    0.127
discovery_hear_hold  0.077
deposition           0.070
rule                 0.051
error                0.049
hold                 0.048
propose              0.047
motion               0.044
iii                  0.044
order                0.043

 Topic 3's make-up:
term                 frequency

order                0.337
motion               0.085
discovery            0.081
schedule             0.042
rule                 0.034
letter_address       0.025
protective           0.024
propose              0.023
due                  0.022
loan                 0.021


### 4 Topics:

In [160]:
# Fit, summarize and visualize the 4-topic model, then label every docket with it.
num_topics = 4
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)
print(pretrained_model_file_path)
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'

# Hand-assigned names, commented out due to new DT direction
# (according to Chris' feedback on 2018-5-10):
#   topic_names = {0: 'motion to dismiss',
#                  1: 'motion for summary judgment',
#                  2: 'Complaint and motion',
#                  3: 'Amended Complaint and motion'}

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)

docket_texts/train/DT/lda_model_noorgnomodel_4

 Topic 1's make-up:
term                 frequency

minute_entry_hold    0.140
status_report        0.103
jury_trial_hold      0.077
motion_limine        0.076
motion               0.073
due                  0.051
discovery_hear_hold  0.050
amend                0.043
brief                0.037
endorsed_letter_address 0.036

 Topic 2's make-up:
term                 frequency

rule                 0.231
hold                 0.130
letter_address       0.114
order                0.069
motion               0.058
propose              0.057
schedule             0.055
iii                  0.034
protective           0.025
deadline             0.024

 Topic 3's make-up:
term                 frequency

order                0.395
motion               0.060
report               0.051
deposition           0.047
discovery            0.039
letter_address       0.036
grant                0.036
schedule             0.032
issue                0.030
endorsem

### 5 Topics:

In [161]:
# Fit, summarize and visualize the 5-topic model, then label every docket with it.
num_topics = 5
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

rule                 0.297
order                0.132
letter_address       0.120
minute_entry_hold    0.054
jury_trial_hold      0.048
issue                0.048
error                0.044
entry                0.030
endorsement          0.029
report               0.021

 Topic 2's make-up:
term                 frequency

order                0.363
minute_entry_hold    0.080
discovery_hear_hold  0.071
schedule             0.067
discovery            0.046
deadline             0.045
deposition           0.036
assign               0.027
filing               0.023
grant                0.023

 Topic 3's make-up:
term                 frequency

order                0.279
letter_address       0.095
propose              0.080
letter               0.054
protective           0.050
schedule             0.042
discovery            0.035
production           0.035
loan                 0.035
motion_limine        0.034

 Topic 4's make-up:
term      

### 6 Topics:

In [162]:
# Fit, summarize and visualize the 6-topic model, then label every docket with it.
num_topics = 6
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

motion               0.233
report               0.117
error                0.110
letter               0.079
rule                 0.072
hold                 0.063
due                  0.048
entry                0.040
filing               0.032
grant                0.025

 Topic 2's make-up:
term                 frequency

motion               0.162
hold                 0.135
letter_address       0.111
assign               0.068
order                0.062
discovery            0.060
loan                 0.052
propose              0.043
motion_limine        0.043
schedule             0.035

 Topic 3's make-up:
term                 frequency

rule                 0.372
letter_address       0.090
status_report        0.089
due                  0.066
iii                  0.054
hold                 0.043
discovery            0.043
order                0.038
propose              0.019
amend                0.019

 Topic 4's make-up:
term      

### 7 Topics:

In [163]:
# Fit, summarize and visualize the 7-topic model, then label every docket with it.
num_topics = 7
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.357
letter               0.063
discovery            0.055
letter_address       0.048
report               0.041
endorsement          0.041
rule                 0.034
minute_entry_hold    0.033
motion               0.030
protective           0.030

 Topic 2's make-up:
term                 frequency

order                0.196
motion               0.133
rule                 0.113
propose              0.067
discovery            0.060
grant                0.060
state                0.054
endorsed_letter_address 0.034
dispute              0.027
letter               0.021

 Topic 3's make-up:
term                 frequency

rule                 0.307
discovery            0.091
error                0.072
due                  0.066
endorsed_letter_address 0.053
trial                0.053
schedule             0.034
letter_address       0.028
minute_entry_hold    0.027
jury_trial_hold      0.027

 Topic 4's make-up:
term

### 10 Topics:

In [164]:
# Fit, summarize and visualize the 10-topic model, then label every docket with it.
num_topics = 10
# NOTE: the 'Classificaiton' spelling is kept because the export cell selects
# the column by this exact name.
column_name = str(num_topics) + '-topic Model Classificaiton'
pretrained_model_file_path = 'docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics)

# generic placeholder labels: topic id -> 'Topic <id>'
topic_names = {topic_id: 'Topic ' + str(topic_id) for topic_id in range(num_topics)}

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)

# dockets whose token list is empty get '' instead of a topic breakdown
topic_summary = [
    '' if tokens == [] else lda_description(tokens, model, trigram_dictionary, topic_names)
    for tokens in list(new_df['Apply Trigram Phrase Model'])
]

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.419
loan                 0.069
motion               0.056
state                0.049
letter               0.042
endorsement          0.042
deny                 0.042
hold                 0.028
production           0.028
protective           0.021

 Topic 2's make-up:
term                 frequency

order                0.245
rule                 0.175
motion               0.078
minute_entry_hold    0.078
letter_address       0.065
jury_trial_hold      0.065
letter               0.052
issue                0.027
assign               0.027
endorsed_letter_address 0.021

 Topic 3's make-up:
term                 frequency

status_report        0.121
error                0.110
order                0.099
letter               0.077
motion               0.066
minute_entry_hold    0.066
discovery_hear_hold  0.055
due                  0.045
rule                 0.034
letter_address       0.034

 Topic 4's make-up:
term   

### Export DataFrame to .csv

In [165]:
# Preview the first rows, including the per-model classification columns added above
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab,DT Topics,Removed unnecessary POS & vocab DT,Apply Trigram Phrase Model,2-topic Model Classification,3-topic Model Classificaiton,4-topic Model Classificaiton,5-topic Model Classificaiton,6-topic Model Classificaiton,7-topic Model Classificaiton,10-topic Model Classificaiton
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",[complaint file fee receipt number disclosure ...,"[Service of Process, Complaints]",[],[],,,,,,,
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,"[assign ., please download locate website ., r...",[],"[assign ., please download locate website ., r...","[assign ., please_download_locate_website ., r...","[(Topic 0, 0.71674), (Topic 1, 0.28326)]","[(Topic 2, 0.65544), (Topic 0, 0.17354), (Topi...","[(Topic 2, 0.62166), (Topic 0, 0.1281), (Topic...","[(Topic 1, 0.5974), (Topic 2, 0.10213), (Topic...","[(Topic 1, 0.58295), (Topic 0, 0.08359), (Topi...","[(Topic 4, 0.57064), (Topic 5, 0.0718), (Topic...","[(Topic 9, 0.54999), (Topic 1, 0.05), (Topic 7..."
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",[summons issue],[Service of Process],[],[],,,,,,,
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,[notice email miss civil cover sheet .],[Notices],[],[],,,,,,,
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,[accordance rule federal rule civil procedure ...,"[Motions, Notices]",[],[],,,,,,,


In [166]:
# List the column names; the export below selects columns by these exact strings
new_df.columns

Index(['Original Docket Text', 'Organization Portion', 'Name Portion',
       'Identifying Org and Name', 'Stripped Org and Name',
       'Removed unnecessary POS & vocab', 'DT Topics',
       'Removed unnecessary POS & vocab DT', 'Apply Trigram Phrase Model',
       '2-topic Model Classification', '3-topic Model Classificaiton',
       '4-topic Model Classificaiton', '5-topic Model Classificaiton',
       '6-topic Model Classificaiton', '7-topic Model Classificaiton',
       '10-topic Model Classificaiton'],
      dtype='object')

In [167]:
# Export the original text, the intermediate processing stages, the DT topics and
# every model's classification to a single CSV.
export_columns = [
    'Original Docket Text', 'Removed unnecessary POS & vocab', 'Removed unnecessary POS & vocab DT',
    'Apply Trigram Phrase Model', 'DT Topics',
    '2-topic Model Classification', '3-topic Model Classificaiton', '4-topic Model Classificaiton',
    '5-topic Model Classificaiton', '6-topic Model Classificaiton', '7-topic Model Classificaiton',
    '10-topic Model Classificaiton',
]

# Forward slashes and the lowercase 'train' directory match every other path in
# this notebook. Backslashes inside a normal string literal are read as escape
# sequences, and the differently-cased directory only resolves on
# case-insensitive filesystems.
new_df[export_columns].to_csv('docket_texts/train/DT/examine_this.csv', index = False)
