## Notebook Goal:  
Using existing NLP and LDA methodologies to perform topic modeling on docket texts. Three hyperparameters to consider:
1. to remove organization or not in docket texts, so organizations themselves won't become topics.
2. to remove names or not in docket texts, so names themselves won't become topics.
3. variations in topic numbers: [2, 3, 5, 10]

Will then perform visualizations and model summary output on every permutation/iteration.

In [2]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [3]:
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#visualization libraries
import pyLDAvis
import pyLDAvis.gensim



In [163]:
import os
java_path = 'C:/Program Files/Java/jdk-10.0.1/bin/java.exe'
os.environ['JAVAHOME'] = java_path

import pandas as pd
import numpy as np
import codecs
import itertools as it
from bs4 import BeautifulSoup
import warnings
import pickle
from collections import Counter
import re

In [13]:
#import corpus/docket texts from html to pandas DataFrame
def grab_dockets():
    files = []
    #get all .html files in the folder (all docket files are in .html)
    for file in os.listdir('docket_texts/'):
        if file.endswith('.html'):
            files.append(os.path.join('docket_texts/', file))

    df_docket_texts = pd.DataFrame()
    
    for i in range(len(files)): #gather all docket texts
    #for i in [0, 1]: #for testing purposes
        
        content = codecs.open(files[i], 'r', 'utf-8').read()
        #use beautiful soup to get the case ID
        soup = BeautifulSoup(content, 'lxml')
        case_id = str(soup.find_all('h3'))    
        bookmark1 = case_id.find('CASE #:') + len('CASE #:')
        bookmark2 = case_id.find('</h3>')
        case_id = case_id[bookmark1:bookmark2]

        #use pandas to grab tables in the html files
        docket_tables = pd.read_html(content)

        #error checking: gotta do this because there's different length of docket_list/
        #usually docket texts are in docket_list[3], but not always
        n = 0
        while docket_tables[n].isin(['Docket Text']).sum().sum() == 0:
            #print(n, docket_tables[n].isin(['Docket Text']).sum().sum())
            n += 1
                        
        #print(i, files[i])
        #print(docket_tables[n].head())

        #docket_tables[n] is the docket text table
        new_header = docket_tables[n].iloc[0]
        docket_tables[n] = docket_tables[n][1:]
        docket_tables[n].columns = new_header
        
        docket_tables[n]['#'] = pd.to_numeric(docket_tables[n]['#'],
                                              downcast = 'signed', errors = 'coerce')
        docket_tables[n]['Date Filed'] = pd.to_datetime(docket_tables[n]['Date Filed'])
        docket_tables[n]['Case ID'] = case_id

        df_docket_texts = pd.concat([df_docket_texts, docket_tables[n]])
    #reorder a column
    cols = list(df_docket_texts.columns)
    df_docket_texts = df_docket_texts[[cols[-1]] + cols[:-1]]
    
    print('current docket text table size/shape: {}'.format(df_docket_texts.shape))
    return df_docket_texts

### Pull from dir .html files

In [14]:
df = grab_dockets()
docket_original = list(df['Docket Text'])
for i in range(5):
    print('docket text {}'.format(i))
    print(docket_original[i], '\n')

  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)


current docket text table size/shape: (3244, 4)
docket text 0
COMPLAINT against Cardiogenics Holdings, Inc. filing fee $ 400, receipt number 0207-8445206 Was the Disclosure Statement on Civil Cover Sheet completed -YES,, filed by LG Capital Funding, LLC. (Steinmetz, Michael) (Additional attachment(s) added on 3/11/2016: # 1 Civil Cover Sheet, # 2 Proposed Summons) (Bowens, Priscilla). (Entered: 03/10/2016) 

docket text 1
Case assigned to Judge Ann M Donnelly and Magistrate Judge Vera M. Scanlon. Please download and review the Individual Practices of the assigned Judges, located on our website. Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such. (Bowens, Priscilla) (Entered: 03/11/2016) 

docket text 2
Summons Issued as to Cardiogenics Holdings, Inc.. (Bowens, Priscilla) (Entered: 03/11/2016) 

docket text 3
NOTICE - emailed attorney regarding missing second page of the civil cover sheet. (Bowens, Priscilla) (Entered: 03/11/2

### Used Stanford NER to identy Names and Entities

In [104]:
%%time
path_to_model = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz'
path_to_jar = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\stanford-ner.jar'
tagger = StanfordNERTagger(path_to_model, path_to_jar = path_to_jar)

output = []
#length = 100 
length = len(docket_original)
for i in range(length):
    org_str = []
    name_str = []
    stripped_str1 = []
    stripped_str2 = []
    tokens = nltk.tokenize.word_tokenize(docket_original[i])
    for label in tagger.tag(tokens):
        #print(label)
        if label[1] == 'ORGANIZATION':
            org_str.append(label[0])
            stripped_str1.append('-ORG-')
        elif label[1] == 'PERSON':
            name_str.append(label[0])
            stripped_str1.append('-NAME-')
        else:
            stripped_str1.append(label[0])
            stripped_str2.append(label[0])
    
    output.append([docket_original[i],
                   ' '.join(org_str),
                   ' '.join(name_str),
                   ' '.join(stripped_str1),
                   ' '.join(stripped_str2)])
    

Wall time: 1h 34min 53s


In [107]:
NER_df = pd.DataFrame(output, columns = ['Original Docket Text', 'Organization Portion', 'Name Portion', 
                                         'Identifying Org and Name', 'Stripped Org and Name'])

### To re-build new_df, start here

In [254]:
new_df = NER_df.copy()

In [255]:
print(new_df.head())
docket_text_list = list(new_df['Stripped Org and Name'])

                                Original Docket Text  \
0  COMPLAINT against Cardiogenics Holdings, Inc. ...   
1  Case assigned to Judge Ann M Donnelly and Magi...   
2  Summons Issued as to Cardiogenics Holdings, In...   
3  NOTICE - emailed attorney regarding missing se...   
4  In accordance with Rule 73 of the Federal Rule...   

                              Organization Portion  \
0  Cardiogenics Holdings , Inc. LG Capital Funding   
1      Individual Practices of the assigned Judges   
2                            Cardiogenics Holdings   
3                                                    
4                                                    

                                      Name Portion  \
0             ( Steinmetz Michael Bowens Priscilla   
1  Ann M Donnelly Vera M. Scanlon Bowens Priscilla   
2                                 Bowens Priscilla   
3                                 Bowens Priscilla   
4                                 Bowens Priscilla   

             

In [256]:
def text_preprocess1(text):
    text = re.sub("[\(\[].*?[\)\]]", "", text)
    text = text.replace('-', '')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('(s)', 's')
    text = text.replace("'s", 's')
    text = text.replace('*', '')
    text = text.replace('', '')
    text = text.replace('<', '')
    text = text.replace('/', ' ')
    text = text.replace('\\', '')
    return text

def text_preprocess2(text):
    text = text.replace('.', '')
    return text

def remove_stop(sentence):
    output = []
    for word in sentence.split():
        if word not in set(stopwords.words('english')):
            output.append(word)
    return ' '.join(output)

keywords = pd.read_csv('docket_texts/keywords.csv', header = None)
keywords.columns = ['keywords']
keyword_list = list(keywords['keywords'])

In [257]:
print(docket_text_list[1])
docket_text_list = [text_preprocess1(sentence).lower() for sentence in docket_text_list]
print(docket_text_list[1])
print(len(docket_text_list))

Case assigned to Judge and Magistrate Judge . Please download and review the , located on our website . Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such . ( , ) ( Entered : 03112016 )
case assigned to judge and magistrate judge . please download and review the , located on our website . attorneys are responsible for providing courtesy copies to judges where their individual practices require such .  
3244


In [258]:
class Splitter(object):

    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):

        # split into single sentence
        sentences = self.splitter.tokenize(text)
        # tokenization in each sentences
        tokens = [self.tokenizer.tokenize(remove_stop(sent)) for sent in sentences]
        return tokens


class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass
    def get_wordnet_pos(self,treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

    def pos_tag(self,tokens):
        # find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatization using pos tagg   
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        pos_tokens = [ [(word, lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens

In [259]:
%%time
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

lemma_docket_text_list = []
for docket_text in docket_text_list:
    #step 1 split document into sentence followed by tokenization
    tokens = splitter.split(docket_text)

    #step 2 lemmatization using pos tagger 
    lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
    lemma_docket_text_list.append(lemma_pos_token)

Wall time: 1min 48s


In [260]:
print(len(lemma_docket_text_list)) #docket text document level
print(len(lemma_docket_text_list[0])) #docket text sentence level
print(len(lemma_docket_text_list[0][0])) #docket text word level
print(lemma_docket_text_list[0][0][0]) #docket text token level
print(lemma_docket_text_list[0][0][0][0]) #docket text tuple level

3244
2
22
('complaint', 'complaint', ['NN'])
complaint


In [261]:
#lets do a collection of what we have
collection = {}
for lemma_pos_token in lemma_docket_text_list:
    for sentence in lemma_pos_token:
        for token in sentence:
            #print(token[2][0])
            if token[2][0] not in list(collection.keys()):
                collection[token[2][0]] = []
                collection[token[2][0]].append(token[1])
            else:
                if token[1] not in collection[token[2][0]]:
                    collection[token[2][0]].append(token[1])

In [262]:
pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in collection.items()])).to_csv('NLP_pos.csv', index = False)

In [263]:
%%time
remove_pos = ["``", "NNPS", "NNP", "CD", '#', '$', "''", ",", "0", ":"]
remove_word = ["'s", "judge", "party", "defendant", "ex", "plantiff", "shall", "date", "b", "exhibit", "pennsylvania", "sign_judge", 
               "Inc..", "inc..", "llc", "'", "[_]", "action", "clerk", "july", "kw", "regard", "sac", "attachment", "c.d", "cal", "case", "cd", "l.p.", 
               "claim", "copy", "court", "direct", "form", "hereby", "magistrate", "p.c", "pl", "plaintiff", "regard", "sign", "time", "mr.", 
               "docket", "follow", "set", "matter" "agreement" "proceeding", "cotton", "january", "february", "march", "april", "may", "june", 
               "july", "august", "september", "october", "november", "december"]
#rebuild corpus
docket_texts_output = [] #ultimate output after cleaning

for lemma_pos_token in lemma_docket_text_list:
    docket_text_output = [] 
    for sentence in lemma_pos_token:
        sentence_output = []
        for token in sentence:
            #print(token[1])
            
            if token[2][0] not in remove_pos: #if the pos is not in the remove_pos list
                if token[1] not in remove_word: #these are the intentionally left out words
                    sentence_output.append(token[1]) #append the the sentence
        docket_text_output.append(' '.join(sentence_output))
    docket_texts_output.append(docket_text_output)
print(docket_texts_output[:10])

[['complaint file fee receipt number disclosure statement civil cover sheet complete yes file .', 'add civil cover sheet propose summons .'], ['assign .', 'please download review locate website .', 'attorney responsible provide courtesy individual practice require .'], ['summons issue'], ['notice email attorney miss second page civil cover sheet .'], ['accordance rule federal rule civil procedure local rule notify consent united state available conduct proceeding civil include trial order entry final judgment .', 'attach notice blank consent fill file electronically wish consent .', 'also access link http www.uscourts.govuscourtsformsandfeesformsao085.pdf .', 'withhold consent without adverse substantive consequence .', 'return file consent unless consent .'], ['attorney open filing check quality control .', 'see correction make .'], ['notice appearance behalf'], ['backend note .', 'related document complaint file .', ''], ['ta letter .', 'related document complaint file .', ''], ['c n

In [264]:
new_df['Removed unnecessary POS & vocab'] = pd.Series(docket_texts_output)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",[complaint file fee receipt number disclosure ...
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,"[assign ., please download review locate websi..."
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",[summons issue]
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,[notice email attorney miss second page civil ...
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,[accordance rule federal rule civil procedure ...


In [265]:
unigram_sentences_filepath = 'docket_texts/unigram_nltk_noorgnoname.txt'

In [266]:
%%time
# turn the lemmatized corpus into unigram sentences
with codecs.open(unigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for docket_text in docket_texts_output:
        for sentence in docket_text:
            f.write(sentence + '\n')

Wall time: 15 ms


In [267]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [268]:
#let's do some comparision between the original text and unigram sentences, shouldn't be that different.
print('Original text:')
print(new_df['Removed unnecessary POS & vocab'].iloc[0])
#print(df['Docket Text'].iloc[1])

print('\nUnigram_sentence:')
for unigram_sentence in it.islice(unigram_sentences, 0, 10):
    print(' '.join(unigram_sentence))
    print('')

Original text:
['complaint file fee receipt number disclosure statement civil cover sheet complete yes file .', 'add civil cover sheet propose summons .']

Unigram_sentence:
complaint file fee receipt number disclosure statement civil cover sheet complete yes file .

add civil cover sheet propose summons .

assign .

please download review locate website .

attorney responsible provide courtesy individual practice require .

summons issue

notice email attorney miss second page civil cover sheet .

accordance rule federal rule civil procedure local rule notify consent united state available conduct proceeding civil include trial order entry final judgment .

attach notice blank consent fill file electronically wish consent .

also access link http www.uscourts.govuscourtsformsandfeesformsao085.pdf .



In [269]:
bigram_model_filepath = 'docket_texts/bigram_model_noorgnoname' 

In [270]:
%%time

# store our bigram model
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk if we don't want to run this again
#bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 158 ms


In [271]:
bigram_sentences_filepath = 'docket_texts/bigram_sentences_noorgnoname.txt'

In [272]:
%%time

# apply the bigram model, and write it to file
with codecs.open(bigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = ' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')



Wall time: 562 ms


In [273]:
bigram_sentences = LineSentence(bigram_sentences_filepath)
print('unigram length = {}, bigram length = {}'.format(len(list(unigram_sentences)), len(list(bigram_sentences))))

unigram length = 8649, bigram length = 8649


In [274]:
#original v. unigram v. bigram. Some phrases should be combined already
start = 0
finish = 10
print('Original text:')
print(new_df['Removed unnecessary POS & vocab'].iloc[0])
print(new_df['Removed unnecessary POS & vocab'].iloc[1])

print('\nUnigram sentence:')
for unigram_sentence in it.islice(unigram_sentences, 0, 10):
    print(' '.join(unigram_sentence))
print('\nBigram sentence:')
for bigram_sentence in it.islice(bigram_sentences, start, finish):
    print(' '.join(bigram_sentence))

Original text:
['complaint file fee receipt number disclosure statement civil cover sheet complete yes file .', 'add civil cover sheet propose summons .']
['assign .', 'please download review locate website .', 'attorney responsible provide courtesy individual practice require .']

Unigram sentence:
complaint file fee receipt number disclosure statement civil cover sheet complete yes file .
add civil cover sheet propose summons .
assign .
please download review locate website .
attorney responsible provide courtesy individual practice require .
summons issue
notice email attorney miss second page civil cover sheet .
accordance rule federal rule civil procedure local rule notify consent united state available conduct proceeding civil include trial order entry final judgment .
attach notice blank consent fill file electronically wish consent .
also access link http www.uscourts.govuscourtsformsandfeesformsao085.pdf .

Bigram sentence:
complaint file fee_receipt number disclosure_statemen

In [275]:
trigram_model_filepath = 'docket_texts/trigram_model_nonamenoorg'

In [276]:
%%time

# again, using Phrases to attach more words to phrases already formed
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

# load the finished model from disk
#trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 149 ms


In [277]:
trigram_sentences_filepath = 'docket_texts/trigram_sentences_nonamenoorg.txt'

In [278]:
%%time

with codecs.open(trigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for bigram_sentence in bigram_sentences:
        #print('Bi', bigram_sentence)
        trigram_sentence = ' '.join(trigram_model[bigram_sentence])
        #print('Tri', trigram_sentence)
        f.write(trigram_sentence + '\n')



Wall time: 348 ms


In [279]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [280]:
start = 0
finish = 15
print('Original text:')
print(new_df['Removed unnecessary POS & vocab'].iloc[0],'\n')
print(new_df['Removed unnecessary POS & vocab'].iloc[1],'\n')
print(new_df['Removed unnecessary POS & vocab'].iloc[2],'\n')
print(new_df['Removed unnecessary POS & vocab'].iloc[3],'\n')

print('\nUNIGRAM Sentence:')
for unigram_sentence in it.islice(unigram_sentences, start, finish):
    print(' '.join(unigram_sentence))
print('\nBIGRAM Sentence:')
for bigram_sentence in it.islice(bigram_sentences, start, finish):
    print(' '.join(bigram_sentence))
print('\nTRIGRAM Sentence:')
for trigram_sentence in it.islice(trigram_sentences, start, finish):
    print(' '.join(trigram_sentence))

Original text:
['complaint file fee receipt number disclosure statement civil cover sheet complete yes file .', 'add civil cover sheet propose summons .'] 

['assign .', 'please download review locate website .', 'attorney responsible provide courtesy individual practice require .'] 

['summons issue'] 

['notice email attorney miss second page civil cover sheet .'] 


UNIGRAM Sentence:
complaint file fee receipt number disclosure statement civil cover sheet complete yes file .
add civil cover sheet propose summons .
assign .
please download review locate website .
attorney responsible provide courtesy individual practice require .
summons issue
notice email attorney miss second page civil cover sheet .
accordance rule federal rule civil procedure local rule notify consent united state available conduct proceeding civil include trial order entry final judgment .
attach notice blank consent fill file electronically wish consent .
also access link http www.uscourts.govuscourtsformsandfee

In [289]:
def trigram_transform(texts):
    trigram_output = []
    #print(texts)
    remove_trigram = ['calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold', 'court_reportertranscriber_abovecaptioned_matter',
                      'redaction_calendar_day', 'rule_statement', 'obtain_pacer', 'may_obtain_pacer', 'reportertranscriber_abovecaptioned_matter',
                      'redact_transcript_deadline', 'send_chamber']
    for sentence in texts:
        unigram_review = []
        for word in sentence.split():
            unigram_review.append(word)
    
        #print('Uni: ', unigram_review)
        bigram_review = bigram_model[unigram_review]
        #print('Bi: ', bigram_review)
        trigram_review = trigram_model[bigram_review]
        trigram_review = [phrase for phrase in trigram_review if phrase not in remove_trigram]
        print('Tri: ', trigram_review)
        trigram_output.append(' '.join(trigram_review))
    return trigram_output

In [290]:
new_df['Apply Trigram Phrase Model'] = new_df['Removed unnecessary POS & vocab'].apply(trigram_transform)



Tri:  ['complaint', 'file', 'fee_receipt_number', 'disclosure_statement_civil_cover', 'sheet_complete_yes', 'file', '.']
Tri:  ['add', 'civil_cover_sheet', 'propose', 'summons', '.']
Tri:  ['assign', '.']
Tri:  ['please_download_review_locate', 'website', '.']
Tri:  ['attorney_responsible_provide_courtesy', 'individual_practice_require', '.']
Tri:  ['summons_issue']
Tri:  ['notice', 'email', 'attorney', 'miss', 'second', 'page', 'civil_cover_sheet', '.']
Tri:  ['accordance_rule_federal', 'rule_civil_procedure_local', 'rule_notify_consent', 'united_state_available_conduct', 'proceeding_civil', 'include', 'trial', 'order', 'entry', 'final_judgment', '.']
Tri:  ['attach', 'notice', 'blank_consent_fill', 'file', 'electronically_wish_consent', '.']
Tri:  ['also_access_link_http', 'www.uscourts.govuscourtsformsandfeesformsao085.pdf', '.']
Tri:  ['withhold_consent_without_adverse', 'substantive_consequence', '.']
Tri:  ['return', 'file', 'consent_unless_consent', '.']
Tri:  ['attorney_open', 

Tri:  ['first', 'motion', 'extension', 'file', 'responsereply', 'pre', 'motion', 'conference', 'extend', 'week', 'opposition', 'summary_judgment', 'consent', 'obtain', 'international']
Tri:  ['order', 'grant', 'motion', 'extension', 'file', 'responsereply', '.']
Tri:  ['oppose', 'motion', 'summary_judgment', 'extend', '.']
Tri:  ['file', 'reply', '.']
Tri:  ['remind', 'file', 'fullybriefed', 'motion', 'reply', 'due', 'provide', 'chamber', 'courtesy', 'fullybriefed', 'motion', '.']
Tri:  ['order', '.']
Tri:  ['letter', 'courtesy', 'cross', 'motion', 'summary_judgment']
Tri:  ['letter', 'summary_judgment', 'motion', 'per', 'honor', 'individual_rule', 'c.1']
Tri:  ['motion', 'summary_judgment', '.']
Tri:  ['reply', 'support', 'motion', 'summary_judgment', 'memorandum', 'opposition', 'crossmotion', 'file', '.']
Tri:  []
Tri:  ['response', 'opposition', 'motion', 'summary_judgment', 'cross', 'motion', 'summary_judgment', 'opposition', 'motion', 'summary_judgment', 'file', 'international']
T

Tri:  ['amend_complaint', 'safe_harbor_asset', 'management', 'llc.document', 'file', 'master_account_balentine_global', 'hedge_fund_select', 'balentine_global_hedge_fund', 'estate_m.l', '.']
Tri:  ['hedge_strategy_fund_ibex', 'absolute_return_fund_magnum', 'growth_fund_magnum_special', 'situation_fund_alternative_investment', 'sgr_spa_stillwater_market', 'neutral_fund', 'van_diversify_low', 'volatility', 'fund', 'ltd.ernst', '&', 'young', 'cayman_island', '.']
Tri:  ['modify', '.']
Tri:  ['affidavit_service_summons', 'amend_complaint', '.']
Tri:  ['.']
Tri:  ['serve_answer_due', '.']
Tri:  ['service_accept', '.']
Tri:  ['document', 'file']
Tri:  ['stipulation', 'order', 'within', 'undersigned', 'move', 'answer_otherwise_respond', 'complaint_extend_include', 'answer', 'paper', 'due', 'reply_paper', 'due', '.']
Tri:  ['extension', '.']
Tri:  ['modify', '.']
Tri:  ['answer_due', 'amend_complaint', 'trustee', 'limit', 'answer_due_answer_due', 'answer_due_answer_due', 'answer_due_answer_due

Tri:  ['mail', '.']
Tri:  ['pretrial_conference', 'order', 'consent', 'order', 'file', 'within', 'provide', 'pretrial_conference_hold', 'p.m._courtroom', 'foley_square', 'new_york_ny', '.']
Tri:  ['opinion', 'order', 'deny', 'motion', 'dismiss_lack_jurisdiction', 'deny', 'motion', 'order', 'dismiss', 'complaint', 'newvest_portfolio_newvest_portfolic', 'han_kook', 'state', 'law', 'assert', 'thru', 'cause', 'dismiss_without_prejudice', '.']
Tri:  ['assert_defts', 'appear', '&', 'cause', 'dismiss', 'defts', 'without_prejudice', 'entirety', 'caption', 'amend', 'accordingly', '.']
Tri:  ['mail', '.']
Tri:  ['order', 'opinion', 'order', 'issue', 'undersigned', '.']
Tri:  ['cover', 'page', 'opinion', 'order', 'change', 'read', 'attach', '.']
Tri:  ['mail', '.']
Tri:  ['memoendorsement_letter_address', 'reset', 'answer', 'complaint', 'due', 'han_kook', 'newvest', 'newvest_portfolio', '.']
Tri:  ['mail', '.']
Tri:  ['certificate_service', 'memo_endorsement', 'esq', '.']
Tri:  ['attorney', 'esq'

Tri:  ['affidavit_jr._c.p.a', '.']
Tri:  ['administration', '.']
Tri:  ['document', 'file', '.']
Tri:  ['file', 'associated']
Tri:  ['motion', 'award_attorney_fee', 'expense', '.']
Tri:  ['document', 'file', 'wachsmuth.filed_associate']
Tri:  ['memorandum_law_support', 'motion', 'attorney_fee', '.']
Tri:  ['document', 'file', '.']
Tri:  ['file', 'associated']
Tri:  ['memorandum_law_support', 'motion', 'final_approval', 'allocation_plan', '.']
Tri:  ['document', 'file', '.']
Tri:  ['file', 'associated']
Tri:  ['memorandum_law_support', 'scheduling', 'order', 'motion', 'service_award', '.']
Tri:  ['document', 'file', '.']
Tri:  ['file', 'associated']
Tri:  ['endorsed_letter_address', 'accordingly', 'jointly', 'respectfully_request', 'grant', 'additional', 'week', 'report', 'back', '.']
Tri:  ['endorsement_application_grant', '.']
Tri:  ['order', '.']
Tri:  ['file', 'associated_thk']
Tri:  ['petition', 'service_award', '.']
Tri:  ['document', 'file', 'wachsmuth.filed_associate']
Tri:  ['r

Tri:  ['memo_endorsement', 'granting', '.']
Tri:  ['notice', 'motion', 'withdraw', 'appearance', 'counsel', '.']
Tri:  ['order', 'order', 'next', 'status_report', 'loan', 'file', 'reunderwriting', 'protocol', 'due', '.']
Tri:  ['subsequent', 'status_report', 'due', 'every', 'week', 'thereafter', '.']
Tri:  ['answer_amend_complaint', '.']
Tri:  ['document', 'file', '.']
Tri:  ['seal', 'document', 'place_vault', '.']
Tri:  ['order', '.the', 'present', 'request', 'extension', 'submit', 'expert', 'rebuttal', 'report', 'well', 'oral_argument', 'deny', '.']
Tri:  []
Tri:  ['letter_address', 'discovery_dispute', 'involve', 'production', '.']
Tri:  ['document', 'file', '.']
Tri:  ['letter_address', 'letter', 'discovery_dispute', 'involve', 'production', '.']
Tri:  ['document', 'file', '.']
Tri:  ['order', 'letter', 'present', 'request', 'discovery', 'coordinate', '.']
Tri:  ['opposes', 'request', 'submission', '..having', 'review', 'request', 'herebyordered', 'meet_confer', 'ncuas', 'request',

Tri:  ['document', 'file', 'nj', '.']
Tri:  ['declaration', 'supplemental_submission', 'nj', 'motion', 'dismiss', 'deft', '.']
Tri:  ['.']
Tri:  ['reply', 'supplemental_submission', 'new_york_new_jersey', '.']
Tri:  ['document', 'file']
Tri:  ['declaration_support', 'reply', 'supplemental_submission', 'new_york_new_jersey', '.']
Tri:  ['document', 'file']
Tri:  ['notice_adopt_master', 'answer', '.']
Tri:  ['document', 'file']
Tri:  ['dft', 'notice_adoption_master', 'answer', '.']
Tri:  ['document', 'file', 'co..']
Tri:  ['notice_adopt_master', 'answer', 'fifth', 'amend', 'master_liability_complaint', '.']
Tri:  ['document', 'file', 'related', 'document', 'amend_complaint', '.']
Tri:  ['notice', 'adopt', 'answer', '.']
Tri:  ['document', 'file']
Tri:  ['notice_adopt_master', 'property', 'answer', 'n.v.document', 'file', '.']
Tri:  ['related', 'document', 'amend_complaint', '.']
Tri:  ['opinion', 'order', 'reason_state', 'motion', 'city', 'new_york', 'dismiss', 'thecomplaint', 'civ', '.'

Tri:  ['document', 'file', 'file', 'associated_et_al', '.']
Tri:  ['supplemental_amend', 'declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['modify', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['supplemental_amend', 'second_declaration', '.']
Tri:  ['document', 'file', 'accept_filing_honorable', '.']
Tri:  ['memo_endorsement', 'notice', 'motion', 'leave', 'file', 'supplemental_amend', 'expert_declaration_incorporate', 

Tri:  ['order', 'service', 'order', 'grant', 'request', 'proceed', 'without', 'prepayment', 'fee', 'informa', 'pauperis', '.']
Tri:  ['allow', 'effect', 'service', 'u.s', '.']
Tri:  ['instruct', 'fill', 'u.s', '.']
Tri:  ['marshal_service', 'process_receipt', 'return', '.']
Tri:  ['service', 'address', 'machael', 'spenceredwards', 'hegeman', 'avenue', 'brooklyn', 'new_york', '.']
Tri:  ['service', 'address', 'avenue', 'brooklyn', 'new_york', '.']
Tri:  ['service', 'address', 'church', 'street_new_york_new', 'york', '.']
Tri:  ['instructed', 'issue', 'summons', 'deliver', 'marshal_service', 'paperwork', 'necessary', 'marshal_service', 'effect', 'service', 'upon', '.']
Tri:  ['responsibility', 'ensure', 'service_make', 'within_day', 'summons_issue', 'necessary', 'request', 'extension', 'service', '.']
Tri:  ['see', 'v.', 'strong', 'f.3d', '.']
Tri:  ['also', 'must', 'notify', 'write', 'address', 'change', 'dismiss', 'fails', '.']
Tri:  ['order', '.']
Tri:  ['summons_issue', '.']
Tri:  ['

Tri:  ['transcript_view_public_terminal', 'purchase_reportertranscriber_deadline_release', 'transcript_restriction', '.']
Tri:  ['.']
Tri:  ['redaction_request_due', '.']
Tri:  ['.']
Tri:  ['release_transcript_restriction', '.']
Tri:  ['notice', 'file', 'official_transcript_notice_give', 'official_transcript_conference', 'proceed_hold', 'file', '.']
Tri:  ['file', 'notice_intent_request', 'redaction_transcript', '.']
Tri:  ['notice', 'file', 'transcript_make_remotely', 'electronically_available_public_without']
Tri:  ['endorsed_letter_address', 'request', 'hearing', 'reschedule', 'pm', '.']
Tri:  ['endorsement', 'adjourned', 'pm_courtroom', '.']
Tri:  ['motion', 'withdraw_attorney', '.']
Tri:  ['document', 'file', 'ltd..']
Tri:  ['motion', 'withdraw_attorney', '.']
Tri:  ['document', 'file', 'ltd..']
Tri:  ['order', 'comply_discovery_ruling_conference', '.']
Tri:  ['conference_pm', 'courtroom_pearl_street_new', 'york_ny', '.']
Tri:  ['.']
Tri:  ['memo_endorse', 'grant', 'motion', 'with

In [291]:
new_df['Apply Trigram Phrase Model'].iloc[0]

['complaint file fee_receipt_number disclosure_statement_civil_cover sheet_complete_yes file .',
 'add civil_cover_sheet propose summons .']

In [292]:
#write trigram to file
trigram_dockets_filepath = 'docket_texts/trigram_transformed_dockets_noorgnoname.txt'

In [293]:
with codecs.open(trigram_dockets_filepath, 'w', encoding= 'utf_8') as f:
    for i in range(len(new_df['Apply Trigram Phrase Model'])):
        f.write(' '.join(new_df['Apply Trigram Phrase Model'][i]) + '\n')

In [294]:
trigram_dictionary_filepath = 'docket_texts/trigram_dict_noorgnoname.dict'

In [295]:
%%time

#some dictionary hyperparameters:
no_below = 10 #reference is 10
no_above = 0.4 #reference is 0.4

trigram_reviews = LineSentence(trigram_dockets_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare otrigram_reviewsr too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below = no_below, no_above = no_above) #this step is questionable. May need to change the parameters
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
#trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 89.2 ms


In [296]:
trigram_bow_filepath = 'docket_texts/trigram_bow_corpus_noorgnoname.mm'

In [297]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        #print(review)
        #print(trigram_dictionary.doc2bow(review))
        yield trigram_dictionary.doc2bow(review)

In [298]:
%%time

# generate bag-of-words representations for
# all reviews and save them as a matrix
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_sentences_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
print(trigram_bow_corpus)

MmCorpus(8649 documents, 617 features, 23578 non-zero entries)
Wall time: 161 ms


In [299]:
def explore_topic(model, topic_number, topn = 10):
    topics = []
    print('{:20} {}'.format('term', 'frequency') + '\n')
    for term, frequency in model.show_topic(topic_number, topn = topn):
        print('{:20} {:.3f}'.format(term, round(frequency, 3)))
        topics.append((term, round(frequency, 3)))
    return topics

In [300]:
def topic_modeling_pipeline(num_topics, model_file_path, trigram_bow_corpus, trigram_dictionary, export = False):

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus, num_topics = num_topics, id2word = trigram_dictionary, workers = 4)

        lda.save(model_file_path)
    
    topic_dict = {}
    for i in range(num_topics):
        print("\n Topic {}'s make-up:".format(i + 1))
        topic_dict[i] = explore_topic(lda, topic_number = i)
    
    if export:
        pd.DataFrame(topic_dict).to_csv(model_file_path + 'topics.csv', index = False)
    
    LDAvis_data_filepath = model_file_path + '_ldavis'
    
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
        
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
        
    return LDAvis_prepared, lda

In [301]:
def lda_description(docket_text, lda, trigram_dictionary, topic_names, min_topic_freq = 0.05):
    '''
    accept the processed texts (trigram) of a review and 
    1) create a bag-of-words representation, 
    4) create an LDA representation, and
    5) print a sorted list of the top topics in the LDA representation
    '''
    output = []
    analyze_this = []
    for sentence in docket_text:
        analyze_this += sentence.split()
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(analyze_this)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda.sort(key = lambda tup: tup[1], reverse = True)
    #print(review_lda)
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        #print('{:25} {}'.format(topic_names[topic_number], round(freq, 3)))
        output.append((topic_names[topic_number], round(freq, 5)))
    return output

### Providing topic visualizations, topic constituents for each topic, and classifying each docket text. For some reason when I use a pipeline functionality it doesn't work with pyLADvis.display
### 2 Topics:

In [302]:
num_topics = 2
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classification'

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.084
motion               0.036
request              0.020
counsel              0.015
discovery            0.013
modify               0.012
schedule             0.011
serve                0.011
conference           0.011
submit               0.010

 Topic 2's make-up:
term                 frequency

motion               0.083
order                0.043
notice               0.034
dismiss              0.015
complaint            0.013
request              0.012
letter               0.012
schedule             0.009
official_transcript_notice_give 0.009
notice_intent_request 0.009


### 3 Topics:

In [303]:
num_topics = 3
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classificaiton'

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.092
request              0.033
notice               0.025
electronically_available_public_without 0.016
transcript_make_remotely 0.016
reportertranscriber  0.015
letter               0.014
schedule             0.014
answer               0.013
brookfield           0.013

 Topic 2's make-up:
term                 frequency

motion               0.064
order                0.048
notice               0.019
modify               0.017
conference           0.017
counsel              0.016
dismiss              0.014
official_transcript_notice_give 0.012
letter               0.012
request              0.011

 Topic 3's make-up:
term                 frequency

motion               0.109
order                0.053
complaint            0.021
serve                0.019
notice               0.017
redaction_transcript 0.016
notice_intent_request 0.016
memorandum_law_support 0.015
dismiss              0.015
redaction_request_due

### 4 Topics:

In [304]:
num_topics = 4
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classificaiton'

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

motion               0.061
notice               0.030
request              0.027
associated           0.022
order                0.020
memorandum_law_support 0.020
summary_judgment     0.018
dismiss              0.012
notice_appeal        0.011
letter_address       0.010

 Topic 2's make-up:
term                 frequency

order                0.120
motion               0.060
letter               0.023
counsel              0.018
stipulation          0.014
serve_answer_due     0.012
response             0.010
dismiss              0.010
serve                0.010
schedule             0.009

 Topic 3's make-up:
term                 frequency

motion               0.062
order                0.054
complaint            0.029
support              0.022
schedule             0.017
letter               0.013
notice               0.012
associated           0.011
propose              0.010
affidavit_service_summons 0.009

 Topic 4's make-up:
ter

### 5 Topics:

In [305]:
num_topics = 5
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classificaiton'

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.086
motion               0.048
schedule             0.025
letter               0.016
serve                0.015
counsel              0.015
conference           0.013
request              0.013
due                  0.011
submit               0.009

 Topic 2's make-up:
term                 frequency

motion               0.157
order                0.060
dismiss              0.023
support              0.019
memorandum_law_support 0.019
modify               0.018
complaint            0.017
notice               0.016
declaration_support  0.011
memorandum_law_opposition 0.011

 Topic 3's make-up:
term                 frequency

order                0.061
schedule             0.031
notice_appeal        0.017
notice_appearance_behalf 0.016
motion               0.016
service_accept       0.016
trial                0.015
judgment             0.013
due                  0.013
discovery            0.013

 Topic 4's make-up:

### 6 Topics:

In [306]:
num_topics = 6
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classificaiton'

topic_names = {}

for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

motion               0.125
order                0.093
notice               0.032
dismiss              0.026
due                  0.017
serve                0.014
stipulation          0.011
complaint            0.011
request              0.011
memorandum_law_support 0.011

 Topic 2's make-up:
term                 frequency

notice_appearance_behalf 0.037
request              0.036
conference           0.027
endorsed_letter_address 0.023
answer               0.020
issue                0.018
brookfield           0.017
schedule             0.016
serve                0.015
counsel              0.015

 Topic 3's make-up:
term                 frequency

order                0.076
motion               0.060
counsel              0.020
schedule             0.016
transcript_restriction 0.016
transcript_view_public_terminal 0.016
purchase_reportertranscriber_deadline_release 0.016
request              0.015
redaction_request_due 0.012
letter    

### 10 Topics:

In [307]:
num_topics = 10
pretrained_model_file_path = 'docket_texts/lda_model_noorgnomodel_' + str(num_topics)
column_name = str(num_topics) + '-topic Model Classificaiton'

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

LDAvis_prepared, model = topic_modeling_pipeline(num_topics, pretrained_model_file_path, trigram_bow_corpus, trigram_dictionary, export = True)
topic_summary = []

for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    topic_summary.append(lda_description(docket_text, model, trigram_dictionary, topic_names))

new_df[column_name] = topic_summary
pyLDAvis.display(LDAvis_prepared)


 Topic 1's make-up:
term                 frequency

order                0.098
modify               0.036
letter               0.022
request              0.022
discovery            0.018
schedule             0.018
notice               0.017
judgment             0.015
amend_complaint      0.013
attorney             0.011

 Topic 2's make-up:
term                 frequency

order                0.062
motion               0.055
counsel              0.027
serve                0.024
complaint            0.020
request              0.016
schedule             0.015
mail                 0.014
opposition           0.013
notice               0.013

 Topic 3's make-up:
term                 frequency

motion               0.072
order                0.037
service_accept       0.032
notice               0.019
complaint            0.019
letter               0.018
amend_complaint      0.014
conference           0.014
counsel              0.012
grant                0.011

 Topic 4's make-up:
term      

### Export DataFrame to .csv

In [308]:
new_df.to_csv('examine_this.csv', index = False)