In [1]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#visualization libraries
import pyLDAvis
import pyLDAvis.gensim



In [3]:
import os
java_path = 'C:/Program Files/Java/jdk-10.0.1/bin/java.exe'
os.environ['JAVAHOME'] = java_path

import pandas as pd
import numpy as np
import codecs
import itertools as it
from bs4 import BeautifulSoup
import warnings
import pickle
from collections import Counter
import re

In [4]:
#import corpus/docket texts from html to pandas DataFrame
def grab_docket_test():
    files = []
    #get all .html files in the folder (all docket files are in .html)
    for file in os.listdir('docket_texts/test/'):
        if file.endswith('.html'):
            files.append(os.path.join('docket_texts/test/', file))

    df_docket_texts = pd.DataFrame()
    
    for i in range(len(files)): #gather all docket texts
    #for i in [0, 1]: #for testing purposes
        
        content = codecs.open(files[i], 'r', 'utf-8').read()
        #use beautiful soup to get the case ID
        soup = BeautifulSoup(content, 'lxml')
        case_id = str(soup.find_all('h3'))    
        bookmark1 = case_id.find('CASE #:') + len('CASE #:')
        bookmark2 = case_id.find('</h3>')
        case_id = case_id[bookmark1:bookmark2]

        #use pandas to grab tables in the html files
        docket_tables = pd.read_html(content)

        #error checking: gotta do this because there's different length of docket_list/
        #usually docket texts are in docket_list[3], but not always
        n = 0
        while docket_tables[n].isin(['Docket Text']).sum().sum() == 0:
            #print(n, docket_tables[n].isin(['Docket Text']).sum().sum())
            n += 1
                        
        #print(i, files[i])
        #print(docket_tables[n].head())

        #docket_tables[n] is the docket text table
        new_header = docket_tables[n].iloc[0]
        docket_tables[n] = docket_tables[n][1:]
        docket_tables[n].columns = new_header
        
        docket_tables[n]['#'] = pd.to_numeric(docket_tables[n]['#'],
                                              downcast = 'signed', errors = 'coerce')
        docket_tables[n]['Date Filed'] = pd.to_datetime(docket_tables[n]['Date Filed'])
        docket_tables[n]['Case ID'] = case_id

        df_docket_texts = pd.concat([df_docket_texts, docket_tables[n]])
    #reorder a column
    cols = list(df_docket_texts.columns)
    df_docket_texts = df_docket_texts[[cols[-1]] + cols[:-1]]
    
    print('current docket text table size/shape: {}'.format(df_docket_texts.shape))
    return df_docket_texts

In [5]:
%%time
df = grab_docket_test()
docket_original = list(df['Docket Text'])
for i in range(5):
    print('docket text {}'.format(i))
    print(docket_original[i], '\n')

  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)


current docket text table size/shape: (1804, 4)
docket text 0
FILING ERROR - DEFICIENT PLEADING - FILED AGAINST PARTY ERROR COMPLAINT against All Defendants. (Filing Fee $ 400.00, Receipt Number 0208-11793625)Document filed by Majid Soueidan.(Rosen, Samuel) Modified on 1/5/2016 (pc). (Entered: 01/04/2016) 

docket text 1
FILING ERROR PDF ERROR CIVIL COVER SHEET filed. (Rosen, Samuel) Modified on 1/5/2016 (pc). (Entered: 01/04/2016) 

docket text 2
REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-EASTERN CORPORATION, BRAD PEDERSEN, ROBERT J. KELLY, NELSON OBUS, WILLIAM M. SHOCKLEY, and SERGE DUPUIS, re: 1 Complaint. Document filed by Majid Soueidan. (Rosen, Samuel) (Entered: 01/04/2016) 

docket text 3
RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Corporate Parent. Document filed by Majid Soueidan.(Rosen, Samuel) (Entered: 01/04/2016) 

docket text 4
***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEADING. Notice to Attorney Samuel Kenneth Rosen to RE-FILE Document No. 1 Complaint,. The filing is 

In [6]:
%%time
path_to_model = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz'
path_to_jar = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\stanford-ner.jar'
tagger = StanfordNERTagger(path_to_model, path_to_jar = path_to_jar)

output = []
#length = 100 
length = len(docket_original)
for i in range(length):
    org_str = []
    name_str = []
    stripped_str1 = []
    stripped_str2 = []
    tokens = nltk.tokenize.word_tokenize(docket_original[i])
    for label in tagger.tag(tokens):
        #print(label)
        if label[1] == 'ORGANIZATION':
            org_str.append(label[0])
            stripped_str1.append('-ORG-')
        elif label[1] == 'PERSON':
            name_str.append(label[0])
            stripped_str1.append('-NAME-')
        else:
            stripped_str1.append(label[0])
            stripped_str2.append(label[0])
    
    output.append([docket_original[i],
                   ' '.join(org_str),
                   ' '.join(name_str),
                   ' '.join(stripped_str1),
                   ' '.join(stripped_str2)])

Wall time: 49min 45s


In [7]:
NER_df = pd.DataFrame(output, columns = ['Original Docket Text', 'Organization Portion', 'Name Portion', 
                                         'Identifying Org and Name', 'Stripped Org and Name'])

In [8]:
new_df = NER_df.copy()

In [9]:
print(new_df.head())
docket_text_list = list(new_df['Stripped Org and Name'])

                                Original Docket Text Organization Portion  \
0  FILING ERROR - DEFICIENT PLEADING - FILED AGAI...                        
1  FILING ERROR PDF ERROR CIVIL COVER SHEET filed...                        
2  REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...                        
3  RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...                        
4  ***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...                        

                                        Name Portion  \
0                        Majid Soueidan Rosen Samuel   
1                                       Rosen Samuel   
2  BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...   
3                        Majid Soueidan Rosen Samuel   
4                               Samuel Kenneth Rosen   

                            Identifying Org and Name  \
0  FILING ERROR - DEFICIENT PLEADING - FILED AGAI...   
1  FILING ERROR PDF ERROR CIVIL COVER SHEET filed...   
2  REQUEST FOR ISSUANCE OF SUMMO

In [10]:
def text_preprocess1(text):
    text = re.sub("[\(\[].*?[\)\]]", "", text)
    text = text.replace('-', '')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('(s)', 's')
    text = text.replace("'s", 's')
    text = text.replace('*', '')
    text = text.replace('', '')
    text = text.replace('<', '')
    text = text.replace('>', '')
    text = text.replace('/', ' ')
    text = text.replace('\\', '')
    text = text.replace('&', ' ')
    return text

def text_preprocess2(text):
    text = text.replace('.', '')
    return text

def remove_stop(sentence):
    output = []
    for word in sentence.split():
        if word not in set(stopwords.words('english')):
            output.append(word)
    return ' '.join(output)

keywords = pd.read_csv('docket_texts/keywords.csv', header = None)
keywords.columns = ['keywords']
keyword_list = list(keywords['keywords'])

In [11]:
print(docket_text_list[1])
docket_text_list = [text_preprocess1(sentence).lower() for sentence in docket_text_list]
docket_text_list = [text_preprocess2(sentence) for sentence in docket_text_list]
print(docket_text_list[1])
print(len(docket_text_list))

FILING ERROR PDF ERROR CIVIL COVER SHEET filed . ( , ) Modified on 152016 ( pc ) . ( Entered : 01042016 )
filing error pdf error civil cover sheet filed   modified on 152016   
1804


In [12]:
class Splitter(object):

    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):

        # split into single sentence
        sentences = self.splitter.tokenize(text)
        # tokenization in each sentences
        tokens = [self.tokenizer.tokenize(remove_stop(sent)) for sent in sentences]
        return tokens


class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass
    def get_wordnet_pos(self,treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

    def pos_tag(self,tokens):
        # find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatization using pos tagg   
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        pos_tokens = [ [(word, lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens

In [13]:
%%time
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

lemma_docket_text_list = []
for docket_text in docket_text_list:
    #step 1 split document into sentence followed by tokenization
    tokens = splitter.split(docket_text)

    #step 2 lemmatization using pos tagger 
    lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
    lemma_docket_text_list.append(lemma_pos_token)

Wall time: 40.8 s


In [14]:
print(len(lemma_docket_text_list)) #docket text document level
print(len(lemma_docket_text_list[0])) #docket text sentence level
print(len(lemma_docket_text_list[0][0])) #docket text word level
print(lemma_docket_text_list[0][0][0]) #docket text token level
print(lemma_docket_text_list[0][0][0][0]) #docket text tuple level

1804
1
13
('filing', 'file', ['VBG'])
filing


In [15]:
#let's do a collection of what we have
collection = {}
for lemma_pos_token in lemma_docket_text_list:
    for sentence in lemma_pos_token:
        for token in sentence:
            #print(token[2][0])
            if token[2][0] not in list(collection.keys()):
                collection[token[2][0]] = []
                collection[token[2][0]].append(token[1])
            else:
                if token[1] not in collection[token[2][0]]:
                    collection[token[2][0]].append(token[1])

In [None]:
#pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in collection.items()])).to_csv('NLP_pos.csv', index = False)

In [16]:
%%time
remove_pos = ["``", "NNPS", "NNP", "CD", '#', '$', "''", ",", "0", ":"]
remove_word = ["'s", "judge", "party", "defendant", "ex", "plantiff", "shall", "date", "b", "exhibit", "pennsylvania", "sign_judge", 
               "Inc..", "inc..", "llc", "'", "[_]", "action", "clerk", "july", "kw", "regard", "sac", "attachment", "c.d", "cal", "case", "cd", "l.p.", 
               "claim", "copy", "court", "direct", "form", "hereby", "magistrate", "p.c", "pl", "plaintiff", "regard", "sign", "time", "mr.", 
               "docket", "follow", "set", "matter" "agreement" "proceeding", "cotton", "january", "february", "march", "april", "may", "june", 
               "july", "august", "september", "october", "november", "december",
               "agreement", "v.", "place_vault", "modify", "fund", "associated", "provide", "material", "amount", "accordingly", "additional", 
               "second", "esq", "transmission", "g.c.", "seal", "review", "honor", "submit", "counsel", "witness", "civ", "first", "ltd..", "enter", 
               "stay", "forth", "matter", "whether", "class", "master", "information", "statement", "submission", "related", "see", "make", "paper", 
               "brookfield", "designate", "remain", "reportertranscriber", "submit", "include", "mail", "fact", "refer", "take", "pursuant", "amount", 
               "behalf", "I.p..", "must", "attorney",
               'abovecapitoned', 'attach', 'add', 'concern', 'chamber', 'close', 'district', 'damage', 'later', 
               'relate', 'return', 'require', 'restriction', 'respect', 'ny', 'seek', 'write', 'expert', 'transcript', 
               'day', 'h.o', 'damage', 'pre', 'proceeding', 'present', 'page', 'pending', 'p.m.', 'frcp', 'g.c.', 'record', 'r.']

    
#rebuild corpus
docket_texts_output = [] #ultimate output after cleaning

for lemma_pos_token in lemma_docket_text_list:
    docket_text_output = [] 
    for sentence in lemma_pos_token:
        sentence_output = []
        for token in sentence:
            #print(token[1])
            
            if token[2][0] not in remove_pos: #if the pos is not in the remove_pos list
                if token[1] not in remove_word: #these are the intentionally left out words
                    sentence_output.append(token[1]) #append the the sentence
        docket_text_output.append(' '.join(sentence_output))
    docket_texts_output.append(docket_text_output)
print(docket_texts_output[:10])

[['file error deficient plead file error complaint document file'], ['file error pdf error civil cover sheet file'], ['request issuance summons breezeeastern corporation complaint document file'], ['rule corporate disclosure corporate parent document file'], ['notice deficient pleading notice refile document complaint file deficient reason radio button select refile plead use event type complaint find event list complaint initiate document correct pdf select individually name filerfilers select individually name partyparties plead'], ['notice modification partyparties partyparties reasonreasons name contain typographical error text omit'], ['notice deficient civil cover sheet notice refile document civil cover sheet filing deficient reason citizenship principal fill despite federal question jurisdiction'], ['electronic summons issue'], ['notice open statistical error correction notice open statistical erroneously selectedentered fee status code due correction entry fee status code pd']

In [17]:
new_df['Removed unnecessary POS & vocab'] = pd.Series(docket_texts_output)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab
0,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,,Majid Soueidan Rosen Samuel,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,[file error deficient plead file error complai...
1,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,,Rosen Samuel,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,[file error pdf error civil cover sheet file]
2,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,,BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,[request issuance summons breezeeastern corpor...
3,RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...,,Majid Soueidan Rosen Samuel,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,[rule corporate disclosure corporate parent do...
4,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,,Samuel Kenneth Rosen,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,[notice deficient pleading notice refile docum...


In [18]:
manual_topics_df = pd.read_csv('mannual_topics.csv')
manual_topics_df = manual_topics_df.apply(lambda x: x.astype(str).str.lower())
manual_topics_dict = manual_topics_df.to_dict('list')
for topic in manual_topics_dict.keys():
    manual_topics_dict[topic] = [keyword for keyword in manual_topics_dict[topic] if keyword != 'nan']

In [19]:
def mannual_topic_assignment(a_list):
    text = ' '.join(a_list).split()
    #print(text)
    output = []
    for topic in manual_topics_dict.keys():
        if set(text).intersection(manual_topics_dict[topic]):
            output.append(topic)
        
    return output

In [20]:
docket_texts_output_DT = []
topics_DT = []

for text in docket_texts_output:
    topic = mannual_topic_assignment(text)
    if topic != []:
        docket_texts_output_DT.append([])
        topics_DT.append(topic)
    else:
        docket_texts_output_DT.append(text)
        topics_DT.append([])

In [21]:
print(topics_DT[:5])
print(docket_texts_output_DT[:5])

[['Complaints'], [], ['Others', 'Complaints', 'Service of Process'], [], ['Complaints', 'Notices']]
[[], ['file error pdf error civil cover sheet file'], [], ['rule corporate disclosure corporate parent document file'], []]


In [22]:
new_df['DT Topics'] = pd.Series(topics_DT)
new_df['Removed unnecessary POS & vocab DT'] = pd.Series(docket_texts_output_DT)
print(new_df.head())

                                Original Docket Text Organization Portion  \
0  FILING ERROR - DEFICIENT PLEADING - FILED AGAI...                        
1  FILING ERROR PDF ERROR CIVIL COVER SHEET filed...                        
2  REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...                        
3  RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...                        
4  ***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...                        

                                        Name Portion  \
0                        Majid Soueidan Rosen Samuel   
1                                       Rosen Samuel   
2  BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...   
3                        Majid Soueidan Rosen Samuel   
4                               Samuel Kenneth Rosen   

                            Identifying Org and Name  \
0  FILING ERROR - DEFICIENT PLEADING - FILED AGAI...   
1  FILING ERROR PDF ERROR CIVIL COVER SHEET filed...   
2  REQUEST FOR ISSUANCE OF SUMMO

In [23]:
#print some examples
for i in range(10):
    print(i)
    if new_df['DT Topics'].iloc[i] != []:
        print(new_df['DT Topics'].iloc[i])
        print(new_df['Removed unnecessary POS & vocab'].iloc[i])
        print(new_df['Removed unnecessary POS & vocab DT'].iloc[i])

0
['Complaints']
['file error deficient plead file error complaint document file']
[]
1
2
['Others', 'Complaints', 'Service of Process']
['request issuance summons breezeeastern corporation complaint document file']
[]
3
4
['Complaints', 'Notices']
['notice deficient pleading notice refile document complaint file deficient reason radio button select refile plead use event type complaint find event list complaint initiate document correct pdf select individually name filerfilers select individually name partyparties plead']
[]
5
['Notices']
['notice modification partyparties partyparties reasonreasons name contain typographical error text omit']
[]
6
['Notices']
['notice deficient civil cover sheet notice refile document civil cover sheet filing deficient reason citizenship principal fill despite federal question jurisdiction']
[]
7
['Service of Process']
['electronic summons issue']
[]
8
['Notices']
['notice open statistical error correction notice open statistical erroneously selected

In [25]:
def trigram_transform(texts):
    trigram_output = []

    remove_trigram = ['calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold', 'court_reportertranscriber_abovecaptioned_matter',
                      'redaction_calendar_day', 'rule_statement', 'obtain_pacer', 'may_obtain_pacer', 'reportertranscriber_abovecaptioned_matter',
                      'redact_transcript_deadline', 'send_chamber', "official_transcript_notice_give", "notice_intent_request", "proceed_hold", 
                      "fee_receipt_number", "civil_procedure", "pursuant_frcp", "official_transcript_conference", 
                      "purchase_reportertranscriber_deadline_release", "et_al", "mail_chamber", "transcript_restriction", "redaction_transcript", 
                      "transcript_view_public_terminal", "transcript_make_remotely", "associated_et_al", "electronically_available_public_without", 
                      "genesys_id", "release_transcript_restriction", "adar_bay", "redaction_request_due", "new_york", "official_transcript_conference", 
                      "transcript_make_remotely", "transcript_proceeding_conference_hold", "redaction_transcript",
                      'affidavit_jr._c.p.a', 'corporate_parent', 'certain_underwriter', 'federal_rule_civil_procedure', 'redaction_request', 
                      'official_transcript', 'rule_disclosure', 'rule_corporate_disclosure', 'place_vault', 'public_without_redaction_calendar', 
                      'purchase_deadline_release_transcript', 'transcript_proceeding_hold', 'transcript_remotely_electronically_available']
  

    for sentence in texts:
        unigram_review = []
        for word in sentence.split():
            unigram_review.append(word)
    
        #print('Uni: ', unigram_review)
        bigram_review = bigram_model[unigram_review]
        #print('Bi: ', bigram_review)
        trigram_review = trigram_model[bigram_review]
        trigram_review = [phrase for phrase in trigram_review if phrase not in remove_trigram]
        #print('Tri: ', trigram_review)
        trigram_output.append(' '.join(trigram_review))
    return trigram_output

In [28]:
bigram_model_filepath = 'docket_texts/train/DT/bigram_model_noorgnoname'
trigram_model_filepath = 'docket_texts/train/DT/trigram_model_nonamenoorg'

bigram_model = Phrases.load(bigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

new_df['Apply Trigram Phrase Model'] = new_df['Removed unnecessary POS & vocab DT'].apply(trigram_transform)



In [29]:
new_df['Apply Trigram Phrase Model'].head()

0                                               []
1    [file error pdf error civil_cover_sheet file]
2                                               []
3                                  [document file]
4                                               []
Name: Apply Trigram Phrase Model, dtype: object

In [30]:
#write trigram to file, not sure why...
trigram_dockets_filepath = 'docket_texts/test/DT/trigram_transformed_dockets_noorgnoname.txt'

with codecs.open(trigram_dockets_filepath, 'w', encoding= 'utf_8') as f:
    for i in range(len(new_df['Apply Trigram Phrase Model'])):
        f.write(' '.join(new_df['Apply Trigram Phrase Model'][i]) + '\n')

In [31]:
trigram_dictionary_filepath = 'docket_texts/test/DT/trigram_dict_noorgnoname.dict'

In [32]:
%%time

#some dictionary hyperparameters:
no_below = 10 #reference is 10
no_above = 0.4 #reference is 0.4

trigram_reviews = LineSentence(trigram_dockets_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare otrigram_reviewsr too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below = no_below, no_above = no_above) #this step is questionable. May need to change the parameters
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
#trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 50.7 ms


In [33]:
trigram_bow_filepath = 'docket_texts/test/DT/trigram_bow_corpus_noorgnoname.mm'

In [34]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        #print(review)
        #print(trigram_dictionary.doc2bow(review))
        yield trigram_dictionary.doc2bow(review)

In [35]:
trigram_sentences_filepath = 'docket_texts/test/DT/trigram_sentences_nonamenoorg.txt'

with codecs.open(trigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for trigram_sentence in list(new_df['Apply Trigram Phrase Model']):
        #print(' '.join(trigram_sentence))
        if 
        f.write(' '.join(trigram_sentence) + '\n')
        
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [36]:
trigram_bow_filepath = 'docket_texts/test/DT/trigram_bow_corpus_noorgnoname.mm'
trigram_dictionary_filepath = 'docket_texts/test/DT/trigram_dict_noorgnoname.dict'

In [37]:
%%time

# generate bag-of-words representations for
# all reviews and save them as a matrix
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_sentences_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
print(trigram_bow_corpus)

MmCorpus(263 documents, 25 features, 486 non-zero entries)
Wall time: 35.1 ms


In [51]:
trigram_bow_corpus.

<gensim.corpora.mmcorpus.MmCorpus at 0x262b6abf188>

In [38]:
%%time

#some dictionary hyperparameters:
no_below = 10 #reference is 10
no_above = 0.4 #reference is 0.4

trigram_reviews = LineSentence(trigram_dockets_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare otrigram_reviewsr too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below = no_below, no_above = no_above) #this step is questionable. May need to change the parameters
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dictionary_filepath)

Wall time: 9.52 ms


### Applying models to docket text test sets

In [41]:
# guessing we are using 4 topics
num_topics = 4
lda = LdaMulticore.load('docket_texts/train/DT/lda_model_noorgnomodel_' + str(num_topics))
train_dict = Dictionary.load('docket_texts/train/DT/trigram_dict_noorgnoname.dict')

test_dict = Dictionary.load(trigram_dictionary_filepath)

In [42]:
for new_id, token in test_dict.iteritems():
    print(new_id, token)

17 minute_entry_hold
19 associate
6 letter
2 affiliate
24 receiver
16 hold
22 bench_trial
7 order
23 fee
5 letter_address
0 error
15 schedule
4 inc
10 rule
13 motion
11 endorsement
18 pak6
3 create
20 motion_limine
12 memo_endorsement
8 application
14 mediation
1 deadline
21 grant
9 discovery


In [68]:
def lda_description(docket_text, lda, trigram_dictionary, topic_names, min_topic_freq = 0.05):
    '''
    accept the processed texts (trigram) of a review and 
    1) create a bag-of-words representation, 
    4) create an LDA representation, and
    5) print a sorted list of the top topics in the LDA representation
    '''
    output = []
    analyze_this = []
    for sentence in docket_text:
        analyze_this += sentence.split()
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(analyze_this)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda.sort(key = lambda tup: tup[1], reverse = True)
    #print(review_lda)
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        #print('{:25} {}'.format(topic_names[topic_number], round(freq, 3)))
        output.append((topic_names[topic_number], round(freq, 5)))
    return output

In [70]:
''' commented out due to new DT direction
# according to Chris' feedback on 2018-5-10
topic_names = {0: 'motion to dismiss',
               1: 'motion for summary judgment',
               2: 'Complaint and motion',
               3: 'Amended Complaint and motion'}
'''

topic_names = {}
for i in range(num_topics):
    topic_names[i] = 'Topic ' + str(i)

topic_summary = []
    
for docket_text in list(new_df['Apply Trigram Phrase Model']):
    #print(docket_text)
    if docket_text == []:
        topic_summary.append('')
    else:
        topic_summary.append(lda_description(docket_text, lda, trigram_dictionary, topic_names))


In [73]:
new_df['Topic Model Output'] = topic_summary
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab,DT Topics,Removed unnecessary POS & vocab DT,Apply Trigram Phrase Model,Topic Model Output
0,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,,Majid Soueidan Rosen Samuel,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,[file error deficient plead file error complai...,[Complaints],[],[],
1,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,,Rosen Samuel,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,[file error pdf error civil cover sheet file],[],[file error pdf error civil cover sheet file],[file error pdf error civil_cover_sheet file],"[(Topic 2, 0.74803), (Topic 0, 0.08516), (Topi..."
2,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,,BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,[request issuance summons breezeeastern corpor...,"[Others, Complaints, Service of Process]",[],[],
3,RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...,,Majid Soueidan Rosen Samuel,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,[rule corporate disclosure corporate parent do...,[],[rule corporate disclosure corporate parent do...,[document file],"[(Topic 0, 0.25), (Topic 1, 0.25), (Topic 2, 0..."
4,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,,Samuel Kenneth Rosen,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,[notice deficient pleading notice refile docum...,"[Complaints, Notices]",[],[],


In [74]:
new_df.columns

Index(['Original Docket Text', 'Organization Portion', 'Name Portion',
       'Identifying Org and Name', 'Stripped Org and Name',
       'Removed unnecessary POS & vocab', 'DT Topics',
       'Removed unnecessary POS & vocab DT', 'Apply Trigram Phrase Model',
       'Topic Model Output'],
      dtype='object')

In [114]:
new_df[['Original Docket Text', 'Removed unnecessary POS & vocab DT', 'Apply Trigram Phrase Model', 
        'DT Topics', 'Topic Output']].to_csv('docket_texts/test/DT/verify_this.csv', index = False)