## Notebook Goal:  
Use existing NLP and LDA methodologies to perform topic modeling on docket texts. Three hyperparameters are considered:
1. Whether to remove organizations from the docket texts, so that organizations themselves do not become topics.
2. Whether to remove names from the docket texts, so that names themselves do not become topics.
3. The number of topics: [2, 3, 5, 10].

Visualizations and a model summary are then produced for every combination of these settings.

In [2]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

# English stop-word list from NLTK, loaded once; the stop-word removal function further down reads it.
sw = stopwords.words("english")

# Locations of the Stanford NER classifier model and jar.
# NOTE(review): these are absolute paths on one Windows machine and must be edited before running elsewhere.
path_to_model = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz'
path_to_jar = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\stanford-ner.jar'
# Tagger instance used by the NER cells below via tagger.tag(tokens).
tagger = StanfordNERTagger(path_to_model, path_to_jar = path_to_jar)

In [3]:
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#visualization libraries
import pyLDAvis
import pyLDAvis.gensim



In [4]:
import os
import pandas as pd
import numpy as np
import codecs
import itertools as it
from bs4 import BeautifulSoup
import warnings
import pickle
from collections import Counter
import re
import datetime
import string
import time

# Export the local Java executable through the JAVAHOME environment variable.
# NOTE(review): presumably required so the Stanford NER jar can be launched - confirm; the path is machine-specific.
java_path = 'C:/Program Files/Java/jdk-10.0.1/bin/java.exe'
os.environ['JAVAHOME'] = java_path

In [5]:
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\inves\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\inves\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\inves\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [78]:
filename = 'docket_texts/train/DT/basic_df.pickle'

In [79]:
# Load the pickled dataframe stored at `filename`.
# NOTE(review): pickle.load can execute arbitrary code - only open pickle files from a trusted source.
with open(filename, 'rb') as handle:
    NER_df = pickle.load(handle)

In [80]:
new_df = NER_df.copy()

In [81]:
docket_original = list(new_df['Original Docket Text'])

### Actually we can do some deduping... but can wait as well

In [9]:
len(set(docket_original))

3203

In [11]:
def valid_date(datestring, dayfirst=True):
    """Return True if ``datestring`` is a real calendar date.

    The string must start with three numeric fields separated by ``/``, ``.``
    or ``-`` and end right after the year: a 1-2 digit field, an exactly
    2-digit field and a 4-digit year.

    Parameters
    ----------
    datestring : str
        Candidate date text.
    dayfirst : bool, optional
        When True (the default, and the historical behaviour) the fields are
        read as day/month/year.  Pass False to read them as month/day/year.

    Returns
    -------
    bool
        True when the pattern matches and the fields form a valid date,
        False otherwise.
    """
    # Raw string: keeps "\d" a regex escape instead of an invalid string escape.
    mat = re.match(r'(\d{1,2})[/.-](\d{2})[/.-](\d{4})$', datestring)
    if mat is None:
        return False

    first, second, year = map(int, mat.groups())
    day, month = (first, second) if dayfirst else (second, first)
    try:
        # datetime raises ValueError for impossible dates such as 31/02.
        datetime.datetime(year, month, day)
    except ValueError:
        return False
    return True
    
valid_date('003/11/2016')

False

### 1. Normalize

In [78]:
url_regex1 = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?
(?!@)))"""
# Patterns for the normalization pipeline.  Raw strings keep regex escapes
# such as \d, \/ and \( from being read as (invalid) string escapes.
url_regex2 = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# (?i): the text is lower-cased before this pattern runs, so the capitalised
# month names can only match when case is ignored.
date_regex = r'(?i)(\d{1,2}[\/ ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[\/ ]\d{2,4})'
punct_regex = r"[^a-zA-Z0-9]"
num_regex = r"\d+"
extraspace_regex = " +"

# Each stage keeps its own list so the NER comparison cells can inspect it.
docket_normalized = [text.lower() for text in docket_original]
# Placeholders are lower case, matching the rest of the lower-cased text.
docket_nourl = [re.sub(url_regex2, "url", text) for text in docket_normalized]
docket_nourl = [re.sub(url_regex1, "url", text) for text in docket_nourl]
docket_nodate = [re.sub(date_regex, "date", text) for text in docket_nourl]
docket_nopunct = [re.sub(punct_regex, " ", text) for text in docket_nodate]
docket_nonum = [re.sub(num_regex, " ", text) for text in docket_nopunct]
docket_noextraspace = [re.sub(extraspace_regex, " ", text) for text in docket_nonum]

COMPLAINT against Cardiogenics Holdings, Inc. filing fee $ 400, receipt number 0207-8445206 Was the Disclosure Statement on Civil Cover Sheet completed -YES,, filed by LG Capital Funding, LLC. (Steinmetz, Michael) (Additional attachment(s) added on 3/11/2016: # 1 Civil Cover Sheet, # 2 Proposed Summons) (Bowens, Priscilla). (Entered: 03/10/2016) 

complaint against cardiogenics holdings inc filing fee receipt number was the disclosure statement on civil cover sheet completed yes filed by lg capital funding llc steinmetz michael additional attachment s added on date civil cover sheet proposed summons bowens priscilla entered date 


In [13]:
print(docket_original[0], '\n')

print(docket_noextraspace[0])

COMPLAINT against Cardiogenics Holdings, Inc. filing fee $ 400, receipt number 0207-8445206 Was the Disclosure Statement on Civil Cover Sheet completed -YES,, filed by LG Capital Funding, LLC. (Steinmetz, Michael) (Additional attachment(s) added on 3/11/2016: # 1 Civil Cover Sheet, # 2 Proposed Summons) (Bowens, Priscilla). (Entered: 03/10/2016) 

complaint against cardiogenics holdings inc filing fee receipt number was the disclosure statement on civil cover sheet completed yes filed by lg capital funding llc steinmetz michael additional attachment s added on date civil cover sheet proposed summons bowens priscilla entered date 


### 2. Split and tokenize

In [14]:
docket_tokenized = [word_tokenize(text) for text in docket_noextraspace]
docket_tokenized[0]

['complaint',
 'against',
 'cardiogenics',
 'holdings',
 'inc',
 'filing',
 'fee',
 'receipt',
 'number',
 'was',
 'the',
 'disclosure',
 'statement',
 'on',
 'civil',
 'cover',
 'sheet',
 'completed',
 'yes',
 'filed',
 'by',
 'lg',
 'capital',
 'funding',
 'llc',
 'steinmetz',
 'michael',
 'additional',
 'attachment',
 's',
 'added',
 'on',
 'date',
 'civil',
 'cover',
 'sheet',
 'proposed',
 'summons',
 'bowens',
 'priscilla',
 'entered',
 'date']

### 3. Remove Stop words

In [79]:
# Build the stop-word set once from the preloaded `sw` list instead of calling
# stopwords.words("english") for every token; this also keeps the cell working
# after the name `stopwords` is rebound later in the notebook.
stop_word_set = set(sw)
docket_nostop = [[w for w in words if w not in stop_word_set] for words in docket_tokenized]
print(docket_nostop[0])

['complaint', 'cardiogenics', 'holdings', 'inc', 'filing', 'fee', 'receipt', 'number', 'disclosure', 'statement', 'civil', 'cover', 'sheet', 'completed', 'yes', 'filed', 'lg', 'capital', 'funding', 'llc', 'steinmetz', 'michael', 'additional', 'attachment', 'added', 'date', 'civil', 'cover', 'sheet', 'proposed', 'summons', 'bowens', 'priscilla', 'entered', 'date']


### 4. Lemmatization

In [80]:
# One lemmatizer instance is enough; tokens are lemmatized as verbs (pos='v').
lemmatizer = WordNetLemmatizer()
docket_lemmed = [[lemmatizer.lemmatize(w, pos='v') for w in words] for words in docket_nostop]
print(docket_lemmed[0])

['complaint', 'cardiogenics', 'hold', 'inc', 'file', 'fee', 'receipt', 'number', 'disclosure', 'statement', 'civil', 'cover', 'sheet', 'complete', 'yes', 'file', 'lg', 'capital', 'fund', 'llc', 'steinmetz', 'michael', 'additional', 'attachment', 'add', 'date', 'civil', 'cover', 'sheet', 'propose', 'summon', 'bowens', 'priscilla', 'enter', 'date']


### 5. Phrase Modeling

In [81]:
docket_phrase1 = [' '.join(text) for text in docket_lemmed]
docket_phrase1[:5]

['complaint cardiogenics hold inc file fee receipt number disclosure statement civil cover sheet complete yes file lg capital fund llc steinmetz michael additional attachment add date civil cover sheet propose summon bowens priscilla enter date',
 'case assign judge ann donnelly magistrate judge vera scanlon please download review individual practice assign judge locate website attorneys responsible provide courtesy copy judge individual practice require bowens priscilla enter date',
 'summon issue cardiogenics hold inc bowens priscilla enter date',
 'notice email attorney regard miss second page civil cover sheet bowens priscilla enter date',
 'accordance rule federal rule civil procedure local rule party notify party consent unite state magistrate judge court available conduct proceed civil action include jury nonjury trial order entry final judgment attach notice blank copy consent form fill sign file electronically party wish consent form may also access follow link url may withhol

In [82]:
unigram_sentences_filepath = 'docket_texts/train/DT/unigram_nltk_newsop.txt'

In [83]:
%%time
# turn the lemmatized corpus into unigram sentences
with codecs.open(unigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for sentence in docket_phrase1:
        f.write(sentence + '\n')

Wall time: 15.4 ms


In [84]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [85]:
bigram_model_filepath = 'docket_texts/train/DT/bigram_model_newsop' 

In [86]:
%%time

# store our bigram model
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk if we don't want to run this again
#bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 250 ms


In [87]:
bigram_sentences_filepath = 'docket_texts/train/DT/bigram_sentences_newsop.txt'

In [88]:
%%time

# Run every unigram sentence through the bigram model and store the result,
# one space-joined sentence per line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as bigram_file:
    for unigram_tokens in unigram_sentences:
        bigram_tokens = bigram_model[unigram_tokens]
        bigram_file.write(' '.join(bigram_tokens) + '\n')



Wall time: 484 ms


In [89]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [90]:
print('\nUnigram sentence:')
for unigram_sentence in it.islice(unigram_sentences, 0, 10):
    print(' '.join(unigram_sentence))
print('\nBigram sentence:')
for bigram_sentence in it.islice(bigram_sentences, 0, 10):
    print(' '.join(bigram_sentence))


Unigram sentence:
complaint cardiogenics hold inc file fee receipt number disclosure statement civil cover sheet complete yes file lg capital fund llc steinmetz michael additional attachment add date civil cover sheet propose summon bowens priscilla enter date
case assign judge ann donnelly magistrate judge vera scanlon please download review individual practice assign judge locate website attorneys responsible provide courtesy copy judge individual practice require bowens priscilla enter date
summon issue cardiogenics hold inc bowens priscilla enter date
notice email attorney regard miss second page civil cover sheet bowens priscilla enter date
accordance rule federal rule civil procedure local rule party notify party consent unite state magistrate judge court available conduct proceed civil action include jury nonjury trial order entry final judgment attach notice blank copy consent form fill sign file electronically party wish consent form may also access follow link url may withho

In [91]:
trigram_model_filepath = 'docket_texts/train/DT/trigram_model_newsop'

In [92]:
%%time

# again, using Phrases to attach more words to phrases already formed
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

# load the finished model from disk
#trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 250 ms


In [93]:
trigram_sentences_filepath = 'docket_texts/train/DT/trigram_sentences_newsop.txt'

In [94]:
%%time

# Run every bigram sentence through the trigram model and store the result,
# one space-joined sentence per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as trigram_file:
    for bigram_tokens in bigram_sentences:
        trigram_tokens = trigram_model[bigram_tokens]
        trigram_file.write(' '.join(trigram_tokens) + '\n')



Wall time: 437 ms


In [95]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [96]:
# Show the same slice of documents at each phrase-modeling stage.
start, finish = 0, 5
print('Original text:')
print(docket_phrase1[start:finish], '\n')

ngram_views = (
    ('\nUNIGRAM Sentence:', unigram_sentences),
    ('\nBIGRAM Sentence:', bigram_sentences),
    ('\nTRIGRAM Sentence:', trigram_sentences),
)
for heading, sentences in ngram_views:
    print(heading)
    for sentence_tokens in it.islice(sentences, start, finish):
        print(' '.join(sentence_tokens))

Original text:
['complaint cardiogenics hold inc file fee receipt number disclosure statement civil cover sheet complete yes file lg capital fund llc steinmetz michael additional attachment add date civil cover sheet propose summon bowens priscilla enter date', 'case assign judge ann donnelly magistrate judge vera scanlon please download review individual practice assign judge locate website attorneys responsible provide courtesy copy judge individual practice require bowens priscilla enter date', 'summon issue cardiogenics hold inc bowens priscilla enter date', 'notice email attorney regard miss second page civil cover sheet bowens priscilla enter date', 'accordance rule federal rule civil procedure local rule party notify party consent unite state magistrate judge court available conduct proceed civil action include jury nonjury trial order entry final judgment attach notice blank copy consent form fill sign file electronically party wish consent form may also access follow link url 

In [64]:
# Phrases filtered out of the trigram output (duplicates from the original list removed).
_REMOVE_TRIGRAMS = frozenset([
    'calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold',
    'court_reportertranscriber_abovecaptioned_matter', 'redaction_calendar_day',
    'rule_statement', 'obtain_pacer', 'may_obtain_pacer',
    'reportertranscriber_abovecaptioned_matter', 'redact_transcript_deadline',
    'send_chamber', "official_transcript_notice_give", "notice_intent_request",
    "proceed_hold", "fee_receipt_number", "civil_procedure", "pursuant_frcp",
    "official_transcript_conference", "purchase_reportertranscriber_deadline_release",
    "et_al", "mail_chamber", "transcript_restriction", "redaction_transcript",
    "transcript_view_public_terminal", "transcript_make_remotely", "associated_et_al",
    "electronically_available_public_without", "genesys_id",
    "release_transcript_restriction", "adar_bay", "redaction_request_due", "new_york",
    "transcript_proceeding_conference_hold", 'affidavit_jr._c.p.a', 'corporate_parent',
    'certain_underwriter', 'federal_rule_civil_procedure', 'redaction_request',
    'official_transcript', 'rule_disclosure', 'rule_corporate_disclosure', 'place_vault',
    'public_without_redaction_calendar', 'purchase_deadline_release_transcript',
    'transcript_proceeding_hold', 'transcript_remotely_electronically_available',
    'minute_entry_hold', 'discovery_hear_hold', 'jury_trial_hold', "sign_judge",
])


def trigram_transform(texts, verbose=False):
    """Apply the bigram and trigram phrase models to one text and drop unwanted phrases.

    Parameters
    ----------
    texts : object
        The document; it is converted with ``str()`` and split on whitespace.
        ``None`` is passed through unchanged.
    verbose : bool, optional
        When True, print the token list after each stage (default False).

    Returns
    -------
    str or None
        Space-joined phrases that are not in ``_REMOVE_TRIGRAMS``, or ``None``
        when ``texts`` is ``None``.

    Notes
    -----
    Relies on the module-level ``bigram_model`` and ``trigram_model``.
    """
    # Check before str(): str(None) is the string 'None', which used to make
    # this guard unreachable.
    if texts is None:
        return None

    unigram_review = str(texts).split()
    if verbose:
        print('Uni: ', unigram_review)

    bigram_review = bigram_model[unigram_review]
    if verbose:
        print('Bi: ', bigram_review)

    trigram_review = trigram_model[bigram_review]
    if verbose:
        print('Tri: ', trigram_review)

    trigram_review = [phrase for phrase in trigram_review if phrase not in _REMOVE_TRIGRAMS]
    if verbose:
        print('Tri removed: ', trigram_review)

    return ' '.join(trigram_review)

In [97]:
docket_phrase2 = [trigram_transform(text) for text in docket_phrase1]



In [99]:
print(docket_original[:5])
print(docket_phrase2[:5])
len(set(docket_phrase2))

['COMPLAINT against Cardiogenics Holdings, Inc. filing fee $ 400, receipt number 0207-8445206 Was the Disclosure Statement on Civil Cover Sheet completed -YES,, filed by LG Capital Funding, LLC. (Steinmetz, Michael) (Additional attachment(s) added on 3/11/2016: # 1 Civil Cover Sheet, # 2 Proposed Summons) (Bowens, Priscilla). (Entered: 03/10/2016)', 'Case assigned to Judge Ann M Donnelly and Magistrate Judge Vera M. Scanlon. Please download and review the Individual Practices of the assigned Judges, located on our website. Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such. (Bowens, Priscilla) (Entered: 03/11/2016)', 'Summons Issued as to Cardiogenics Holdings, Inc.. (Bowens, Priscilla) (Entered: 03/11/2016)', 'NOTICE - emailed attorney regarding missing second page of the civil cover sheet. (Bowens, Priscilla) (Entered: 03/11/2016)', 'In accordance with Rule 73 of the Federal Rules of Civil Procedure and Local Rule 73.1, the

3049

### 6. Use NER (named entity recognition) and GPE (geo-political entity)
This did not work as well as expected. The Stanford tagger also performed better than the NLTK one.
After the text treatments applied above, it becomes hard for either tagger to identify named entities.

In [106]:
# Run NLTK's POS tagger + NE chunker on the first docket after every
# preprocessing stage, to see at which point the entity labels are lost.
nltk_ner_stages = (
    ('orignal text: ', docket_original),
    ('normalized: ', docket_normalized),
    ('no url: ', docket_nourl),
    ('no date: ', docket_nodate),
    ('no punctuation: ', docket_nopunct),
    ('no numbers: ', docket_nonum),
    ('no extra spaces: ', docket_noextraspace),
    ('lemmentized: ', docket_phrase1),
    ('phrase modeled: ', docket_phrase2),
)
last_stage = len(nltk_ner_stages) - 1
for stage_idx, (stage_label, stage_docs) in enumerate(nltk_ner_stages):
    iob_tagged = tree2conlltags(ne_chunk(pos_tag(word_tokenize(stage_docs[0]))))
    print(stage_label)
    if stage_idx == last_stage:
        # The final stage is printed without the trailing blank line.
        print(iob_tagged)
    else:
        print(iob_tagged, '\n')

orignal text: 
[('COMPLAINT', 'NNP', 'B-ORGANIZATION'), ('against', 'IN', 'O'), ('Cardiogenics', 'NNP', 'B-ORGANIZATION'), ('Holdings', 'NNPS', 'I-ORGANIZATION'), (',', ',', 'O'), ('Inc.', 'NNP', 'O'), ('filing', 'VBG', 'O'), ('fee', 'JJ', 'O'), ('$', '$', 'O'), ('400', 'CD', 'O'), (',', ',', 'O'), ('receipt', 'JJ', 'O'), ('number', 'NN', 'O'), ('0207-8445206', 'NN', 'O'), ('Was', 'NNP', 'O'), ('the', 'DT', 'O'), ('Disclosure', 'NNP', 'B-ORGANIZATION'), ('Statement', 'NNP', 'O'), ('on', 'IN', 'O'), ('Civil', 'NNP', 'B-PERSON'), ('Cover', 'NNP', 'I-PERSON'), ('Sheet', 'NNP', 'I-PERSON'), ('completed', 'VBD', 'O'), ('-YES', 'NNP', 'O'), (',', ',', 'O'), (',', ',', 'O'), ('filed', 'VBN', 'O'), ('by', 'IN', 'O'), ('LG', 'NNP', 'B-ORGANIZATION'), ('Capital', 'NNP', 'I-ORGANIZATION'), ('Funding', 'NNP', 'I-ORGANIZATION'), (',', ',', 'O'), ('LLC', 'NNP', 'B-ORGANIZATION'), ('.', '.', 'O'), ('(', '(', 'O'), ('Steinmetz', 'NNP', 'B-PERSON'), (',', ',', 'O'), ('Michael', 'NNP', 'B-GPE'), (')', '

In [108]:
# Same stage-by-stage comparison as for the NLTK chunker, this time with the
# Stanford NER tagger.
stanford_ner_stages = (
    ('orignal text: ', docket_original),
    ('normalized: ', docket_normalized),
    ('no url: ', docket_nourl),
    ('no date: ', docket_nodate),
    ('no punctuation: ', docket_nopunct),
    ('no numbers: ', docket_nonum),
    ('no extra spaces: ', docket_noextraspace),
    ('lemmentized: ', docket_phrase1),
    ('phrase modeled: ', docket_phrase2),
)
last_stage = len(stanford_ner_stages) - 1
for stage_idx, (stage_label, stage_docs) in enumerate(stanford_ner_stages):
    iob_tagged = tagger.tag(word_tokenize(stage_docs[0]))
    print(stage_label)
    if stage_idx == last_stage:
        # The final stage is printed without the trailing blank line.
        print(iob_tagged)
    else:
        print(iob_tagged, '\n')

orignal text: 
[('COMPLAINT', 'O'), ('against', 'O'), ('Cardiogenics', 'ORGANIZATION'), ('Holdings', 'ORGANIZATION'), (',', 'ORGANIZATION'), ('Inc.', 'ORGANIZATION'), ('filing', 'O'), ('fee', 'O'), ('$', 'O'), ('400', 'O'), (',', 'O'), ('receipt', 'O'), ('number', 'O'), ('0207-8445206', 'O'), ('Was', 'O'), ('the', 'O'), ('Disclosure', 'O'), ('Statement', 'O'), ('on', 'O'), ('Civil', 'O'), ('Cover', 'O'), ('Sheet', 'O'), ('completed', 'O'), ('-YES', 'O'), (',', 'O'), (',', 'O'), ('filed', 'O'), ('by', 'O'), ('LG', 'ORGANIZATION'), ('Capital', 'ORGANIZATION'), ('Funding', 'ORGANIZATION'), (',', 'O'), ('LLC', 'O'), ('.', 'O'), ('(', 'PERSON'), ('Steinmetz', 'PERSON'), (',', 'O'), ('Michael', 'PERSON'), (')', 'O'), ('(', 'O'), ('Additional', 'O'), ('attachment', 'O'), ('(', 'O'), ('s', 'O'), (')', 'O'), ('added', 'O'), ('on', 'O'), ('3112016', 'O'), (':', 'O'), ('#', 'O'), ('1', 'O'), ('Civil', 'O'), ('Cover', 'O'), ('Sheet', 'O'), (',', 'O'), ('#', 'O'), ('2', 'O'), ('Proposed', 'O'), ('S

### So let's start over

In [116]:
docket_original = list(new_df['Original Docket Text'])

### 1. NER
Stanford is much better at knowing what's going on

In [82]:
# For the first five dockets, tag every token with both taggers and use the
# Stanford label to build: the organization tokens, the person tokens, the
# text with entities replaced by placeholders, and the text with entities removed.
entity_placeholders = {'ORGANIZATION': '-ORG-', 'PERSON': '-NAME-'}

output = []
for i in range(5):
    tokens = word_tokenize(docket_original[i])
    stanford_tags = tagger.tag(tokens)
    nltk_tags = tree2conlltags(ne_chunk(pos_tag(tokens)))

    org_str, name_str = [], []
    stripped_str1, stripped_str2 = [], []
    for SFlabel, NLlabel, token in zip(stanford_tags, nltk_tags, tokens):
        # The NLTK label is only printed for side-by-side comparison.
        print(SFlabel, NLlabel, token)
        entity = SFlabel[1]
        if entity in entity_placeholders:
            collected = org_str if entity == 'ORGANIZATION' else name_str
            collected.append(SFlabel[0])
            stripped_str1.append(entity_placeholders[entity])
        else:
            stripped_str1.append(token)
            stripped_str2.append(token)

    output.append([
        docket_original[i],
        ' '.join(org_str),
        ' '.join(name_str),
        ' '.join(stripped_str1),
        ' '.join(stripped_str2),
    ])

for i in range(5):
    print('docket text:', i)
    print(output[i], '\n')

('COMPLAINT', 'O') ('COMPLAINT', 'NNP', 'B-ORGANIZATION') COMPLAINT
('against', 'O') ('against', 'IN', 'O') against
('Cardiogenics', 'ORGANIZATION') ('Cardiogenics', 'NNP', 'B-ORGANIZATION') Cardiogenics
('Holdings', 'ORGANIZATION') ('Holdings', 'NNPS', 'I-ORGANIZATION') Holdings
(',', 'ORGANIZATION') (',', ',', 'O') ,
('Inc.', 'ORGANIZATION') ('Inc.', 'NNP', 'O') Inc.
('filing', 'O') ('filing', 'VBG', 'O') filing
('fee', 'O') ('fee', 'JJ', 'O') fee
('$', 'O') ('$', '$', 'O') $
('400', 'O') ('400', 'CD', 'O') 400
(',', 'O') (',', ',', 'O') ,
('receipt', 'O') ('receipt', 'JJ', 'O') receipt
('number', 'O') ('number', 'NN', 'O') number
('0207-8445206', 'O') ('0207-8445206', 'NN', 'O') 0207-8445206
('Was', 'O') ('Was', 'NNP', 'O') Was
('the', 'O') ('the', 'DT', 'O') the
('Disclosure', 'O') ('Disclosure', 'NNP', 'B-ORGANIZATION') Disclosure
('Statement', 'O') ('Statement', 'NNP', 'O') Statement
('on', 'O') ('on', 'IN', 'O') on
('Civil', 'O') ('Civil', 'NNP', 'B-PERSON') Civil
('Cover', 'O')

After some experiments, we'll stick with the Stanford NER first... leveraging the existing dataframe.
### 2. Normalize

In [117]:
filename = 'docket_texts/train/DT/basic_df.pickle'
#to load
with open(filename, 'rb') as handle:
    NER_df = pickle.load(handle)
    
new_df = NER_df.copy()
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n..."
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :..."
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...


In [118]:
### Normalize
def normalize(docket_original):
    """Normalize one docket text for topic modeling.

    The text is split on single spaces and every piece is lower-cased, has
    URLs replaced by ``url`` and dates by ``date``, and has each
    non-alphanumeric character and each run of digits replaced by a space.
    The pieces are then re-joined and runs of spaces collapsed to one.

    Parameters
    ----------
    docket_original : str
        Raw docket text.

    Returns
    -------
    str
        The normalized text.  A single leading or trailing space may remain,
        because the result is not stripped.
    """
    # Top-level domains recognised by the URL pattern; kept in one place
    # because the pattern needs the list twice.
    tlds = "com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw"

    # The pattern is assembled without any line break: a newline inside the
    # pattern is a literal character (no re.VERBOSE) and would stop the
    # bare-domain alternative ("example.com") from ever matching.
    url_regex1 = re.compile(
        r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:""" + tlds + r""")/)"""
        r"""(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+"""
        r"""(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])"""
        r"""|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:""" + tlds + r""")\b/?(?!@)))"""
    )
    url_regex2 = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # IGNORECASE: the text is lower-cased before this pattern runs, so the
    # capitalised month names would otherwise never match.
    date_regex = re.compile(
        r'(\d{1,2}[\/ ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[\/ ]\d{2,4})',
        re.IGNORECASE,
    )
    punct_regex = re.compile(r"[^a-zA-Z0-9]")
    num_regex = re.compile(r"\d+")
    extraspace_regex = re.compile(" +")

    cleaned_pieces = []
    for piece in docket_original.split(' '):
        piece = piece.lower()
        piece = url_regex1.sub("url", piece)
        piece = url_regex2.sub("url", piece)
        piece = date_regex.sub("date", piece)
        piece = punct_regex.sub(" ", piece)
        piece = num_regex.sub(" ", piece)
        cleaned_pieces.append(piece)

    return extraspace_regex.sub(" ", ' '.join(cleaned_pieces))

In [119]:
# Normalize the entity-stripped docket text and report how long it took.
started_at = time.time()
new_df['normalized'] = new_df['Stripped Org and Name'].apply(normalize)
elapsed = time.time() - started_at
print('took {} seconds'.format(elapsed))

took 1.1314857006072998 seconds


### 3. Split and tokenize

In [120]:
def tokenize(docket_noextraspace):
    """Split a normalized docket string into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(docket_noextraspace)
    return tokens

In [121]:
t0 = time.time()
new_df['tokenize'] = new_df['normalized'].apply(tokenize)
print('took {} seconds'.format(time.time() - t0))

took 0.8651938438415527 seconds


### 4. Remove Stop words

In [122]:
def stopwords(docket_tokenized, stop_words=None):
    """Return the tokens of ``docket_tokenized`` that are not stop words.

    Parameters
    ----------
    docket_tokenized : iterable of str
        Tokens of one docket text.
    stop_words : iterable of str, optional
        Words to drop.  Defaults to the module-level ``sw`` list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.

    Notes
    -----
    NOTE(review): this function rebinds the name ``stopwords`` imported from
    ``nltk.corpus``; any code that calls ``stopwords.words(...)`` after this
    definition has run will fail.
    """
    if stop_words is None:
        stop_words = sw
    # Set membership is O(1) per token instead of a scan of the whole list.
    stop_set = set(stop_words)
    return [word for word in docket_tokenized if word not in stop_set]

In [123]:
t0 = time.time()
new_df['nostop'] = new_df['tokenize'].apply(stopwords)
print('took {} seconds'.format(time.time() - t0))

took 0.2962186336517334 seconds


### 5. Manual Topic Assignment - Decision tree to identify keywords and topics based on Chris' feedback

In [124]:
# Load the keyword table (one column per topic) and lower-case every keyword.
manual_topics_df = pd.read_csv('mannual_topics_20180828.csv') #this is updated
manual_topics_df = manual_topics_df.apply(lambda column: column.astype(str).str.lower())
# Map each topic to its keywords, dropping the 'nan' strings that astype(str)
# produces for empty cells.
manual_topics_dict = {
    topic: [keyword for keyword in keywords if keyword != 'nan']
    for topic, keywords in manual_topics_df.to_dict('list').items()
}

In [125]:
#output a list of topics
def mannual_topic_assignment(text, topics=None):
    """Return the manual topics whose keywords occur in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one docket; they are joined with single spaces before
        matching, so multi-word keywords can match across tokens.
    topics : dict, optional
        Mapping of topic name to a list of keywords.  Defaults to the
        module-level ``manual_topics_dict``.

    Returns
    -------
    str
        Matching topic names joined by ``', '`` in the iteration order of the
        mapping, or ``''`` when no keyword is found.  A keyword matches as a
        plain substring of the joined text.
    """
    if topics is None:
        topics = manual_topics_dict

    # Join once; the original rebuilt this string for every keyword.
    joined_text = ' '.join(text)
    matched_topics = [
        topic
        for topic, keywords in topics.items()
        if any(keyword in joined_text for keyword in keywords)
    ]
    # Each topic appears at most once and in a deterministic order (the
    # original went through set(), whose ordering can vary between runs).
    return ', '.join(matched_topics)

In [126]:
docket_texts_output = list(new_df['nostop'])

# One comma-separated topic string per docket ('' when no keyword matched).
topics_DT = [mannual_topic_assignment(text) for text in docket_texts_output]

# Dockets that received a manual topic are blanked out; only the remaining
# ones keep their tokens.
docket_texts_output_DT = [
    text if topic == '' else ''
    for text, topic in zip(docket_texts_output, topics_DT)
]

In [127]:
# Build the new columns on new_df's own index: a Series with the default
# RangeIndex is aligned by label on assignment, which would yield NaN rows
# if new_df were ever filtered or re-indexed.
new_df['DT Topics'] = pd.Series(topics_DT, index=new_df.index)
new_df['Removed unnecessary POS & vocab DT'] = pd.Series(docket_texts_output_DT, index=new_df.index)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,normalized,tokenize,nostop,DT Topics,Removed unnecessary POS & vocab DT
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",complaint against filing fee receipt number wa...,"[complaint, against, filing, fee, receipt, num...","[complaint, filing, fee, receipt, number, disc...","Motions, Complaints, Service of Process",
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,case assigned to judge and magistrate judge pl...,"[case, assigned, to, judge, and, magistrate, j...","[case, assigned, judge, magistrate, judge, ple...",,"[case, assigned, judge, magistrate, judge, ple..."
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",summons issued as to inc entered date,"[summons, issued, as, to, inc, entered, date]","[summons, issued, inc, entered, date]",Service of Process,
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,notice emailed attorney regarding missing seco...,"[notice, emailed, attorney, regarding, missing...","[notice, emailed, attorney, regarding, missing...",Notices,
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,in accordance with rule of the federal rules o...,"[in, accordance, with, rule, of, the, federal,...","[accordance, rule, federal, rules, civil, proc...","Notices, Motions, Judgment, Order",


### 6. Lemmatization

In [128]:
def lemm(docket_nostop):
    """Lemmatize every token of one docket as a verb and join the results.

    Parameters
    ----------
    docket_nostop : iterable of str, or None / NaN
        Tokens of a single docket text.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces, or '' when there
        is nothing to lemmatize.
    """
    # A missing cell (None, or the float NaN pandas uses for blanks) cannot be
    # iterated; treat it like an empty docket instead of raising TypeError.
    if docket_nostop is None or isinstance(docket_nostop, float):
        return ''
    tokens = list(docket_nostop)
    if not tokens:
        return ''
    # One lemmatizer per docket instead of a fresh instance for every token.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in tokens)

In [129]:
# wall-clock start, used only for the elapsed-time message below
t0 = time.time()
# run lemm() on every row's filtered token list and keep the result as 'lemmed'
new_df['lemmed'] = new_df['Removed unnecessary POS & vocab DT'].apply(lemm)
print('took {} seconds'.format(time.time() - t0))

took 3.4011993408203125 seconds


In [130]:
# preview the first rows of new_df
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,normalized,tokenize,nostop,DT Topics,Removed unnecessary POS & vocab DT,lemmed
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",complaint against filing fee receipt number wa...,"[complaint, against, filing, fee, receipt, num...","[complaint, filing, fee, receipt, number, disc...","Motions, Complaints, Service of Process",,
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,case assigned to judge and magistrate judge pl...,"[case, assigned, to, judge, and, magistrate, j...","[case, assigned, judge, magistrate, judge, ple...",,"[case, assigned, judge, magistrate, judge, ple...",case assign judge magistrate judge please down...
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",summons issued as to inc entered date,"[summons, issued, as, to, inc, entered, date]","[summons, issued, inc, entered, date]",Service of Process,,
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,notice emailed attorney regarding missing seco...,"[notice, emailed, attorney, regarding, missing...","[notice, emailed, attorney, regarding, missing...",Notices,,
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,in accordance with rule of the federal rules o...,"[in, accordance, with, rule, of, the, federal,...","[accordance, rule, federal, rules, civil, proc...","Notices, Motions, Judgment, Order",,


### 7. Phrase Modeling

In [131]:
# one lemmatized string per docket, in row order; show the first five
docket_phrase2 = new_df['lemmed'].tolist()
docket_phrase2[:5]

['',
 'case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date',
 '',
 '',
 '']

In [132]:
# text file that receives the lemmatized dockets, one per line
unigram_sentences_filepath = 'docket_texts/train/DT/unigram_nltk_newsop2.txt'

In [133]:
%%time
# turn the lemmatized corpus into unigram sentences
with codecs.open(unigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for sentence in docket_phrase2:
        f.write(sentence + '\n')

Wall time: 3.99 ms


In [134]:
# stream the unigram file back line by line; each item is iterated as a list of tokens below
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [135]:
# where the trained bigram Phrases model is saved
bigram_model_filepath = 'docket_texts/train/DT/bigram_model_newsop2' 

In [136]:
%%time

# fit a Phrases model on the unigram sentences and save it to disk
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)
    
# to skip retraining on a later run, load the saved model instead:
#bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 151 ms


In [137]:
# text file that receives the bigram-merged sentences, one per line
bigram_sentences_filepath = 'docket_texts/train/DT/bigram_sentences_newsop2.txt'

In [138]:
%%time

# apply the bigram model, and write it to file
with codecs.open(bigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = ' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')

Wall time: 58.8 ms




In [139]:
# stream the bigram sentence file back line by line
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [140]:
# show the first ten sentences of each corpus, tokens separated by spaces
print('\nUnigram sentence:')
for unigram_sentence in it.islice(unigram_sentences, 10):
    print(*unigram_sentence)
print('\nBigram sentence:')
for bigram_sentence in it.islice(bigram_sentences, 10):
    print(*bigram_sentence)


Unigram sentence:
case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
case reassign magistrate judge magistrate judge longer assign case please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
status report llc kehrli enter date
case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
case reassign magistrate judge chief mag judge longer assign case please download review locate website attorneys responsible provide courtesy copy judge individual practice require marziliano august enter date
selection mediator select mediator first mediation session take place january upon completion mediation party must submit mediation report find https url mediation report must e mail robyn weinstein url within two we

In [141]:
# where the trained trigram Phrases model is saved
trigram_model_filepath = 'docket_texts/train/DT/trigram_model_newsop2'

In [142]:
%%time

# fit a second Phrases model on the bigram sentences, so already-merged
# phrases can pick up further words, then save it to disk
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

# to skip retraining on a later run, load the saved model instead:
#trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 22.9 ms


In [143]:
# text file that receives the trigram-merged sentences, one per line
trigram_sentences_filepath = 'docket_texts/train/DT/trigram_sentences_newsop2.txt'

In [144]:
%%time

with codecs.open(trigram_sentences_filepath, 'w', encoding = 'utf_8') as f:
    for bigram_sentence in bigram_sentences:
        #print('Bi', bigram_sentence)
        trigram_sentence = ' '.join(trigram_model[bigram_sentence])
        #print('Tri', trigram_sentence)
        f.write(trigram_sentence + '\n')

Wall time: 23 ms




In [145]:
# stream the trigram sentence file back line by line
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [147]:
# window of sentences to compare across the three phrase-modeling stages
start, finish = 0, 5
print('Original text:')
print(docket_phrase2[start:finish], '\n')

# NOTE(review): the saved output lists empty strings under "Original text" but
# only non-empty sentences below, which suggests blank lines are skipped when the
# files are streamed back - the slices are then not row-aligned; confirm.
print('\nUNIGRAM Sentence:')
for unigram_sentence in it.islice(unigram_sentences, start, finish):
    print(*unigram_sentence)
print('\nBIGRAM Sentence:')
for bigram_sentence in it.islice(bigram_sentences, start, finish):
    print(*bigram_sentence)
print('\nTRIGRAM Sentence:')
for trigram_sentence in it.islice(trigram_sentences, start, finish):
    print(*trigram_sentence)

Original text:
['', 'case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date', '', '', ''] 


UNIGRAM Sentence:
case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
case reassign magistrate judge magistrate judge longer assign case please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
status report llc kehrli enter date
case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date
case reassign magistrate judge chief mag judge longer assign case please download review locate website attorneys responsible provide courtesy copy judge individual practice require marziliano august enter date

BIGRAM Sentence:

In [148]:
# Phrases that are dropped after phrase modeling. Defined once at module level
# (instead of being rebuilt on every call) and stored as a frozenset so each
# membership test is a hash lookup; the duplicate entries of the original list
# collapse harmlessly.
_TRIGRAM_STOP_PHRASES = frozenset([
    'calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold',
    'court_reportertranscriber_abovecaptioned_matter', 'redaction_calendar_day',
    'rule_statement', 'obtain_pacer', 'may_obtain_pacer',
    'reportertranscriber_abovecaptioned_matter', 'redact_transcript_deadline',
    'send_chamber', 'official_transcript_notice_give', 'notice_intent_request',
    'proceed_hold', 'fee_receipt_number', 'civil_procedure', 'pursuant_frcp',
    'official_transcript_conference',
    'purchase_reportertranscriber_deadline_release', 'et_al', 'mail_chamber',
    'transcript_restriction', 'redaction_transcript',
    'transcript_view_public_terminal', 'transcript_make_remotely',
    'associated_et_al', 'electronically_available_public_without', 'genesys_id',
    'release_transcript_restriction', 'adar_bay', 'redaction_request_due',
    'new_york', 'transcript_proceeding_conference_hold', 'affidavit_jr._c.p.a',
    'corporate_parent', 'certain_underwriter', 'federal_rule_civil_procedure',
    'redaction_request', 'official_transcript', 'rule_disclosure',
    'rule_corporate_disclosure', 'place_vault',
    'public_without_redaction_calendar', 'purchase_deadline_release_transcript',
    'transcript_proceeding_hold', 'transcript_remotely_electronically_available',
    'minute_entry_hold', 'discovery_hear_hold', 'jury_trial_hold', 'sign_judge',
])


def trigram_transform(texts, verbose=False):
    """Apply the bigram and trigram phrase models to one docket text.

    The text is split on whitespace, passed through the module-level
    ``bigram_model`` and then ``trigram_model``, and every resulting token
    listed in ``_TRIGRAM_STOP_PHRASES`` is discarded.

    Parameters
    ----------
    texts : str or None
        Space-separated tokens of one docket. Any other object is converted
        with ``str()``.
    verbose : bool, optional
        When True, print the token list after every stage (debugging aid).

    Returns
    -------
    str or None
        The remaining tokens joined by single spaces, ``''`` for blank input,
        or ``None`` when ``texts`` is ``None``.
    """
    # Test for a missing value *before* str(): str(None) is the literal
    # 'None', which made the original None check unreachable.
    if texts is None:
        return None

    unigram_review = str(texts).split()
    if verbose:
        print('Uni: ', unigram_review)
    # Nothing to merge for a blank docket, so skip both model lookups.
    if not unigram_review:
        return ''

    bigram_review = bigram_model[unigram_review]
    if verbose:
        print('Bi: ', bigram_review)
    trigram_review = trigram_model[bigram_review]
    if verbose:
        print('Tri: ', trigram_review)
    trigram_review = [phrase for phrase in trigram_review
                      if phrase not in _TRIGRAM_STOP_PHRASES]
    if verbose:
        print('Tri removed: ', trigram_review)

    return ' '.join(trigram_review)

In [149]:
# phrase-model every lemmatized docket, keeping the original order
docket_phrase3 = list(map(trigram_transform, docket_phrase2))



In [150]:
# first five dockets before and after phrase modeling
print(docket_phrase2[:5])
print(docket_phrase3[:5])
# number of distinct phrase-modelled strings
len(set(docket_phrase3))

['', 'case assign judge magistrate judge please download review locate website attorneys responsible provide courtesy copy judge individual practice require enter date', '', '', '']
['', 'case assign judge magistrate judge please_download_review_locate website_attorneys_responsible_provide courtesy_copy judge individual_practice_require enter date', '', '', '']


209

In [151]:
# Attach each phrase-modelled text to the row it was derived from. A bare
# pd.Series(list) gets a fresh 0..n-1 index and column assignment aligns on index
# labels, so rows would be mismatched or NaN-filled whenever new_df's index is
# anything other than 0..n-1; binding the values to new_df.index keeps the
# assignment positional.
new_df['phrases'] = pd.Series(docket_phrase3, index=new_df.index)

In [152]:
# list the columns currently present on new_df
new_df.columns

Index(['Original Docket Text', 'Organization Portion', 'Name Portion',
       'Identifying Org and Name', 'Stripped Org and Name', 'normalized',
       'tokenize', 'nostop', 'DT Topics', 'Removed unnecessary POS & vocab DT',
       'lemmed', 'phrases'],
      dtype='object')

In [153]:
# walk one row through every preprocessing stage, printing a heading and the value
i = 1
stage_columns = [
    ('orignal text: ', 'Original Docket Text'),
    ('org and name removed: ', 'Stripped Org and Name'),
    ('normalized: ', 'normalized'),
    ('stopwords removed: ', 'nostop'),
    ('after lemmetization: ', 'lemmed'),
    ('after phrase modeling: ', 'phrases'),
]
for heading, column in stage_columns:
    stage_value = new_df[column].iloc[i]
    if column == 'nostop':
        # 'nostop' holds a token list; show it as a sentence
        stage_value = ' '.join(stage_value)
    print(heading)
    print(stage_value, '\n')

orignal text: 
Case assigned to Judge Ann M Donnelly and Magistrate Judge Vera M. Scanlon. Please download and review the Individual Practices of the assigned Judges, located on our website. Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such. (Bowens, Priscilla) (Entered: 03/11/2016) 

org and name removed: 
Case assigned to Judge and Magistrate Judge . Please download and review the , located on our website . Attorneys are responsible for providing courtesy copies to judges where their Individual Practices require such . ( , ) ( Entered : 03/11/2016 ) 

normalized: 
case assigned to judge and magistrate judge please download and review the located on our website attorneys are responsible for providing courtesy copies to judges where their individual practices require such entered date  

stopwords removed: 
case assigned judge magistrate judge please download review located website attorneys responsible providing courtesy co

### 7a. Remove words and phrases known, from prior experience, to be unhelpful

In [154]:
# Removal lists come from the first column (no header row) of the first three
# worksheets of one workbook: sheet 0 -> remove_pos, sheet 1 -> remove_word,
# sheet 2 -> remove_trigram.
# The workbook is opened once instead of three times, and the sheet is passed
# positionally because the `sheetname=` keyword was renamed to `sheet_name=` in
# newer pandas releases and the old spelling no longer works there.
with pd.ExcelFile('NLP_to_be_removed.xlsx') as removal_workbook:
    remove_pos, remove_word, remove_trigram = [
        list(removal_workbook.parse(sheet_index, header = None)[0])
        for sheet_index in range(3)
    ]

In [155]:
def remove_unnecessary(text):
    """Drop every token of ``text`` that appears in the removal lists.

    ``text`` is split on single spaces; tokens found in the module-level
    ``remove_word`` or ``remove_trigram`` collections are discarded and the
    survivors are re-joined with single spaces.
    """
    kept_tokens = [
        token
        for token in text.split(' ')
        if not (token in remove_word or token in remove_trigram)
    ]
    return ' '.join(kept_tokens)

In [156]:
# filter every 'phrases' entry through remove_unnecessary() and keep the result as 'cleaned'
new_df['cleaned'] = new_df['phrases'].apply(remove_unnecessary)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,normalized,tokenize,nostop,DT Topics,Removed unnecessary POS & vocab DT,lemmed,phrases,cleaned
0,"COMPLAINT against Cardiogenics Holdings, Inc. ...","Cardiogenics Holdings , Inc. LG Capital Funding",( Steinmetz Michael Bowens Priscilla,COMPLAINT against -ORG- -ORG- -ORG- -ORG- fili...,"COMPLAINT against filing fee $ 400 , receipt n...",complaint against filing fee receipt number wa...,"[complaint, against, filing, fee, receipt, num...","[complaint, filing, fee, receipt, number, disc...","Motions, Complaints, Service of Process",,,,
1,Case assigned to Judge Ann M Donnelly and Magi...,Individual Practices of the assigned Judges,Ann M Donnelly Vera M. Scanlon Bowens Priscilla,Case assigned to Judge -NAME- -NAME- -NAME- an...,Case assigned to Judge and Magistrate Judge . ...,case assigned to judge and magistrate judge pl...,"[case, assigned, to, judge, and, magistrate, j...","[case, assigned, judge, magistrate, judge, ple...",,"[case, assigned, judge, magistrate, judge, ple...",case assign judge magistrate judge please down...,case assign judge magistrate judge please_down...,please_download_review_locate website_attorney...
2,"Summons Issued as to Cardiogenics Holdings, In...",Cardiogenics Holdings,Bowens Priscilla,"Summons Issued as to -ORG- -ORG- , Inc.. ( -NA...","Summons Issued as to , Inc.. ( , ) ( Entered :...",summons issued as to inc entered date,"[summons, issued, as, to, inc, entered, date]","[summons, issued, inc, entered, date]",Service of Process,,,,
3,NOTICE - emailed attorney regarding missing se...,,Bowens Priscilla,NOTICE - emailed attorney regarding missing se...,NOTICE - emailed attorney regarding missing se...,notice emailed attorney regarding missing seco...,"[notice, emailed, attorney, regarding, missing...","[notice, emailed, attorney, regarding, missing...",Notices,,,,
4,In accordance with Rule 73 of the Federal Rule...,,Bowens Priscilla,In accordance with Rule 73 of the Federal Rule...,In accordance with Rule 73 of the Federal Rule...,in accordance with rule of the federal rules o...,"[in, accordance, with, rule, of, the, federal,...","[accordance, rule, federal, rules, civil, proc...","Notices, Motions, Judgment, Order",,,,


In [157]:
# pickle file used to persist the cleaned frame
filename = 'docket_texts/train/DT/basic_cleaned.pickle'

In [158]:
# save new_df to `filename`, using the highest pickle protocol available
with open(filename, 'wb') as handle: 
    pickle.dump(new_df, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [159]:
# export the de-duplicated text, topic and phrase columns to CSV (row index omitted) for manual review
new_df[['Original Docket Text', 'normalized', 'DT Topics', 'lemmed', 'phrases']].drop_duplicates().to_csv('Check if keywords working.csv', index = False)

In [161]:
# report how many rows the duplicate check over the review columns removes
print('shape before dedupe: {}'.format(new_df.shape))
review_columns = ['Original Docket Text', 'normalized', 'DT Topics', 'lemmed', 'phrases']
deduped_shape = new_df[review_columns].drop_duplicates().shape
print('shape after dedupe: {}'.format(deduped_shape))

shape before dedupe: (3244, 13)
shape after dedupe: (3203, 5)


### Another Way of thinking about this:
We may need to package all of these procedures into a single function, since we may want to use a pipeline to handle all of these steps in the near future.

### Troubleshooting

In [435]:
# exact original text of the docket entry being investigated
text = '''LETTER addressed to Judge Denise L. Cote from Andrew C. Shen dated February 7, 2014 re: adding the Goldman Sachs & Co. case to the Protective Order. Document filed by National Credit Union Administration Board. (Attachments: # 1 Stipulated Amended Protective Order)(Haas, Erik) (Entered: 02/07/2014)'''
# show topics and phrases of the first row whose text matches exactly
# (.iloc[0] raises IndexError when no row matches)
new_df[new_df['Original Docket Text'] == text][['DT Topics', 'phrases']].iloc[0]

DT Topics    Other Answers, Complaints
phrases                               
Name: 1506, dtype: object

problem 1: Set/Reset Deadlines: Responses due by 12/27/2017. (anc) (Entered: 12/21/2017)
problem 2: AFFIDAVIT of Teresa A. Gonsalves in Support re: 53 MOTION to Dismiss.. Document filed by John D. Barry, Beacon Hill Asset Management LLC, Thomas Daniels, John Irwin, Mark Miszkiewicz, Safe Harbor Asset Management LLC. (pa, ) (Entered: 09/20/2004)
problem 3: ORDER; that case number 00cv7874 shall be consolidated into lead case number 00cv7872 for all purposes and all subsequent filings shall be filed under that lead case number ; the Clerk of the Court is directed to close case number 00cv7874 (SAS) . ( signed by Judge Shira A. Scheindlin ); Copies mailed. (pl) (Entered: 06/29/2001)

In [392]:
# every column of the row(s) whose original text equals `text`
# NOTE(review): this cell's saved execution count (392) is lower than that of the
# cell assigning `text` (435) and its saved output shows a different docket, so
# `text` held another value when it last ran - re-run top to bottom before relying on it.
new_df[new_df['Original Docket Text'] == text]

Original Docket Text                  RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...
Organization Portion                                                                LLC
Name Portion                                                             ( Kehrli Kevin
Identifying Org and Name              RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...
Stripped Org and Name                 RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...
normalized                            rule corporate disclosure statement no corpora...
tokenize                              [rule, corporate, disclosure, statement, no, c...
nostop                                [rule, corporate, disclosure, statement, corpo...
DT Topics                                                                              
Removed unnecessary POS & vocab DT    [rule, corporate, disclosure, statement, corpo...
lemmed                                rule corporate disclosure statement corporate ...
phrases                         

In [None]:
# load the classification workbook without its 'DT Topics' column, then
# report its size before and after de-duplication
new_topics_path = r'E:\WinUser\Documents\Python Code\AI Paralegal\docket_texts\Train\DT\New Topics - Classification -5.27.2018.xlsx'
data = pd.read_excel(new_topics_path).drop('DT Topics', axis = 1)
print(data.shape)
print(data.drop_duplicates().shape)
data.head()

In [None]:
# The pipeline changed, so reload the saved frame to confirm that Chris'
# feedback still lines up with it.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
filename = 'docket_texts/train/DT/basic_cleaned.pickle'
with open(filename, 'rb') as handle:
    train_cleaned_df = pickle.load(handle)

In [None]:
# rows whose 'cleaned' text is not the empty string: total count, count of
# distinct (text, phrases) pairs, and a preview
non_empty_cleaned = train_cleaned_df[train_cleaned_df['cleaned'] != '']
print(non_empty_cleaned.shape)
print(non_empty_cleaned[['Original Docket Text', 'phrases']].drop_duplicates().shape)
non_empty_cleaned.head()

In [None]:
# outer-join the classification sheet with the cleaned frame on the docket text
feedback_columns = ['Original Docket Text', 'New Topocs', 'Action [Y/N]', 'If Y']
cleaned_columns = ['Original Docket Text', 'DT Topics', 'phrases']
df = data[feedback_columns].merge(
    train_cleaned_df[cleaned_columns],
    on = 'Original Docket Text',
    how = 'outer',
)
df.drop_duplicates().shape

In [None]:
# export rows that have a non-empty 'DT Topics' and a non-null 'New Topocs'
has_dt_topics = df['DT Topics'] != ''
has_new_topics = df['New Topocs'].notnull()
df[has_dt_topics & has_new_topics].drop_duplicates().to_csv('test.csv', index = False)

In [None]:
# Export rows whose 'phrases' is not the empty string while 'New Topocs' is null.
# Written to its own file: the preceding export also targets 'test.csv', so
# reusing that name here would silently overwrite it on a top-to-bottom run.
has_phrases = df['phrases'] != ''
lacks_new_topics = df['New Topocs'].isnull()
df[has_phrases & lacks_new_topics].drop_duplicates().to_csv('test_phrases_without_new_topics.csv', index = False)