### Import the optimized model and get performance metrics on the test dataset

In [1]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#visualization libraries
import pyLDAvis
import pyLDAvis.gensim



In [267]:
import os
# NOTE(review): absolute, machine-specific JDK location (Windows, JDK 10.0.1).
# Presumably the JAVAHOME variable is what NLTK's Stanford wrapper consults to
# find the Java executable -- confirm against the NLTK version in use.
java_path = 'C:/Program Files/Java/jdk-10.0.1/bin/java.exe'
os.environ['JAVAHOME'] = java_path

import pandas as pd
import numpy as np
import codecs
import itertools as it
from bs4 import BeautifulSoup
import warnings
import pickle
from collections import Counter
import re
import string

In [4]:
#import corpus/docket texts from html to pandas DataFrame
def grab_docket_test():
    """Collect the docket-text table of every ``.html`` file in ``docket_texts/test/``.

    For each file:
      * the case ID is sliced out of the stringified ``<h3>`` tags (the text
        between ``'CASE #:'`` and the first ``'</h3>'``);
      * the first HTML table containing a ``'Docket Text'`` cell is taken as
        the docket table and its first row is promoted to the column header;
      * ``'#'`` is coerced to numeric, ``'Date Filed'`` to datetime, and a
        ``'Case ID'`` column is added.

    Returns:
        pandas.DataFrame: all docket tables stacked (per-file row labels are
        kept), with ``'Case ID'`` as the first column. An empty DataFrame when
        the folder holds no ``.html`` files.

    Raises:
        ValueError: if a file has no table containing ``'Docket Text'``.
    """
    from io import StringIO  # local import: only needed to wrap the HTML text

    folder = 'docket_texts/test/'
    #get all .html files in the folder (all docket files are in .html)
    files = [os.path.join(folder, file) for file in os.listdir(folder)
             if file.endswith('.html')]

    frames = []
    for path in files:
        # the context manager closes the handle even if parsing fails later
        with codecs.open(path, 'r', 'utf-8') as handle:
            content = handle.read()

        #use beautiful soup to get the case ID
        soup = BeautifulSoup(content, 'lxml')
        case_id = str(soup.find_all('h3'))
        bookmark1 = case_id.find('CASE #:') + len('CASE #:')
        bookmark2 = case_id.find('</h3>')
        case_id = case_id[bookmark1:bookmark2]

        #use pandas to grab tables in the html files
        # (wrapped in StringIO: passing a literal HTML string is deprecated in recent pandas)
        docket_tables = pd.read_html(StringIO(content))

        #the docket table is usually the 4th one, but not always, so search for it
        docket_table = None
        for candidate in docket_tables:
            if candidate.isin(['Docket Text']).sum().sum() > 0:
                docket_table = candidate
                break
        if docket_table is None:
            raise ValueError("no table containing 'Docket Text' found in {}".format(path))

        #promote the first row to the header; copy so the slice can be modified safely
        new_header = docket_table.iloc[0]
        docket_table = docket_table[1:].copy()
        docket_table.columns = new_header

        docket_table['#'] = pd.to_numeric(docket_table['#'],
                                          downcast = 'signed', errors = 'coerce')
        docket_table['Date Filed'] = pd.to_datetime(docket_table['Date Filed'])
        docket_table['Case ID'] = case_id

        frames.append(docket_table)

    if frames:
        #one concat at the end instead of re-copying the growing frame per file
        df_docket_texts = pd.concat(frames)
        #move 'Case ID' to the front, by name rather than by position
        other_cols = [col for col in df_docket_texts.columns if col != 'Case ID']
        df_docket_texts = df_docket_texts[['Case ID'] + other_cols]
    else:
        df_docket_texts = pd.DataFrame()

    print('current docket text table size/shape: {}'.format(df_docket_texts.shape))
    return df_docket_texts

In [6]:
%%time
df = grab_docket_test()
docket_original = list(df['Docket Text'])
# Preview the first docket texts; slicing (instead of indexing 0..4) avoids an
# IndexError when fewer than five entries were loaded.
for i, docket_text in enumerate(docket_original[:5]):
    print('docket text {}'.format(i))
    print(docket_text, '\n')

  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)
  self.parser.feed(markup)


current docket text table size/shape: (1804, 4)
docket text 0
FILING ERROR - DEFICIENT PLEADING - FILED AGAINST PARTY ERROR COMPLAINT against All Defendants. (Filing Fee $ 400.00, Receipt Number 0208-11793625)Document filed by Majid Soueidan.(Rosen, Samuel) Modified on 1/5/2016 (pc). (Entered: 01/04/2016) 

docket text 1
FILING ERROR PDF ERROR CIVIL COVER SHEET filed. (Rosen, Samuel) Modified on 1/5/2016 (pc). (Entered: 01/04/2016) 

docket text 2
REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-EASTERN CORPORATION, BRAD PEDERSEN, ROBERT J. KELLY, NELSON OBUS, WILLIAM M. SHOCKLEY, and SERGE DUPUIS, re: 1 Complaint. Document filed by Majid Soueidan. (Rosen, Samuel) (Entered: 01/04/2016) 

docket text 3
RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Corporate Parent. Document filed by Majid Soueidan.(Rosen, Samuel) (Entered: 01/04/2016) 

docket text 4
***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEADING. Notice to Attorney Samuel Kenneth Rosen to RE-FILE Document No. 1 Complaint,. The filing is 

### Use Stanford NER to identify Names and Entities

In [248]:
%%time
# Stanford NER 3-class model and jar (absolute paths on the author's machine)
path_to_model = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz'
path_to_jar = r'C:\Users\inves\AppData\Local\Programs\Python\Python35\Lib\site-packages\nltk\stanford-ner-2018-02-27\stanford-ner.jar'
tagger = StanfordNERTagger(path_to_model, path_to_jar = path_to_jar)

# Each row appended to `output` is:
#   [original text, ORGANIZATION tokens, PERSON tokens,
#    text with entities replaced by '-ORG-'/'-NAME-', text with entity tokens dropped]
output = []
length = len(docket_original)  # set to a small number (e.g. 10) for a trial run
for i in range(length):
    org_str, name_str = [], []
    stripped_str1, stripped_str2 = [], []
    tokens = nltk.tokenize.word_tokenize(docket_original[i])
    for label, token in zip(tagger.tag(tokens), tokens):
        tagged_word, entity = label[0], label[1]
        if entity == 'ORGANIZATION':
            org_str.append(tagged_word)
            stripped_str1.append('-ORG-')
            continue
        if entity == 'PERSON':
            name_str.append(tagged_word)
            stripped_str1.append('-NAME-')
            continue
        # not an organization/person: keep the token in both stripped variants
        stripped_str1.append(token)
        stripped_str2.append(token)

    row = [docket_original[i]]
    row.extend(' '.join(part) for part in (org_str, name_str, stripped_str1, stripped_str2))
    output.append(row)
    

Wall time: 55min 54s


In [249]:
# One row per docket entry. The five columns follow the order of the items
# collected per entry in `output`: the raw text, the ORGANIZATION-tagged tokens,
# the PERSON-tagged tokens, the text with entities replaced by '-ORG-'/'-NAME-'
# placeholders, and the text with entity tokens dropped.
NER_df = pd.DataFrame(output, columns = ['Original Docket Text', 'Organization Portion', 'Name Portion', 
                                         'Identifying Org and Name', 'Stripped Org and Name'])

### To re-build new_df, start here

In [299]:
new_df = NER_df.copy()

In [300]:
new_df['Original Docket Text'].iloc[5]

'***NOTICE TO ATTORNEY REGARDING PARTY MODIFICATION. Notice to attorney Samuel Kenneth Rosen. The party information for the following party/parties has been modified: Majid Soueidan and Robert K. Kelly. The information for the party/parties has been modified for the following reason/reasons: party name contained a typographical error; party text was omitted. (pc) (Entered: 01/05/2016)'

In [301]:
new_df['Identifying Org and Name'].iloc[5]

'***NOTICE TO ATTORNEY REGARDING PARTY MODIFICATION . Notice to attorney -NAME- -NAME- -NAME- . The party information for the following party/parties has been modified : -NAME- -NAME- and -NAME- -NAME- -NAME- . The information for the party/parties has been modified for the following reason/reasons : party name contained a typographical error ; party text was omitted . ( pc ) ( Entered : 01/05/2016 )'

In [302]:
print(new_df.head(7))
# Plain list of the 'Stripped Org and Name' column; the cleaning cells below
# read and overwrite this list.
docket_text_list = list(new_df['Stripped Org and Name'])

                                Original Docket Text Organization Portion  \
0  FILING ERROR - DEFICIENT PLEADING - FILED AGAI...                        
1  FILING ERROR PDF ERROR CIVIL COVER SHEET filed...                        
2  REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...                        
3  RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...                        
4  ***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...                        
5  ***NOTICE TO ATTORNEY REGARDING PARTY MODIFICA...                        
6  ***NOTICE TO ATTORNEY REGARDING DEFICIENT CIVI...                        

                                        Name Portion  \
0                        Majid Soueidan Rosen Samuel   
1                                       Rosen Samuel   
2  BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...   
3                        Majid Soueidan Rosen Samuel   
4                               Samuel Kenneth Rosen   
5  Samuel Kenneth Rosen Majid Soueidan Robert K

In [303]:
def text_preprocess1(text):
    """Strip bracketed asides and punctuation from one docket text.

    Steps, in order:
      1. ``/`` becomes a space.
      2. Every ``(...)`` / ``[...]`` span (non-greedy, no newlines) is removed.
      3. ``-``, ``*``, ``<``, ``>`` and backslashes are deleted, ``'s`` collapses
         to ``s``, and stray parentheses and ``&`` become spaces.
      4. Every remaining ``string.punctuation`` character becomes a space.

    Args:
        text (str): raw docket text.

    Returns:
        str: the cleaned text (case is left untouched).
    """
    text = text.replace('/', ' ')
    # raw string: '\(' and '\[' are regex escapes, not Python string escapes
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Applied strictly in this order. Two replacements were dropped because they
    # could never change the text: '(s)' -> 's' ran after every parenthesis was
    # already gone, and '' -> '' is a no-op.
    ordered_replacements = (
        ('-', ''),
        ('(', ' '),
        (')', ' '),
        ("'s", 's'),
        ('*', ''),
        ('<', ''),
        ('>', ''),
        ('\\', ''),
        ('&', ' '),
    )
    for old, new in ordered_replacements:
        text = text.replace(old, new)

    # whatever punctuation is left turns into a single space per character
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(table)

def text_preprocess2(text):
    """Return ``text`` with every period deleted (nothing is put in its place)."""
    return ''.join(text.split('.'))

def remove_stop(sentence):
    """Drop English stop words from a whitespace-separated sentence.

    Args:
        sentence (str): text to filter; it is split on whitespace.

    Returns:
        str: the remaining words joined by single spaces. Matching is
        case-sensitive, exactly as the words appear in the NLTK list.
    """
    # build the lookup once per call instead of once per word
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in sentence.split() if word not in stop_words)

# Load the header-less keyword file and expose its single column as a plain list.
# NOTE(review): `keyword_list` is not referenced again in the visible part of
# this notebook -- confirm whether it is still needed.
keywords = pd.read_csv('docket_texts/keywords.csv', header = None)
keywords.columns = ['keywords']
keyword_list = list(keywords['keywords'])

In [304]:
print(docket_text_list[5], '\n')
# single pass per text: bracket/punctuation stripping -> lower-casing -> period removal
docket_text_list = [text_preprocess2(text_preprocess1(sentence).lower())
                    for sentence in docket_text_list]
print(docket_text_list[5])
print('\nlength of docket text dataset: {}'.format(len(docket_text_list)))

***NOTICE TO ATTORNEY REGARDING PARTY MODIFICATION . Notice to attorney . The party information for the following party/parties has been modified : and . The information for the party/parties has been modified for the following reason/reasons : party name contained a typographical error ; party text was omitted . ( pc ) ( Entered : 01/05/2016 ) 

notice to attorney regarding party modification   notice to attorney   the party information for the following party parties has been modified   and   the information for the party parties has been modified for the following reason reasons   party name contained a typographical error   party text was omitted    

length of docket text dataset: 1804


In [305]:
class Splitter(object):
    """Split a text into sentences and tokenize each one after stop-word removal."""

    def __init__(self):
        # sentence splitter loaded from NLTK's punkt English pickle, plus a
        # Treebank word tokenizer
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """Return a list holding one token list per sentence of ``text``.

        Each sentence is passed through ``remove_stop`` before tokenization.
        """
        tokenized_sentences = []
        for sent in self.splitter.tokenize(text):
            without_stop_words = remove_stop(sent)
            tokenized_sentences.append(self.tokenizer.tokenize(without_stop_words))
        return tokenized_sentences


class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized sentences and lemmatize every word using its tag."""

    def __init__(self, lemmatizer=None):
        """
        Args:
            lemmatizer: object exposing ``lemmatize(word, pos)``. Defaults to a
                new ``WordNetLemmatizer``, so the class no longer depends on a
                module-level ``lemmatizer`` variable having been created by
                another cell before ``pos_tag`` is called.
        """
        self.lemmatizer = lemmatizer if lemmatizer is not None else WordNetLemmatizer()

    def get_wordnet_pos(self,treebank_tag):
        """
        Map a Treebank POS tag to the WordNet POS constant (a, n, r, v)
        expected by the lemmatizer.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # 'N...' tags and everything else fall back to noun, the default POS
        # used for lemmatization
        return wordnet.NOUN

    def pos_tag(self,tokens):
        """Tag and lemmatize tokenized sentences.

        Args:
            tokens: list of sentences, each a list of word strings.

        Returns:
            list: one list per sentence of
            ``(original word, lemmatized word, [POS tag])`` tuples.
        """
        # POS tags per sentence, e.g. [('What', 'WP'), ('can', 'MD'), ('I', 'PRP'), ...]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokens]

        return [
            [(word,
              self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)),
              [tag])
             for (word, tag) in tagged_sentence]
            for tagged_sentence in tagged_sentences
        ]

In [306]:
%%time
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

# For every docket text: (1) sentence-split and tokenize, (2) POS-tag and lemmatize.
# Result nesting: document -> sentence -> (word, lemma, [pos]) tuples.
lemma_docket_text_list = [
    lemmatization_using_pos_tagger.pos_tag(splitter.split(docket_text))
    for docket_text in docket_text_list
]

Wall time: 41.6 s


In [307]:
print(len(lemma_docket_text_list)) #docket text document level
print(len(lemma_docket_text_list[0])) #docket text sentence level
print(len(lemma_docket_text_list[0][0])) #docket text word level
print(lemma_docket_text_list[0][0][0]) #docket text token level
print(lemma_docket_text_list[0][0][0][0]) #docket text tuple level

1804
1
15
('filing', 'file', ['VBG'])
filing


In [308]:
#let's do a collection of what we have
# Maps each POS tag to the distinct lemmas seen with it, in first-seen order.
collection = {}
for lemma_pos_token in lemma_docket_text_list:
    for sentence in lemma_pos_token:
        for token in sentence:
            # token is (word, lemma, [pos]); setdefault replaces rebuilding the
            # full key list for every single token
            lemmas_for_pos = collection.setdefault(token[2][0], [])
            if token[1] not in lemmas_for_pos:
                lemmas_for_pos.append(token[1])

In [323]:
# Read the workbook once. `sheet_name` replaces the old `sheetname` keyword,
# which current pandas no longer accepts; passing a list returns a dict of
# DataFrames keyed by the requested sheet positions.
# The first column of sheets 0, 1 and 2 gives the POS tags, words and phrase
# tokens that later cells filter out.
_removal_sheets = pd.read_excel('NLP_to_be_removed.xlsx', sheet_name = [0, 1, 2], header = None)
remove_pos = list(_removal_sheets[0][0])
remove_word = list(_removal_sheets[1][0])
remove_trigram = list(_removal_sheets[2][0])

In [325]:
%%time
'''
remove_pos = ["``", "NNPS", "NNP", "CD", '#', '$', "''", ",", "0", ":"]
remove_word = ["'s", "judge", "party", "defendant", "ex", "plantiff", "shall", "date", "b", "exhibit", "pennsylvania",  
               "Inc..", "inc..", "llc", "'", "[_]", "action", "clerk", "july", "kw", "regard", "sac", "attachment", "c.d", "cal", "case", "cd", "l.p.", 
               "claim", "copy", "court", "direct", "form", "hereby", "magistrate", "p.c", "pl", "plaintiff", "regard", "sign", "time", "mr.", 
               "docket", "follow", "set", "matter" "agreement" "proceeding", "cotton", "january", "february", "march", "april", "may", "june", 
               "july", "august", "september", "october", "november", "december",
               "agreement", "v.", "modify", "fund", "associated", "provide", "material", "amount", "accordingly", "additional", 
               "second", "esq", "transmission", "g.c.", "seal", "review", "honor", "submit", "counsel", "witness", "civ", "first", "ltd..", "enter", 
               "stay", "forth", "matter", "whether", "class", "master", "information", "statement", "submission", "related", "see", "make", "paper", 
               "brookfield", "designate", "remain", "reportertranscriber", "submit", "include", "mail", "fact", "refer", "take", "pursuant", "amount", 
               "behalf", "I.p..", "must", "attorney",
               'abovecapitoned', 'attach', 'add', 'concern', 'chamber', 'close', 'district', 'damage', 'later', 
               'relate', 'return', 'require', 'restriction', 'respect', 'ny', 'seek', 'write', 'expert', 'transcript', 
               'day', 'h.o', 'damage', 'pre', 'proceeding', 'present', 'page', 'pending', 'p.m.', 'frcp', 'g.c.', 'record', 'r.',
              'application', 'filing', 'issue', 'assign', 'iii', 'state', 'protocol', 'loan', 'error', 'file', 'document']
'''
    
#rebuild corpus
# sets give constant-time membership tests for the per-token filters below
remove_pos_lookup = set(remove_pos)
remove_word_lookup = set(remove_word)

docket_texts_output = [] #ultimate output after cleaning

for lemma_pos_token in lemma_docket_text_list:
    kept_sentences = []
    for sentence in lemma_pos_token:
        # token is (word, lemma, [pos]): keep the lemma unless its POS tag or
        # the lemma itself is on a removal list
        sentence_output = [token[1] for token in sentence
                           if token[2][0] not in remove_pos_lookup
                           and token[1] not in remove_word_lookup]
        if sentence_output:
            kept_sentences.append(' '.join(sentence_output))
    # join sentences with a space so the last word of one sentence is not fused
    # with the first word of the next
    docket_texts_output.append(' '.join(kept_sentences))
print(docket_texts_output[:10])

['deficient plead complaint', 'pdf civil cover sheet', 'request issuance summons breezeeastern corporation complaint', 'rule corporate disclosure corporate parent', 'notice deficient pleading notice refile complaint deficient reason radio button select refile plead use event type complaint find event list complaint initiate correct pdf select individually name filer filer select individually name plead', 'notice modification reason reason name contain typographical text omit', 'notice deficient civil cover sheet notice refile civil cover sheet deficient reason citizenship principal fill despite federal question jurisdiction', 'electronic summons', 'notice open statistical correction notice open statistical erroneously select fee status code due correction entry fee status code pd', 'open initial assignment notice aboveentitled please download locate http nysd uscourts gov responsible courtesy individual practice please download rule instruction locate http nysd uscourts gov ecf php']
W

In [326]:
# NOTE(review): the Series gets its own 0..n-1 index; this presumably lines up
# row-for-row only because new_df still has its default index -- confirm.
new_df['Removed unnecessary POS & vocab'] = pd.Series(docket_texts_output)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab
0,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,,Majid Soueidan Rosen Samuel,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,deficient plead complaint
1,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,,Rosen Samuel,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,pdf civil cover sheet
2,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,,BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,request issuance summons breezeeastern corpora...
3,RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...,,Majid Soueidan Rosen Samuel,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,rule corporate disclosure corporate parent
4,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,,Samuel Kenneth Rosen,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,notice deficient pleading notice refile compla...


#### Decision tree to identify keywords and topics based on Chris' feedback

In [327]:
# One CSV column per topic; every cell is cast to a lower-cased string, so the
# blank cells of shorter columns show up as the literal 'nan' and are filtered out.
manual_topics_df = pd.read_csv('mannual_topics.csv')
manual_topics_df = manual_topics_df.apply(lambda column: column.astype(str).str.lower())
manual_topics_dict = {
    topic: [keyword for keyword in keywords_for_topic if keyword != 'nan']
    for topic, keywords_for_topic in manual_topics_df.to_dict('list').items()
}

In [328]:
def mannual_topic_assignment(text):
    """Name every manual topic that shares at least one keyword with ``text``.

    ``text`` is split on whitespace and compared word-for-word against the
    keyword lists in ``manual_topics_dict``. The matching topic names are
    returned as one ``', '``-joined string, in dictionary order; the result is
    ``''`` when nothing matches.
    """
    words = set(text.split())
    matched_topics = [topic
                      for topic, topic_keywords in manual_topics_dict.items()
                      if words.intersection(topic_keywords)]
    return ', '.join(matched_topics)

In [235]:
#docket_texts_output

In [329]:
# Entries that hit a manual topic get that label and an empty text;
# entries without a hit keep their text and get an empty label.
topics_DT = [mannual_topic_assignment(text) for text in docket_texts_output]
docket_texts_output_DT = [text if topic == '' else ''
                          for text, topic in zip(docket_texts_output, topics_DT)]

In [330]:
print(topics_DT[:5])
print(docket_texts_output_DT[:5])

['Complaints', '', 'Service of Process, Others, Complaints', '', 'Complaints, Notices']
['', 'pdf civil cover sheet', '', 'rule corporate disclosure corporate parent', '']


In [331]:
# NOTE(review): both Series get their own 0..n-1 index; this presumably lines up
# row-for-row only because new_df still has its default index -- confirm.
new_df['DT Topics'] = pd.Series(topics_DT)
new_df['Removed unnecessary POS & vocab DT'] = pd.Series(docket_texts_output_DT)
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab,DT Topics,Removed unnecessary POS & vocab DT
0,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,,Majid Soueidan Rosen Samuel,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,deficient plead complaint,Complaints,
1,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,,Rosen Samuel,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,pdf civil cover sheet,,pdf civil cover sheet
2,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,,BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,request issuance summons breezeeastern corpora...,"Service of Process, Others, Complaints",
3,RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...,,Majid Soueidan Rosen Samuel,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,rule corporate disclosure corporate parent,,rule corporate disclosure corporate parent
4,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,,Samuel Kenneth Rosen,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,notice deficient pleading notice refile compla...,"Complaints, Notices",


In [332]:
#print some examples
# Every previewed row is printed. The old `!= []` guard compared a string with a
# list, so it was always true and has been removed; min() keeps the preview
# inside frames shorter than ten rows.
for i in range(min(10, len(new_df))):
    print(i)
    print(new_df['DT Topics'].iloc[i])
    print(new_df['Removed unnecessary POS & vocab'].iloc[i])
    print(new_df['Removed unnecessary POS & vocab DT'].iloc[i])

0
Complaints
deficient plead complaint

1

pdf civil cover sheet
pdf civil cover sheet
2
Service of Process, Others, Complaints
request issuance summons breezeeastern corporation complaint

3

rule corporate disclosure corporate parent
rule corporate disclosure corporate parent
4
Complaints, Notices
notice deficient pleading notice refile complaint deficient reason radio button select refile plead use event type complaint find event list complaint initiate correct pdf select individually name filer filer select individually name plead

5
Notices
notice modification reason reason name contain typographical text omit

6
Notices
notice deficient civil cover sheet notice refile civil cover sheet deficient reason citizenship principal fill despite federal question jurisdiction

7
Service of Process
electronic summons

8
Notices
notice open statistical correction notice open statistical erroneously select fee status code due correction entry fee status code pd

9
Notices
open initial assignm

In [333]:
def trigram_transform(texts):
    """Apply the bigram and trigram phrase models to one cleaned docket text.

    The text is split on whitespace, passed through ``bigram_model`` and then
    ``trigram_model``, and every resulting token listed in ``remove_trigram``
    is dropped.

    Args:
        texts (str or None): whitespace-separated text.

    Returns:
        str or None: the surviving tokens joined by single spaces, or ``None``
        when ``texts`` is ``None``.
    """
    display = False  # flip to True to print every intermediate stage
    # Inert reference copy of an earlier hard-coded removal list; the function
    # reads the module-level `remove_trigram` instead.
    '''
    remove_trigram = ['calendar_day', 'court_notice_intend', 'minute_entry_proceeding_hold', 'court_reportertranscriber_abovecaptioned_matter',
                      'redaction_calendar_day', 'rule_statement', 'obtain_pacer', 'may_obtain_pacer', 'reportertranscriber_abovecaptioned_matter',
                      'redact_transcript_deadline', 'send_chamber', "official_transcript_notice_give", "notice_intent_request", "proceed_hold", 
                      "fee_receipt_number", "civil_procedure", "pursuant_frcp", "official_transcript_conference", 
                      "purchase_reportertranscriber_deadline_release", "et_al", "mail_chamber", "transcript_restriction", "redaction_transcript", 
                      "transcript_view_public_terminal", "transcript_make_remotely", "associated_et_al", "electronically_available_public_without", 
                      "genesys_id", "release_transcript_restriction", "adar_bay", "redaction_request_due", "new_york", "official_transcript_conference", 
                      "transcript_make_remotely", "transcript_proceeding_conference_hold", "redaction_transcript",
                      'affidavit_jr._c.p.a', 'corporate_parent', 'certain_underwriter', 'federal_rule_civil_procedure', 'redaction_request', 
                      'official_transcript', 'rule_disclosure', 'rule_corporate_disclosure', 'place_vault', 'public_without_redaction_calendar', 
                      'purchase_deadline_release_transcript', 'transcript_proceeding_hold', 'transcript_remotely_electronically_available',
                      'minute_entry_hold', 'discovery_hear_hold', 'jury_trial_hold', "sign_judge",'place_vault']
    '''
    # identity check: `== None` would defer to a custom __eq__ on the argument
    if texts is None:
        return None

    unigram_review = texts.split()
    if display:
        print('Uni: ', unigram_review)
    bigram_review = bigram_model[unigram_review]
    if display:
        print('Bi: ', bigram_review)
    trigram_review = trigram_model[bigram_review]
    if display:
        print('Tri: ', trigram_review)
    trigram_review = [phrase for phrase in trigram_review if phrase not in remove_trigram]
    if display:
        print('Tri removed: ', trigram_review)

    return ' '.join(trigram_review)

In [334]:
# Phrase models are loaded from the train/DT folder, not fitted in this notebook.
# NOTE(review): the two file names use different suffix orders
# ('noorgnoname' vs 'nonamenoorg') -- confirm both match the files on disk.
bigram_model_filepath = 'docket_texts/train/DT/bigram_model_noorgnoname'
trigram_model_filepath = 'docket_texts/train/DT/trigram_model_nonamenoorg'

# trigram_transform reads these two module-level names when it is applied below
bigram_model = Phrases.load(bigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

# Phrase-transform only the texts that were NOT given a manual topic (the
# '... DT' column is '' for the others), then show the relevant columns side by side.
new_df['Apply Trigram Phrase Model'] = new_df['Removed unnecessary POS & vocab DT'].apply(trigram_transform)
new_df[['Removed unnecessary POS & vocab', 'DT Topics', 'Removed unnecessary POS & vocab DT', 
        'Apply Trigram Phrase Model']].head(20)



Unnamed: 0,Removed unnecessary POS & vocab,DT Topics,Removed unnecessary POS & vocab DT,Apply Trigram Phrase Model
0,deficient plead complaint,Complaints,,
1,pdf civil cover sheet,,pdf civil cover sheet,pdf civil_cover_sheet
2,request issuance summons breezeeastern corpora...,"Service of Process, Others, Complaints",,
3,rule corporate disclosure corporate parent,,rule corporate disclosure corporate parent,
4,notice deficient pleading notice refile compla...,"Complaints, Notices",,
5,notice modification reason reason name contain...,Notices,,
6,notice deficient civil cover sheet notice refi...,Notices,,
7,electronic summons,Service of Process,,
8,notice open statistical correction notice open...,Notices,,
9,open initial assignment notice aboveentitled p...,Notices,,


### Export DataFrame to .csv

In [335]:
new_df.head()

Unnamed: 0,Original Docket Text,Organization Portion,Name Portion,Identifying Org and Name,Stripped Org and Name,Removed unnecessary POS & vocab,DT Topics,Removed unnecessary POS & vocab DT,Apply Trigram Phrase Model
0,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,,Majid Soueidan Rosen Samuel,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,FILING ERROR - DEFICIENT PLEADING - FILED AGAI...,deficient plead complaint,Complaints,,
1,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,,Rosen Samuel,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,FILING ERROR PDF ERROR CIVIL COVER SHEET filed...,pdf civil cover sheet,,pdf civil cover sheet,pdf civil_cover_sheet
2,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,,BRAD PEDERSEN ROBERT J. KELLY NELSON OBUS WILL...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,REQUEST FOR ISSUANCE OF SUMMONS as to BREEZE-E...,request issuance summons breezeeastern corpora...,"Service of Process, Others, Complaints",,
3,RULE 7.1 CORPORATE DISCLOSURE STATEMENT. No Co...,,Majid Soueidan Rosen Samuel,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,RULE 7.1 CORPORATE DISCLOSURE STATEMENT . No C...,rule corporate disclosure corporate parent,,rule corporate disclosure corporate parent,
4,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,,Samuel Kenneth Rosen,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,***NOTICE TO ATTORNEY REGARDING DEFICIENT PLEA...,notice deficient pleading notice refile compla...,"Complaints, Notices",,


In [336]:
new_df.columns

Index(['Original Docket Text', 'Organization Portion', 'Name Portion',
       'Identifying Org and Name', 'Stripped Org and Name',
       'Removed unnecessary POS & vocab', 'DT Topics',
       'Removed unnecessary POS & vocab DT', 'Apply Trigram Phrase Model'],
      dtype='object')

In [337]:
# Build the output path with os.path.join. The old literal relied on the
# unescaped backslashes '\T', '\D' and '\c' (invalid escape sequences that only
# happen to be kept as-is) and was only a directory path on Windows.
classify_csv_path = os.path.join('docket_texts', 'Test', 'DT', 'classify_this.csv')
new_df[['Original Docket Text', 'Removed unnecessary POS & vocab', 'Removed unnecessary POS & vocab DT',
        'Apply Trigram Phrase Model', 'DT Topics']].to_csv(classify_csv_path, index = False)

In [338]:
new_df['DT Topics'].value_counts()

Motions                                                                                           355
                                                                                                  306
Other Answers, Motions                                                                            153
Others, Motions                                                                                   145
Notices                                                                                           105
Other Answers, Answers to Complaints, Motions                                                      79
Others                                                                                             74
Motions, Notices                                                                                   70
Other Answers                                                                                      63
Other Answers, Complaints, Motions                                                