In [1]:
'''Part II Legal Doc Classification 

Part I was the concatenating and cleaning of the Text docs. 

Part II

The purpose of this code is to create a frequency distribution of words aggregated by legal document type.  The words 
in each file are compared to a unique set of words created from all word documents.   These frequency dist can then 
be used to classify text by looking for differences amongst the text. 

'''

'\nThe purpose of this code is to classify legal text by type.\n\nTypes include:\n1.) Complaint\n2.) Order\n3.) Summary judgement\n4.) Cover sheet.\n'

In [None]:
# IMPORT PACKAGES

In [12]:
import nltk
import os
import re
import sys
import pandas as pd

In [None]:
# IMPORT MODULES FOR THIS PROJECT

In [13]:
os.chdir('C:\\Users\\Chris.Cirelli\\Desktop\\Python Programming Docs\\GitHub\\Bros-Coding-master\\Bros-Coding')
import Module_Part_II_Legal_Doc_Classification as mldc

In [None]:
# DEFINE TARGET DIRECTORY

In [3]:
'''Note:    The target directory should be changed by the user to point to the directory within which they have saved their
            text files'''

os.chdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')
Dir_list = os.listdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')

In [None]:
# PART I:  CREATE A DATAFRAME REPRESENTING EACH TEXT IN THE CATEGORY AND COUNT OF UNIQUE WORDS PRESENT IN EACH

In [20]:
def get_frequencyDist_legal_text(List_txt_files, df_unique_words):
    '''Purpose is to get the freq dist of words match in given text
    Input1 = A list of text files for one class.  An external loop will run to feed this function a list of files for each 
             class. 
    Input2 = Dataframe whose index comprises the list of unique words
    Ouput  = Dataframe whose index is a list of unique words and whose columns or data points is a percentage that 
             represents the presence of that words in the underlying texts. 
    '''

    # Define the Initial Dataframe to be df_unique_words.  Then redefine once process has been completed 
    Create_dist_freq_dataframe = df_unique_words
    
    for file in List_txt_files:
    
        # Open & Read Text
        Open_file = open(file, 'rb')
        Read_file = Open_file.read()
        Str_file = str(Read_file)
    
        # Run Text through cleaning pipeline       
        Clean_tokenized_text = mldc.get_clean_text_using_text_clearning_pipeline(Str_file)
         
        # List Catch Values (1/0)
        List_match_count = []

        # Loop over list of unique words and see if there is a match in the cleaned text. 
        for token in df_unique_words.index:
            # If there is a match append 1 to the list. 
            if token in Clean_tokenized_text:
                List_match_count.append(1)
            # If there is no match, append 0
            else:
                List_match_count.append(0)
        
        # Create a column for each file and whose values is the list of matches (1/0)
        Create_dist_freq_dataframe[file] = List_match_count
    
    return Create_dist_freq_dataframe
    

Clean_tokenized_text = get_frequencyDist_legal_text(Dir_list, df_set_index)

In [None]:
# PART II:  IMPORT THE DATAFRAME OF ALL TEXT - EXPORT DATAFRAME W/ VALUES = % FOR THIS GROUP OF DOCS

In [5]:
def Merge_dataframe_columns_calc_percentage(Class_name, Dataframe):
    '''
    Input  = Dataframe that captured the matches for each text
    Output = Dataframe that captures the % of matches for all text files
    '''
    Dict_freq_dist = {}
    List_perct = []
    
    for row in Dataframe.index:
        Sum = sum(Dataframe.loc[row])
        Len = len(Dataframe.loc[row])
        Perct = Sum / Len
        Dict_freq_dist[row] = Perct      
    
    # Recreate Dataframe
    
    List_index = [1]
    
    df_perct = pd.DataFrame(Dict_freq_dist, index = List_index)
    df_transpose = pd.DataFrame.transpose(df_perct)
    df_transpose.columns = [str(Class_name)] 

    return df_transpose

In [None]:
# CREATE A FUNCTION FOR FORM THE LIST OF TEXT BY GROUP

In [6]:
def create_file_list_by_group(Excel_doc):
    '''
    Input   = Excel Doc
    Output  = Dataframe of a single file group. 
    '''
    
    # Create a Dictionary to capture class names & files
    
    Dict_classes_files = {}
    
    # Create Dataframe in memory
    df = pd.read_excel(Excel_doc)
    
    # Limit to File & Class
    df_file_class = df[['FILE','CLASS']]
    
    # Create list of classes
    List_file_classes = list(set(df_file_class['CLASS']))

    # Loop over class list to obtan
    
    for Class in List_file_classes:
        Def_class = df_file_class['CLASS'] == Class 
        Limit_df = df_file_class[Def_class]
        Dict_classes_files[Class] = Limit_df['FILE']
        
    return Dict_classes_files
    

In [None]:
# CREATE MASTER DATAFRAME FROM INDIVIDUAL CLASS WORD DISTRIBUTIONS

In [21]:
def final_code_classify_texts_word_freq():
    
    # Import Packages
    import nltk
    import os
    import pandas as pd
    
    # Change to the target directory
    os.chdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')
    Dir_list = os.listdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')
    
    # Create a list of unique tokens from the clean concatenated text file
    List_uniqueTokens = mldc.get_list_uniqueTokens_from_cleaned_concat_text()
    
    # Create the base dataframe with the index as the set of unique tokens/words
    dataframe_unique_tokens = mldc.get_dataframe_unique_tokens(List_uniqueTokens)
    
    # Define Excel doc where pre-classisified files are located
    Excel_file = 'Text_Classification_Wang.xlsx'
    
    # Create a dictionary whose keys are the file classes and values the files for each group. 
    Dict_files_by_class = create_file_list_by_group(Excel_file)
    
    # Create a Master Dataframe to house final values
    Master_dataframe = pd.DataFrame({}, index = list(dataframe_unique_tokens.index))
    
    # Loop over the dictionary keys
    for Class in Dict_files_by_class.keys():
        
        # Create a list of the files
        File_list = list(Dict_files_by_class[Class])
        
        # Feed each class list trough the word counter
        df_single_class_word_count = get_frequencyDist_legal_text(File_list, dataframe_unique_tokens)
        
        # Merge individual text file columns into one percentage for the given class
        df_merged_columns = Merge_dataframe_columns_calc_percentage(Class, df_single_class_word_count)
        
        # Merge individual dataframes into master dataframe
        Master_dataframe[Class] = df_merged_columns[Class]
        
    # Return the Master Dataframe with the word % for all classes. 
    return Master_dataframe       

In [30]:
Test = final_code_classify_texts_word_freq()

<_io.BufferedReader name='GA_Northern_1_15-cv-04249-ODE_4.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04456-SCJ_4.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04480-LMM_8.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04515-AT_3.txt'>
<_io.BufferedReader name='GA_Northern_1_16-cv-00072-TWT-AJB_3.txt'>
<_io.BufferedReader name='GA_Northern_1_16-cv-00162-WSD_3.txt'>
<_io.BufferedReader name='GA_Northern_1_16-cv-00200-RWS-JSA_3.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04453-SCJ_50.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04453-SCJ_52.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04260-CC_8.txt'>
<_io.BufferedReader name='GA_Northern_1_16-cv-00010-ELR_60.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04249-ODE_4.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04453-SCJ_61.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04456-SCJ_4.txt'>
<_io.BufferedReader name='GA_Northern_1_15-cv-04480-LMM_8.txt'>
<_io.BufferedReader name='GA_N

<_io.BufferedReader name='GA_Northern_1_16-cv-00200-RWS-JSA_33.txt'>


In [31]:
Test.head()

Unnamed: 0,Civil Cover Sheet,Motion,Summary,Junk,Docket Sheet,Final Reprot,Compliant,Order
sowell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
armor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capacity,0.0,0.0,0.0,0.0,0.0,0.0,0.095745,0.07377
ghetto,0.0,0.0,0.0,0.0,0.0,0.0,0.010638,0.008197
capri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the Master Dataframe to Excel_file

In [33]:
 # Import Personal Modules
os.chdir('C:\\Users\\Chris.Cirelli\\Desktop\\Python Programming Docs\\GitHub\\Bros-Coding-master\\Bros-Coding')
import Module_Part_II_Legal_Doc_Classification as mldc

In [34]:
# Write Master Dataframe to Excel 
os.chdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')
mldc.write_to_excel(Test, 'Master Word Dist Dataframe_with_DocketSheet')

In [25]:
os.listdir()

['.DS_Store',
 '.smbdeleteAAA200000017519f',
 '.smbdeleteAAA2000000175373',
 'Classify 100 Text Files.xlsx',
 'Concatenated Text File - Cleaned.txt',
 'Concatenated_text_file_Chris.txt',
 'Docket_sheets.xlsx',
 'Docket_sheets_concat_file_Chris.txt',
 'Docket_Sheet_Classification_Output.xlsx',
 'Docket_Sheet_Classification_Output_lower.xlsx',
 'Freq_Dist.xlsx',
 'GA_Northern_1_15-cv-04247-TWT_26.pdf',
 'GA_Northern_1_15-cv-04247-TWT_26.txt',
 'GA_Northern_1_15-cv-04247-TWT_32.pdf',
 'GA_Northern_1_15-cv-04247-TWT_32.txt',
 'GA_Northern_1_15-cv-04249-ODE_4.pdf',
 'GA_Northern_1_15-cv-04249-ODE_4.txt',
 'GA_Northern_1_15-cv-04258-AT_0.pdf',
 'GA_Northern_1_15-cv-04258-AT_0.txt',
 'GA_Northern_1_15-cv-04260-CC_0.pdf',
 'GA_Northern_1_15-cv-04260-CC_0.txt',
 'GA_Northern_1_15-cv-04260-CC_8.pdf',
 'GA_Northern_1_15-cv-04260-CC_8.txt',
 'GA_Northern_1_15-cv-04264-AT_0.pdf',
 'GA_Northern_1_15-cv-04264-AT_0.txt',
 'GA_Northern_1_15-cv-04281-RWS_0.pdf',
 'GA_Northern_1_15-cv-04281-RWS_0.txt',
 

In [29]:
File = 'GA_Northern_1_15-cv-04249-ODE_4.txt'
Open = open(File, 'rb')
Read = Open.read()

In [28]:
File = 'GA_Northern_1_15-cv-04247-TWT_32.txt'
Open = open(File, 'rb')
Read = Open.read()