In [None]:
# IMPORT PACKAGES

In [7]:
import nltk
import os
import re
import nltk
from nltk.corpus import wordnet as wn
import sys

In [None]:
# DEFINE TARGET DIRECTORY

In [10]:
'''Note:    The target directory should be changed by the user to point to the directory within which they have saved their
            text files'''

os.chdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Complaints')
Target_dir = os.listdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Complaints')

In [5]:
Target_dir     # test the code

['.ipynb_checkpoints',
 'GA_Northern_1_15-cv-04247-TWT_26.txt',
 'GA_Northern_1_15-cv-04247-TWT_32.txt',
 'GA_Northern_1_15-cv-04249-ODE_4.txt',
 'GA_Northern_1_15-cv-04258-AT_0.txt',
 'GA_Northern_1_15-cv-04260-CC_0.txt',
 'GA_Northern_1_15-cv-04260-CC_8.txt',
 'GA_Northern_1_15-cv-04264-AT_0.txt',
 'GA_Northern_1_15-cv-04281-RWS_0.txt',
 'GA_Northern_1_15-cv-04284-WSD_0.txt',
 'GA_Northern_1_15-cv-04285-AT_0.txt',
 'GA_Northern_1_15-cv-04288-AT_0.txt',
 'GA_Northern_1_15-cv-04298-MHC_0.txt',
 'GA_Northern_1_15-cv-04303-ODE_0.txt',
 'GA_Northern_1_15-cv-04310-LMM_0.txt']

In [None]:
# IMPORT CLEANING FUNCTIONS

In [4]:
'''Note:    These are the cleaning modules that were created to clean the text of obvious errors or words that will likely 
            not be material to our ultimate analysis.
            These functions constitute the underlying code for the subsequent text-cleaning pipeline program that is used 
            in the ultimate code (see end of document)'''

def clean_text_4_classification_remove_backslashes(Text_file):
    '''The purpose of this function is to clean the text files of numerous instances of backslashes 
    in order to prepare them for the regex expression search. 
    Input  =   Single text file 
    Output =   Single text file cleaned 
    '''
    
    # Convert text to lowercase
    Text_file_lower = Text_file.lower()
    
    # Split any values in the text on the backslash.  The Text_split_slash should return a list. 
    Text_split_slash = Text_file_lower.split('\\')
        
    # Return the list to a text. 
    Text_rejoined = ' '.join(Text_split_slash)
                
    # Return a list of the cleaned text docs. 
    return Text_rejoined


def clean_text_4_classification_remove_nABC(Text_file):
    '''The purpose of this function is to remove the 'n' that appears before words that begin with an upper case letter.  
    Input  =   Single txt file
    Output =   Clean list of tokens from original txt file
    '''
    # Define the regex expression that you want to search for. 
    Regex_exp = re.compile('n[A-Z*]')
    
    # Create a list to capture the tokens once they are cleaned 
    Text_tokenized_cleaned = []
            
    # Tokenize the given text
    Text_tokenized = nltk.word_tokenize(Text_file)
            
    # Run for loop over tokens for a given text. 
    for token in Text_tokenized:

        # Search for the regex expression
        Regex_search = re.search(Regex_exp, token)
                
        # Test if there was match (None = no match)
        if Regex_search != None:
                     
            # If there was a match, take all letters after the 'n'.   
            token_cleaned = token[1:]
                    
            Text_tokenized_cleaned.append(token_cleaned)
                        
        # If the Regex_search returned None, return the token back to the Text_tokenized_cleaned list
        else:
            Text_tokenized_cleaned.append(token)
    
    # Return a list of clean tokens
    return Text_tokenized_cleaned


def create_dict_punct():
    '''The purpose of this function is to simply create a dictionary of punctuation symbols to use
    in other functions
    Input  = None
    Output = Dict whose keys are the distinct punctuation marks. 
    '''
    import string
    Dict = {}
    Punct = string.punctuation
    for x in Punct:
        Dict[x] = ''
    return Dict 

def strip_punctuation(Token_list):
    '''The purpose of this function is to strip the punctuation from a list of tokens. 
    Input  =  List of tokens
    Output =  List of tokens absent punctuation.  
    '''
    # Import punctuation dictionary
    Dict_punct = create_dict_punct()

    # Create a list to capture the cleaned tokens
    Clean_token_list = []    
        
    # Iterate over the tokens in the txt file
    for x in Token_list:
        if x not in Dict_punct:
            # Append tokens to clean token list
            Clean_token_list.append(x)
    
    # Return a list of cleaned text
    return Clean_token_list

def strip_two_letter_words(Token_list):
    '''The purpose of this function is to remove any two letter tokens from a list of tokens.
    Input  =   List of tokens
    Output =   List of tokens absent two letter words'''
    
    List = [x for x in Token_list if len(x) > 2]
    
    return List

def create_dict_stopwords():
    '''The purpose of this code is to create a dictionary of stop words. 
    Input  = None
    Output = Dictionary of stop words'''
    
    from nltk.corpus import stopwords
    Stopwords = stopwords.words('english')                  
    Dict = {}
    for x in Stopwords:
        Dict[x] = ''
    return Dict

def strip_stop_words(Token_list):
    ''' The purpose of this code is to strip the stop words from a given text
    Input  = List of tokens 
    Outpu  = Text clean of stop words'''
    
    stop_words = create_dict_stopwords()
    List = []
    for x in Token_list:
        if x not in stop_words:
            List.append(x)
    return List


In [None]:
# CREATE TEXT CLEANING PIPELINE

In [5]:
'''This pipeline will be placed inside a larger function that loops over the Target Directory, identifies the text files,
    opens them, etc, and also captures the target file, tokenized text and statistics.  We'll need to create these 
    variables within the master function. 
'''

def text_clearning_pipeline_Input_4_Error_Checker_Function(Text_file):
    '''The purpose of this function is to prepare text for use with the Error Checker Program
    Input  =  Single text file
    Output =  List of clean tokens representing a single text. 
    '''
    # Run Clearning Pipeline (These functions are taken from the ones define above)
    txt_strip_backslashes = clean_text_4_classification_remove_backslashes(Text_file)
    txt_strip_nABC = clean_text_4_classification_remove_nABC(txt_strip_backslashes)
    txt_strip_punct = strip_punctuation(txt_strip_nABC)
    txt_strip_2_letter_words = strip_two_letter_words(txt_strip_punct)
    txt_strip_stop_words = strip_stop_words(txt_strip_2_letter_words)
        
    # Return List of clean tokenized text
    return txt_strip_stop_words
    

In [15]:
def Amanda(Target_dir):
    
    # Create Lists And Objects to Capture Values Generated From the Subjecuent Functions
    List_Text_files_cleaned = []
    
    # Get Text Files in Dir Only
    get_txt_files_only_in_dir = [file for file in Target_dir if '.txt' in file]
    
    
    # Loop over directory, Identify Text Documents, Open+Read Text Documents
    for File in get_txt_files_only_in_dir:
        
        # Convert Files to Text & Append Text to List
        File_open = open(File, 'rb')
        Text_bytes = File_open.read()
        Text_str = str(Text_bytes)
        File_name = str(File)
        
        
        # FUNCTION #1 - TEXT CLEARNING PIPELINE
        
        get_cleaned_text_tokenized = text_clearning_pipeline_Input_4_Error_Checker_Function(Text_str)
        
        # Append Cleaned Text to List to be used for the Get Frequency Distribution Function. 
        Append_file_name = List_Text_files_cleaned.append(File_name)
        List_Text_files_cleaned.append(get_cleaned_text_tokenized)
        
    
    return List_Text_files_cleaned
    

In [16]:
Test = Amanda(Target_dir)

In [17]:
Test

['GA_Northern_1_15-cv-04247-TWT_26.txt',
 ["b'case",
  '1:15-cv-04247-twt',
  'document',
  'filed',
  '10/24/16',
  'page',
  'nin',
  'united',
  'states',
  'district',
  'court',
  'nfor',
  'northern',
  'district',
  'georgia',
  'nmichael',
  'mosely',
  'behalf',
  'nhimself',
  'similarly',
  'situated',
  'nplaintiff',
  'civil',
  'action',
  '1:15',
  'xe2',
  'x80',
  'x94cv-04247-',
  'twt',
  'npittman',
  'consultants',
  'inc',
  'georgia',
  'llc',
  'ndefendants',
  'norder',
  'nthe',
  'court',
  'considered',
  'defendant',
  'xe2',
  'x80',
  'x99s',
  'motion',
  'extension',
  'time',
  'xef',
  'xac',
  'x81le',
  'response',
  'plaintiff',
  'xe2',
  'x80',
  'x99s',
  'motion',
  'summary',
  'judgment',
  'nthe',
  'court',
  'hereby',
  'grants',
  'defendant',
  'xe2',
  'x80',
  'x99s',
  'motion',
  'deadline',
  'xef',
  'xac',
  'x81ling',
  'nresponse',
  'stated',
  'case',
  'october',
  '2016.',
  'norder',
  'entered',
  'day',
  '24th',
  'day',