In [None]:
'''Purpose

The purpose of this code is to match the text files that have been classified with the master Excel spreadsheet of all 
files.

Approach:
- Create a dictionary object whose keys are the names of the folders that contain the text files. 
- Create a regular expression that identifies the unique pattern in each text file name. 
- Read the Master file into Python as a dataframe. 
- Iterate over the list of text files, match the expression to the column. 
- Create a new column that will capture the file name and match. 
- Append that column to the Master File dataframe in memory. 
- Write back to Excel. 
'''

In [None]:
# IMPORT PACKAGES

In [5]:
import os
import re
import pandas as pd

In [None]:
# DEFINE TARGET DIRECTORY

In [32]:
# Raw string: the Windows path contains backslash sequences (\L, \T, \F) that are
# invalid escapes in a normal string literal and trigger warnings on newer Pythons.
os.chdir(r'I:\Legal Analytics Sprint-S18\Team Folders\Team Wang\Files Converted to Txt')

In [None]:
# DEFINE LIST OF TEXT FILES

In [33]:
# Keep only names that END in '.txt'; a substring test would also accept
# names such as 'notes.txt.bak' or 'a.txtfile'.
List_txt_files = [name for name in os.listdir() if name.endswith('.txt')]

In [34]:
# Preview a slice of the collected file names as a sanity check.
List_txt_files[3:10]

['GA_Northern_1_15-cv-04247-TWT_26.txt',
 'GA_Northern_1_15-cv-04247-TWT_32.txt',
 'GA_Northern_1_15-cv-04249-ODE_4.txt',
 'GA_Northern_1_15-cv-04258-AT_0.txt',
 'GA_Northern_1_15-cv-04260-CC_0.txt',
 'GA_Northern_1_15-cv-04260-CC_8.txt',
 'GA_Northern_1_15-cv-04264-AT_0.txt']

In [None]:
# READ IN AS A DATAFRAME THE MASTER CASE FILE

In [11]:
# Move to the folder that holds the master spreadsheet and load it.
os.chdir(r'C:\Users\Chris.Cirelli\Desktop')
File = 'Master_case_list.xlsx'
df_master_case = pd.read_excel(File)

# Keep the case-number column on its own for the regex checks that follow.
Case_number = df_master_case.loc[:, 'case_number']

In [None]:
# CREATE A REGEX EXPRESSION TO EXTRACT CASE NUMBER

In [25]:
# Case code such as '15-cv-04247'. '+' requires at least one digit on each side,
# so a bare '-cv-' (which '*' would accept) is not treated as a case code.
Regex = re.compile(r'[0-9]+-cv-[0-9]+')

In [None]:
# Test on Master File Case Number Column 

In [46]:
# Lazily apply the compiled pattern to every master-file case number.
Test_case_number = map(Regex.search, Case_number)

In [49]:
# Show the first search result; note that next() consumes it from the iterator.
print(next(Test_case_number))

<_sre.SRE_Match object; span=(2, 13), match='10-cv-00002'>


In [50]:
# Test on List of text files

In [45]:
# Lazily apply the compiled pattern to every text-file name.
Test_file_list = map(Regex.search, List_txt_files)

In [44]:
# Show the first search result; note that next() consumes it from the iterator.
print(next(Test_file_list))

<_sre.SRE_Match object; span=(14, 25), match='15-cv-04247'>


In [None]:
# DEFINE MATCH FUNCTION

In [120]:
# Case code such as '15-cv-04247'. '+' (not '*') so that a bare '-cv-' is never a code.
_CASE_CODE_PATTERN = re.compile(r'[0-9]+-cv-[0-9]+')


def _extract_case_code(text):
    '''Return the first case code (e.g. '15-cv-04247') found in text, or None if there is none.'''
    # Non-string cells (e.g. NaN for an empty spreadsheet cell) cannot contain a code.
    if not isinstance(text, str):
        return None
    found = _CASE_CODE_PATTERN.search(text)
    return found.group() if found else None


def match_function(
    legal_docs_dir=r'C:\Users\Chris.Cirelli\Desktop\Python Programming Docs\GSU\Sprint Project\Legal Docs',
    master_file=r'C:\Users\Chris.Cirelli\Desktop\Master_case_list.xlsx',
):
    '''Match the categorized text files against the cases listed in the Master case file.

    A case code (digits, '-cv-', digits) is extracted from the name of every .txt file in
    legal_docs_dir and from every value of the master file's 'case_number' column.  A master
    row is classified when its case code also appears among the text-file names.

    Input   = 1.) legal_docs_dir: folder holding the .txt files; the folder's own name is used
                  as the classification label.
              2.) master_file: path of the Master Case File Excel document; it must contain a
                  'case_number' column.
    Output  = The master DataFrame with two added columns:
                  'Case_Code'           - extracted case code, or 'no match' when none is found.
                  'File Classification' - the folder name when a text file shares the row's
                                          case code, otherwise 'no match found'.
              Nothing is written to disk and the working directory is left unchanged.
    '''
    # Get list of text files; endswith() avoids picking up names like 'x.txt.bak'.
    text_files = [name for name in os.listdir(legal_docs_dir) if name.endswith('.txt')]

    # The files are bucketed into folders named after their category, so the
    # folder name is the category.  basename/abspath works for any path separator
    # the platform uses and ignores a trailing separator.
    text_category = os.path.basename(os.path.abspath(legal_docs_dir))

    # Read the Master Case File into a dataframe
    df_master_case = pd.read_excel(master_file)

    # Case code per master row (None when the row has no recognizable code).
    master_codes = [_extract_case_code(value) for value in df_master_case['case_number']]

    # Case codes present among the text files.  Files without a code are left out so
    # they can never "match" a master row that also lacks one; a set gives O(1) lookups.
    file_codes = {_extract_case_code(name) for name in text_files}
    file_codes.discard(None)

    # Column used as the matching value.
    df_master_case['Case_Code'] = [
        code if code is not None else 'no match' for code in master_codes
    ]

    # Classification column: folder name on a match, placeholder otherwise.
    df_master_case['File Classification'] = [
        text_category if code in file_codes else 'no match found'
        for code in master_codes
    ]

    return df_master_case
    
    

In [122]:
# Run the matching step and keep the annotated master DataFrame.
Dataframe = match_function()

In [125]:
# Pull out the classification column and keep only the rows that were matched.
Dataframe_File_class = Dataframe.loc[:, 'File Classification']
Limit = Dataframe_File_class.ne('no match found')
New_frame = Dataframe_File_class.loc[Limit]

In [126]:
# Display the classifications of the rows that matched a text file.
New_frame

3818    Legal Docs
3819    Legal Docs
3820    Legal Docs
3821    Legal Docs
3822    Legal Docs
3823    Legal Docs
3824    Legal Docs
3825    Legal Docs
3826    Legal Docs
3827    Legal Docs
3828    Legal Docs
3829    Legal Docs
3854    Legal Docs
Name: File Classification, dtype: object