In [1]:
import re
import os
import importlib.util
import pymupdf
import sys
import pathlib
import CONSTANTS as C

In [6]:
# Run all the .py files in the folder getInfo with the text files in the folder PyMuPDFExampleOutputs as arguments

# Define the folders
py_folder_path = 'getInfo' 
txt_folder_path = 'PyMuPDFExampleOutputs'  

txt_filenames = [f for f in os.listdir(txt_folder_path) if f.endswith('.txt')]
py_filenames = [f for f in os.listdir(py_folder_path) if f.endswith('.py') and f != "__init__.py"]
txt_filenames.sort()
py_filenames.sort()

for txt_filename in txt_filenames:
    txt_full_path = os.path.join(txt_folder_path, txt_filename)
    was_successful = False 
    for py_filename in py_filenames:
        if os.stat(os.path.join(py_folder_path, py_filename)).st_size == 0: continue
        try:
            py_full_path = os.path.join(py_folder_path, py_filename)
            
            # Dynamically import the .py file as a module
            spec = importlib.util.spec_from_file_location(py_filename[:-3], py_full_path)
            py_file = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(py_file)
            
            result = py_file.get_info(txt_full_path)
            
            # If it works, print a success message
            print(f"{py_filename} executed successfully for {txt_filename} with result: {result}\n")
            if was_successful: 
                print(f"!!      Another file worked for {txt_filename}")
            was_successful = True
        
        except C.InfoNotFound as e:
            # Handle InfoNotFound errors that occur during method execution
            # if py_filename == "capital_one_1.py": print(f"{py_filename} did not execute on {txt_filename}: {e}")
            pass
        except Exception as e:
            # Handle any errors that occur during method execution
            # if py_filename == "capital_one_1.py": print(f"{py_filename} did not execute on {txt_filename}: {e}")
            pass


    # if no .py files work
    if not was_successful:
        # UNCOMMENT
        # print(f"No .py files executed successfully for {txt_filename}")
        continue

ally_1.py executed successfully for ally_1.txt with result: 2018.03.06-2018.04.05 7583

bank_of_america_1.py executed successfully for bank_of_america_1.txt with result: 2022.11.14-2022.12.13 3997

bank_of_america_2.py executed successfully for bank_of_america_2.txt with result: 2020.11.30-2020.12.30 9789

becu_1.py executed successfully for becu_1.txt with result: 2016.11.19-2016.12.16 2596

bmo_1.py executed successfully for bmo_1.txt with result: 2021.11.6-2021.12.5 2900

capital_one_1.py executed successfully for capital_one_1.txt with result: 2021.11.20-2021.11.20 9813

capital_one_2.py executed successfully for capital_one_1.txt with result: 2021.10.22-2021.11.20 9813

!!      Another file worked for capital_one_1.txt
capital_one_1.py executed successfully for capital_one_2.txt with result: 2023.03.02-2023.03.02 0103

capital_one_2.py executed successfully for capital_one_2.txt with result: 2023.02.03-2023.03.02 0103

!!      Another file worked for capital_one_2.txt


In [17]:
for f in os.listdir("ExampleStatements"):
    print(f)

first_citizens_1.pdf
bank_of_america_1.pdf
us_1.pdf
chase_4.pdf
citi_1.pdf
commerce.pdf
rbs_2_cut.pdf
rbs_1.pdf
varo.pdf
ally_1.pdf
capital_one_2.pdf
td_1.pdf
regions.pdf
citi_2.pdf
becu_1.pdf
td_2.pdf
chase_1.pdf
capital_one_1.pdf
rbs_3.pdf
wells_fargo_2.pdf
pnc_1.pdf
bank_of_america_2.pdf
bmo_1.pdf
chase_2.pdf
wells_fargo_1.pdf
truist.pdf
us_2.pdf
chase_3.pdf


In [None]:
# getting the text from all the pdfs in the folder

for i, filename in enumerate(os.listdir("ExampleStatements")):
    
    fname = "ExampleStatements/" + filename  # get document filename
    print(fname)
    with pymupdf.open(fname) as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])
    # write as a binary file to support non-ASCII characters
    f = open("PyMuPDFExampleOutputs/" + filename[:-4] + ".txt", "a")
    f.write(text)
    f.close

In [20]:
# Create blank .py files
# For every .txt and .pdf file in PYMuPDFExampleOutputs and ExampleStatements, respectively, create a blank .py in getInfo with the same filename
pdf_files = [f[:-4] for f in os.listdir("ExampleStatements") if f.endswith('.pdf')]
py_files = [f[:-4] for f in os.listdir("getInfo") if f.endswith('.py')]
txt_files = [f[:-4] for f in os.listdir("PyMuPDFExampleOutputs") if f.endswith('.txt')]
pdf_files.sort()
py_files.sort()
txt_files.sort() 
for pdf, txt in zip(pdf_files, txt_files):
    if pdf == txt and pdf not in py_files:
        try:
            with open(f"getInfo/{pdf}.py", "x") as f:
                print(f"Created {pdf}.py")
        except:
            print(f"Did NOT create {pdf}.py")
    else: 
        print(f"ERROR: {pdf} does not match {txt} or {pdf} is already in getInfo")

Did NOT create ally_1.py
Did NOT create bank_of_america_1.py
Did NOT create bank_of_america_2.py
Did NOT create becu_1.py
Did NOT create bmo_1.py
Created capital_one_1.py
Created capital_one_2.py
Created chase_1.py
Created chase_2.py
Created chase_3.py
Created chase_4.py
Created citi_1.py
Created citi_2.py
Created commerce.py
Created first_citizens_1.py
Created pnc_1.py
Created rbs_1.py
Created rbs_2_cut.py
Created rbs_3.py
Created regions.py
Created td_1.py
Created td_2.py
Created truist.py
Created us_1.py
Created us_2.py
Created varo.py
Created wells_fargo_1.py
Created wells_fargo_2.py


In [4]:
# Specific example pymupdf confirmation  
fname = "ExampleStatements/chase_3.pdf"  # get document filename
with pymupdf.open(fname) as doc:  # open document
    # print(doc, type(doc), len(doc))
    # print([page.get_text() for page in doc])   
    text = chr(12).join([page.get_text() for page in doc])
# write as a binary file to support non-ASCII characters
with open("PyMuPDFExampleOutputs/chase_3.txt", "w") as f:
    f.write(text)
    print(text, type(text))

x
 $
.
CUSTOMER SERVICE
In U.S.
1-800-945-2000
Español
1-888-446-3308
TDD
1-800-955-8060
Pay by phone 1-800-436-7958
Outside U.S. call collect
1-302-594-8200
ACCOUNT INQUIRIES
P.O. Box 15298
Wilmington, DE 19850-5298
PAYMENT ADDRESS
P.O. Box 94014
Palatine, IL  60094-4014
VISIT US AT:
www.chase.com/creditcards
725 
N 
Z 
17 
01/18/18
Page 1 of 2
05686
MA MA 33034
10810018660003304584 
21034 BEX Z 10808 D 
DUPAGE AUTO BATH
27W230 NORTH AVE
WEST CHICAGO, IL 60185-1531
630-504-0597
Previous Balance
$1,032.66
Payment, Credits
-$1,732.50
Purchases, Cash, Debits
+$2,400.82
New Balance
$1,700.98
Total Credit Line
$13,020
Available Credit
$12,338
Cash Access Line
$14,045
Available for Cash
$12,088
1790001 FIS33335 D 6 0285 
INS12675 INS13037        
VISA CREDIT BUSINESS SUMMARY
Account Number:  4154 xxxx xxxx 9614
CHASE BUSINESSCARD REWARDS SUMMARY
__________________________________________________________
Previous balance
$14.81
Rebates earned from gas purchases
$12.72
Rebates earned from pur

In [22]:
# Test C.pattern...s on every line in every text file

for i, filename in enumerate(os.listdir("PyMuPDFExampleOutputs")):
    fname = "PyMuPDFExampleOutputs/" + filename  # get document filename
    with open(fname, "r") as f:
        text = f.readlines()
    first = True
    for line in text:
        # match_list = C.pattern_month_word_day.findall(line)
        match_list = C.pattern_forward_slash.findall(line)
        # if match_list:
        #     if first:
        #         print(f"File: {filename}")
        #         first = False
        #     print(len(match_list), line.strip("\n"))
        if len(match_list) > 1:
            if first:
                print(f"File: {filename}")
                first = False
            print(len(match_list), match_list, line.strip("\n")) 
    if not first:
        print("\n")
    

File: pnc_1.txt
2 ['09/01/2016', '09/30/2016'] 09/01/2016 to 09/30/2016


File: wells_fargo_2.txt
2 ['01/22/2020', '02/22/2020'] Fee period 01/22/2020 - 02/22/2020


File: wells_fargo_1.txt
2 ['07/24/2015', '08/25/2015'] Fee period 07/24/2015 - 08/25/2015


File: becu_1.txt
2 ['11/19/2016', '12/16/2016'] Statement Period: 11/19/2016 - 12/16/2016
2 ['11/19/2016', '12/16/2016'] Statement Period: 11/19/2016 - 12/16/2016




In [53]:
# test update_date
line_both = "lkjfl Beginning Balance on May 3, 2003  Ending Balance on June 4, 2022 res"
line_both_non = "5/03/03 -  06/4/2022" # non preface
line_start = "Beginning Balance on May 3, 2003"
line_end = "Ending Balance on June 4, 2022"
line_none = "flkfnlsk"
start_right = "2003.05.03"
end_right = "2022.06.04"
start_wrong = "1900.12.12"
end_wrong = "1940.09.09"

# print(update_date("", "", line_none), "\n")

# print(update_date("", "", line_start))
# print(update_date("", "", line_end), "\n")

# print(update_date(start_right, "", line_none))
# print(update_date("", end_right, line_none), "\n")

# print(update_date(start_right, "", line_start))
# print(update_date("", end_right, line_end), "\n")

# print(update_date(start_right, "", line_end))
# print(update_date("", end_right, line_start), "\n")

# print(update_date("", "", line_both), "\n")

# print(update_date(start_right, "", line_both))
# print(update_date("", end_right, line_both), "\n")

# print(update_date(start_right, end_right, line_none), "\n")

# print(update_date(start_right, end_right, line_start), "\n")

# print(update_date(start_right, end_right, line_both))


# print(update_date("", "", line_both_non), "\n")

# print(update_date(start_right, "", line_both_non))
# print(update_date("", end_right, line_both_non), "\n")

# print(update_date(start_right, end_right, line_both_non))


# ERROR
# print(update_date(start_wrong, "", line_both))
# print(update_date("", end_wrong, line_both), "\n")

# print(update_date(start_wrong, "", line_start))
# print(update_date("", end_wrong, line_end), "\n")

# print(update_date(start_wrong, end_wrong, line_both))


# print(update_date(start_wrong, "", line_both_non))
# print(update_date("", end_wrong, line_both_non), "\n")

# print(update_date(start_wrong, end_wrong, line_both_non))


Exception: In validating potential_range_start_date, potential_date 2003.05.03 != current_date 1900.12.12 in line: 
	 5/03/03 -  06/4/2022

In [39]:
def preface_2_date(preface, line, match_list):
    before_position = line.find(preface)
    if before_position == -1: return ""
    date_position = before_position + len(preface)
    for date, date_start  in match_list:
        if date_start == date_position:
            return date
    return ""
        
def validate_potential_date(potential_date, current_date):
    if current_date and potential_date and current_date != potential_date: 
        raise C.InfoInconsistent(f"potential_date {potential_date} != current_date {current_date}")
    else: 
        return

def update_date(start_date, end_date, line):
    for index, DATE_PATTERN in enumerate(C.DATE_PATTERNS):
        match_list = [(m[0], m.start(0)) for m in DATE_PATTERN.finditer(line)]

        if not match_list: continue

        if len(match_list) > 2: raise Exception(f"More than 2 dates on one line - \
                                                {len(match_list)} dates in line: \n\t {line}")
        
        if len(match_list) == 2:
            potential_range_start_date = C.clean_date(match_list[0][0], index)
            potential_range_end_date = C.clean_date(match_list[1][0], index)
            try: 
                validate_potential_date(potential_range_start_date, start_date)
            except C.InfoInconsistent as e:
                raise Exception(f"In validating potential_range_start_date, {e} in line: \n\t {line}")
            try:
                validate_potential_date(potential_range_end_date, end_date)
            except C.InfoInconsistent as e:
                raise Exception(f"In validating potential_range_end_date, {e} in line: \n\t {line}")
            start_date = potential_range_start_date if potential_range_end_date else start_date
            end_date = potential_range_end_date if potential_range_end_date else end_date
            
        for start_preface in C.start_prefaces:
            potential_preface_start_date = C.clean_date(preface_2_date(start_preface, line, match_list), index)
            try:
                validate_potential_date(potential_preface_start_date, start_date)
            except C.InfoInconsistent as e:
                raise Exception(f"In validating potential_start_date, {e} in line: \n\t {line}")
            start_date = potential_preface_start_date if potential_preface_start_date else start_date
                
        for end_preface in C.end_prefaces:
            potential_preface_end_date = C.clean_date(preface_2_date(end_preface, line, match_list), index)
            try:
                validate_potential_date(potential_preface_end_date, end_date)
            except C.InfoInconsistent as e:
                raise Exception(f"In validating potential_end_date, {e} in line: \n\t {line}")
            end_date = potential_preface_end_date if potential_preface_end_date else end_date
    return start_date, end_date 