In [2]:
import re
import os
import importlib.util
import pymupdf
import sys
import pathlib
import CONSTANTS as C


In [6]:
# Run all the .py files in the folder getInfo with the text files in the folder PyMuPDFExampleOutputs as arguments

# Define the folders
py_folder_path = 'getInfo' 
txt_folder_path = 'PyMuPDFExampleOutputs'  

txt_filenames = [f for f in os.listdir(txt_folder_path) if f.endswith('.txt')]
py_filenames = [f for f in os.listdir(py_folder_path) if f.endswith('.py') and f != "__init__.py"]
txt_filenames.sort()
py_filenames.sort()

for txt_filename in txt_filenames:
    txt_full_path = os.path.join(txt_folder_path, txt_filename)
    was_successful = False 
    for py_filename in py_filenames:
        if os.stat(os.path.join(py_folder_path, py_filename)).st_size == 0: continue
        try:
            py_full_path = os.path.join(py_folder_path, py_filename)
            
            # Dynamically import the .py file as a module
            spec = importlib.util.spec_from_file_location(py_filename[:-3], py_full_path)
            py_file = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(py_file)
            
            result = py_file.get_info(txt_full_path)
            
            # If it works, print a success message
            print(f"{py_filename} executed successfully for {txt_filename} with result: {result}\n")
            if was_successful: 
                print(f"!!      Another file worked for {txt_filename}")
            was_successful = True
        
        except C.InfoNotFound as e:
            # Handle InfoNotFound errors that occur during method execution
            # if py_filename == "capital_one_1.py": print(f"{py_filename} did not execute on {txt_filename}: {e}")
            pass
        except Exception as e:
            # Handle any errors that occur during method execution
            # if py_filename == "capital_one_1.py": print(f"{py_filename} did not execute on {txt_filename}: {e}")
            pass


    # if no .py files work
    if not was_successful:
        # UNCOMMENT
        # print(f"No .py files executed successfully for {txt_filename}")
        continue

ally_1.py executed successfully for ally_1.txt with result: 2018.03.06-2018.04.05 7583

bank_of_america_1.py executed successfully for bank_of_america_1.txt with result: 2022.11.14-2022.12.13 3997

bank_of_america_2.py executed successfully for bank_of_america_2.txt with result: 2020.11.30-2020.12.30 9789

becu_1.py executed successfully for becu_1.txt with result: 2016.11.19-2016.12.16 2596

bmo_1.py executed successfully for bmo_1.txt with result: 2021.11.6-2021.12.5 2900

capital_one_1.py executed successfully for capital_one_1.txt with result: 2021.11.20-2021.11.20 9813

capital_one_2.py executed successfully for capital_one_1.txt with result: 2021.10.22-2021.11.20 9813

!!      Another file worked for capital_one_1.txt
capital_one_1.py executed successfully for capital_one_2.txt with result: 2023.03.02-2023.03.02 0103

capital_one_2.py executed successfully for capital_one_2.txt with result: 2023.02.03-2023.03.02 0103

!!      Another file worked for capital_one_2.txt


In [17]:
for f in os.listdir("ExampleStatements"):
    print(f)

first_citizens_1.pdf
bank_of_america_1.pdf
us_1.pdf
chase_4.pdf
citi_1.pdf
commerce.pdf
rbs_2_cut.pdf
rbs_1.pdf
varo.pdf
ally_1.pdf
capital_one_2.pdf
td_1.pdf
regions.pdf
citi_2.pdf
becu_1.pdf
td_2.pdf
chase_1.pdf
capital_one_1.pdf
rbs_3.pdf
wells_fargo_2.pdf
pnc_1.pdf
bank_of_america_2.pdf
bmo_1.pdf
chase_2.pdf
wells_fargo_1.pdf
truist.pdf
us_2.pdf
chase_3.pdf


In [None]:
# getting the text from all the pdfs in the folder

for i, filename in enumerate(os.listdir("ExampleStatements")):
    
    fname = "ExampleStatements/" + filename  # get document filename
    print(fname)
    with pymupdf.open(fname) as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])
    # write as a binary file to support non-ASCII characters
    f = open("PyMuPDFExampleOutputs/" + filename[:-4] + ".txt", "a")
    f.write(text)
    f.close

In [20]:
# Create blank .py files
# For every .txt and .pdf file in PYMuPDFExampleOutputs and ExampleStatements, respectively, create a blank .py in getInfo with the same filename
pdf_files = [f[:-4] for f in os.listdir("ExampleStatements") if f.endswith('.pdf')]
py_files = [f[:-4] for f in os.listdir("getInfo") if f.endswith('.py')]
txt_files = [f[:-4] for f in os.listdir("PyMuPDFExampleOutputs") if f.endswith('.txt')]
pdf_files.sort()
py_files.sort()
txt_files.sort() 
for pdf, txt in zip(pdf_files, txt_files):
    if pdf == txt and pdf not in py_files:
        try:
            with open(f"getInfo/{pdf}.py", "x") as f:
                print(f"Created {pdf}.py")
        except:
            print(f"Did NOT create {pdf}.py")
    else: 
        print(f"ERROR: {pdf} does not match {txt} or {pdf} is already in getInfo")

Did NOT create ally_1.py
Did NOT create bank_of_america_1.py
Did NOT create bank_of_america_2.py
Did NOT create becu_1.py
Did NOT create bmo_1.py
Created capital_one_1.py
Created capital_one_2.py
Created chase_1.py
Created chase_2.py
Created chase_3.py
Created chase_4.py
Created citi_1.py
Created citi_2.py
Created commerce.py
Created first_citizens_1.py
Created pnc_1.py
Created rbs_1.py
Created rbs_2_cut.py
Created rbs_3.py
Created regions.py
Created td_1.py
Created td_2.py
Created truist.py
Created us_1.py
Created us_2.py
Created varo.py
Created wells_fargo_1.py
Created wells_fargo_2.py


In [63]:
# Specific example pymupdf confirmation  
fname = "ExampleStatements/bank_of_america_2.pdf"  # get document filename
with pymupdf.open(fname) as doc:  # open document
    # print(doc, type(doc), len(doc))
    # print([page.get_text() for page in doc])   
    text = chr(12).join([page.get_text() for page in doc])
# write as a binary file to support non-ASCII characters
with open("PyMuPDFExampleOutputs/bank_of_america_2_confirmation.txt", "w") as f:
    f.write(text)
    print(text, type(text))

Customer service information
Customer service information
Customer Service: 
Customer Service: 1.800.432.1
1.800.432.1000
000
TDD/TTY users only: 
TDD/TTY users only: 1.800.288.
1.800.288.4408
4408
En Español: 
En Español: 1.800.688.60
1.800.688.6086
86
bankofamerica.com
bankofamerica.com
Bank of America, N.A.
Bank of America, N.A.
P.O. Box 25118
P.O. Box 25118
Tampa, FL 33622-5118
Tampa, FL 33622-5118
TELLDRICK WILLIAMS
5654 HARDWOOD FOREST DR
HOUSTON, TEXAS 77088
P.O. Box 15284
P.O. Box 15284
 Wilmington, D
 Wilmington, DE 19850
E 19850
Page 1 of 3
Account number: 0056  7648  9789
$35,368.25
2,538.00
-1,144.64
-2,106.42
-18,000.00
-0.00
Y
Yo
ou
ur
r 
 B
Bo
of
fA
A 
 C
Co
or
re
e 
 C
Ch
he
ec
ck
ki
in
ng
g
for November 30, 2020 to December, 2020 
TELLDRICK WILLIAMS
Account summary
Beginning balance on November 30, 2020 
Deposits and other additions
ATM and debit card subtractions Other 
subtractions
Checks
Service fees
Ending balance on December 30, 2020
$16,655.19
TELLDRICK WILLIAMS

In [4]:
# Test C.pattern_date_month_word on every line in every text file

for i, filename in enumerate(os.listdir("PyMuPDFExampleOutputs")):
    fname = "PyMuPDFExampleOutputs/" + filename  # get document filename
    with open(fname, "r") as f:
        text = f.readlines()
    print(f"File: {filename}")
    for line in text:
        match_list = C.pattern_date_month_word.findall(line)
        # if match_list:
        #     print(len(match_list), line.strip("\n"))
        if len(match_list) > 1:
            print(match_list, line.strip("\n")) 
    print("\n")
    

File: chase_2.txt
['August 01, 2022', 'August 31, 2022']                                          August 01, 2022 through August 31, 2022 
['August 01, 2022', 'August 31, 2022']                                          August 01, 2022 through August 31, 2022 
['August 01, 2022', 'August 31, 2022']                                          August 01, 2022 through August 31, 2022 
['August 01, 2022', 'August 31, 2022']                                          August 01, 2022 through August 31, 2022 


File: commerce.txt


File: bmo_1.txt
['Nov. 6, 2021', 'Dec. 5, 2021'] Nov. 6, 2021 - Dec. 5, 2021
['Nov. 6, 2021', 'Dec. 5, 2021'] Nov. 6, 2021 - Dec. 5, 2021


File: first_citizens_1.txt


File: chase_4.txt
['July 1, 2008', 'July 31, 2008'] July 1, 2008 through July 31, 2008
['July 1, 2008', 'July 31, 2008'] July 1, 2008 through July 31, 2008
['July 1, 2008', 'July 31, 2008'] July 1, 2008 through July 31, 2008
['July 1, 2008', 'July 31, 2008'] July 1, 2008 through July 31, 2008


File: rbs_

In [9]:
example_0 = "Jan 1, 1900"
example_1 = "Jan 1, 1900 wfgfgws Feb 1, 1900"
example_2 = "Jan 1 "
example_3 = "Jan 1 1900 - Feb 2 1901"
example_4 = "January 1 1900-February 2 1901"
example_5 = "Jan 1 1900-Feb 2 1901"
example_6 = "Jan. 1, 1900 - Feb. 2, 1901"
example_7 = "Jan 1, 1900 - Feb 2, 1901"
examples = [example_0, example_1, example_2, example_3, example_4, example_5, example_6, example_7]
for example in examples:
    print(C.pattern_date_month_word.findall(example))

['Jan 1, 1900']
['Jan 1, 1900', 'Feb 1, 1900']
[]
['Jan 1 1900', 'Feb 2 1901']
['January 1 1900', 'February 2 1901']
['Jan 1 1900', 'Feb 2 1901']
['Jan. 1, 1900', 'Feb. 2, 1901']
['Jan 1, 1900', 'Feb 2, 1901']
