In [None]:
#TabulaRazr - specific to calculate  - TABLE Parser
#Infers a table with an arbitrary number of columns from recurring patterns in text lines
#(c) Alexander Hirner 2016, no redistribution without permission
#Contributions: ____ (refactoring), UI styling (), ....


#Main assumptions for table identification:
#1) each row is either in one line or not a row at all
#2) each column features at least one number (=dollar amount)
#2a) each column features at least one date-like string [for time-series only]
#3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] 
#4) each column is separated by more than x consecutive whitespace indicators (e.g. '  ' or '..')

#Feature List Todo:
#1) Acknowledge footnotes / make lower meta-data available
#2) make delimiter length smartly dependent on number of columns (possible iterative approach)
#3) improve captioning: expand non canonical values in tables [DONE] .. but not to the extent how types match up  --> use this to further
## delineate between caption and headers
#4) UI: parameterize extraction on the show page on the fly
#5) deeper type inference on token level: type complex [DONE], subtype header (centered, capitalized), 
## subtype page nr., type free flow [DONE, need paragraph]
#5a) re
#6) Respect negative values with potential '-' for numerical values
#7)
#8) classify tables with keywords (Muni Bonds) and unsupervised clustering (Hackathon)
#9) Restructure folder and URI around MD5 hash (http://stackoverflow.com/questions/24570066/calculate-md5-from-werkzeug-datastructures-filestorage-without-saving-the-object)
#10) proper logging
#11) include tesseract for OCR capabilities (quickstart guide: http://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/)

In [4]:
from __future__ import print_function

import os
import glob
import codecs
import json

import string

from backend import *
from data_query import *

# Root folder of uploaded documents and the project sub-folder to analyse.
UPLOAD_FOLDER = './static/ug'
PROJECT = 'muni_bonds_bulk'

# Glob pattern matching every extracted-tables file of the project.
path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# that any run which only looks at the first N files is reproducible.
table_files = sorted(glob.glob(path))

In [14]:
def clean_string(s):
    """Normalise a term for frequency counting.

    Drops every non-ASCII character, lower-cases the rest, removes all ASCII
    punctuation and digits, and strips leading/trailing whitespace. Interior
    whitespace is left untouched.

    Works on both Python 2 and Python 3 (the previous implementation relied on
    the Python 2-only ``str.translate(None, deletechars)`` form).

    :param s: text (unicode) or byte string to normalise
    :return: the cleaned term as a native ``str``
    """
    if isinstance(s, bytes):
        text = s.decode('ascii', 'ignore')
    else:
        text = s.encode('ascii', 'ignore').decode('ascii')
    unwanted = string.punctuation + string.digits
    cleaned = ''.join(ch for ch in text.lower() if ch not in unwanted).strip()
    # Only ASCII is left at this point, so str() is lossless on Python 2 and a
    # no-op on Python 3.
    return str(cleaned)
    
from collections import Counter

# Tallies that the table-scanning loop fills in.
table_counter, terms_stripped, terms_lc_cleaned = Counter(), Counter(), Counter()
tables_looked_at = 0

# Per-file bookkeeping collected alongside the tallies.
confidences, no_result_files = [], []
funny_tables, funny_rows = {}, {}

# First-column values that flag a row as suspicious.
funny_values = ['NaN', 'Introduction', '']

# Load the filter definition that selects the tables of interest.
filter_file = os.path.join('static', 'filters', 'funds.json')
with codecs.open(filter_file, "r", "utf-8", errors="replace") as filter_fh:
    _filter = json.loads(filter_fh.read())

print ("Processing with filter %s" % str(_filter))

Processing with filter {u'headers': {u'threshold': 0.35, u'terms': [u'USES OF FUNDS']}, u'name': u'Estimated use and sources of funds'}


In [39]:



#Get all tables
for i, f in enumerate(table_files):

    # Read the parsed tables, then release the file handle before processing.
    with codecs.open(f, 'r', 'utf-8') as table_fh:
        tables = json.load(table_fh)
    tables_looked_at += len(tables)

    # basename copes with whichever path separator glob produced on this OS.
    filename = os.path.basename(f).replace('.tables.json', '')

    # Keep every candidate whose confidence ties or beats all earlier kept ones.
    # Each kept entry is >= its predecessors, so the last one is the running max.
    filter_results = []
    for t in filter_tables(tables.values(), _filter):
        if not filter_results or t[0] >= filter_results[-1][0]:
            filter_results.append(t)

    table_counter[len(filter_results)] += 1
    if filter_results:
        # Highest confidence wins. Sorting by begin_line first makes max()
        # return the earliest table among equally confident candidates.
        best = max(sorted(filter_results, key=lambda t: t[1]['begin_line']),
                   key=lambda t: t[0])
        confidence, table = best[0], best[1]
        confidences.append(confidence)

        # Flag tables that are not the plain two-column "caption / dollar" shape.
        if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':
            funny_tables[filename] = table['begin_line']

        for row in table['data']:
            # Only rows whose second cell is a dollar amount contribute terms.
            if len(row) > 1 and 'subtype' in row[1] and row[1]['subtype'] == 'dollar':
                first_term = row[0]['value'].strip()
                if first_term in funny_values:
                    funny_rows.setdefault(filename, []).append(row)

                terms_stripped[first_term] += 1
                terms_lc_cleaned[clean_string(first_term)] += 1

            # It's probably an interim caption (or from the TOC!)
            else:
                funny_rows.setdefault(filename, []).append(row)
    else:
        no_result_files.append(filename)

    if (i + 1) % 100 == 0:
        print ("%i files and %i tables processed... with %i best matches and so far %i/%i unique terms" % \
               (i+1, tables_looked_at, len(confidences), len(terms_stripped), len(terms_lc_cleaned)))

    # Development cap: stop once the file at index 201 has been processed
    # (at most 202 files). Remove this line for a full run.
    if i > 200: break


print(table_counter.most_common())
print(terms_lc_cleaned.most_common())
#print(no_result_files)
#print(funny_tables)
#print(funny_rows)

# Everything needed for the report, in a JSON-serialisable dict.
results = {'high_confidence_candidates' : table_counter.most_common(),
           'tables_looked_at' : tables_looked_at,
           'tables_canonical' : len(confidences),
           # None (JSON null) instead of a ZeroDivisionError when nothing matched;
           # float() keeps the division exact under Python 2 as well.
           'confidence_mean' : (sum(confidences) / float(len(confidences))) if confidences else None,
           'confidences' : confidences,
           'unique_raw_terms' : len(terms_stripped),
           # Count of distinct *cleaned* terms (previously reported the raw count).
           'unique_cleaned_terms' : len(terms_lc_cleaned),
           'raw_term_freq' : terms_stripped,
           'clean_term_freq' : terms_lc_cleaned,
           'no_table_files' : no_result_files,
           'funny_tables' : funny_tables,
           'funny_rows' : funny_rows
          }

Processing with filter {u'headers': {u'threshold': 0.35, u'terms': [u'USES OF FUNDS']}, u'name': u'Estimated use and sources of funds'}
100 files and 5689 tables processed... with 74 best matches and so far 84/69 unique terms
200 files and 11545 tables processed... with 147 best matches and so far 131/105 unique terms
[(1, 119), (0, 53), (2, 18), (3, 7), (5, 3), (4, 2)]
[('total sources', 65), ('total uses', 62), ('total sources of funds', 56), ('total uses of funds', 55), ('deposit to escrow fund', 29), ('principal amount of bonds', 27), ('total', 23), ('par amount of the bonds', 16), ('par amount of bonds', 14), ('par amount', 11), ('deposit to construction fund', 11), ('principal amount of the bonds', 9), ('deposit to project fund', 9), ('costs of issuance', 8), ('', 7), ('escrow fund', 7), ('principal amount of refunding bonds', 7), ('principal amount', 7), ('deposit to building fund', 6), ('building fund', 6), ('principal amount of certificates', 6), ('principal amount of series  

In [43]:
# Persist the collected statistics so later cells can run without re-scanning.
with codecs.open("funds_stats.results.json", "w", "utf-8") as results_fh:
    results_fh.write(json.dumps(results))

In [2]:
####

In [7]:
# Reload the statistics written by the scan (replaces any in-memory `results`).
with codecs.open("funds_stats.results.json", "r", "utf-8") as results_fh:
    results = json.loads(results_fh.read())

In [10]:
# Sanity check on the reloaded results: number of files without any matching
# table plus number of files that yielded a best-match confidence.
len(results["no_table_files"]) + len(results["confidences"])

6777

In [24]:
import xlwt

# Cell style used for header cells in the exported workbook.
bold = xlwt.Style.easyxf("font: bold on")

def write_table(sheet, keys, values, row, c_offset = 0, column_style = bold):
    """Write a header row followed by data rows onto a worksheet.

    :param sheet: object exposing ``write(row, col, value[, style])``
    :param keys: header labels, written on ``row`` using ``column_style``
    :param values: iterable of rows; each row is an iterable of cell values
    :param row: index of the header row; data starts on the row below it
    :param c_offset: index of the first column to write to
    :param column_style: style passed along with each header cell
    :return: index of the first row after the last one written
    """
    for col, header in enumerate(keys, c_offset):
        sheet.write(row, col, header, column_style)

    next_row = row + 1
    for record in values:
        for col, cell in enumerate(record, c_offset):
            sheet.write(next_row, col, cell)
        next_row += 1
    return next_row

In [25]:
# One workbook with a dedicated sheet per result category; sheets are created
# in the order listed and bound to their variables by position.
wkb = xlwt.Workbook(encoding='utf-8')
(s_summary, s_raw_tf, s_clean_tf, s_confidence,
 s_no_table, s_funny_tables, s_funny_rows) = map(
    wkb.add_sheet,
    ['summary', 'raw_TF', 'clean_TF', 'confidence', 'no_table', 'funny_tables', 'funny_rows'])

In [26]:
# --- 'summary' sheet -------------------------------------------------------
# First row: the filter definition that produced these results.
i = 0
s_summary.write(i, 0, 'Filter used', bold)
s_summary.write(i, 1, str(_filter))

# One blank row, then a titled table; write_table returns the next free row.
i += 2
s_summary.write(i, 0, 'Distribution of good table matches per document', bold)
i = write_table(
    s_summary,
    ['Nr. of Table Candidates', 'Nr. of Documents'],
    results["high_confidence_candidates"],
    i + 1)

# Another blank row, then a two-line totals block in columns 2 and 3:
# labels on the first line, the matching numbers directly underneath.
i += 1
s_summary.write(i, 2, 'Total nr. of Table Candidates')
s_summary.write(i, 3, 'out of..')
i += 1
s_summary.write(i, 2, results['tables_canonical'])
s_summary.write(i, 3, results['tables_looked_at'])

# --- 'confidence' sheet ----------------------------------------------------
# Single column: one confidence value per row, starting under the header.
i = write_table(
    s_confidence,
    ['Confidence in best Table found'],
    ([score] for score in results['confidences']),
    0)


In [27]:
wkb.save('fund_filter_results.xls')

In [None]:
from fuzzywuzzy import fuzz
#print (fuzz.partial_ratio("this is a test!", "this text is"))
#print (fuzz.partial_ratio("USES OF FUNDS", "USES from FUNDS"))
#print (fuzzy_str_match("aaaaaaaaaa text this", "this test is"))

# Print the fuzzy_str_match result of one reference header against a series of
# candidate strings, one result per line and in this order.
for candidate in (
        "USES OF FUNDS",
        "USES of FUNDS",
        "uses of funds",
        "USES which include other FUNDS",
        "Note 5 - Assets Limited as to Use",
        "FINANCIAL STATEMENT AMOUNTS - Continued",
        "FUNDS OF USES",
        "TABLE OF CONTENTS"):
    print (fuzzy_str_match("USES OF FUNDS", candidate))

# A short query that appears verbatim inside a longer caption.
print (fuzzy_str_match("Debt Service", "Total Periodic Debt Service"))


In [None]:
# Probe fuzzy_str_match with very short / whitespace-padded candidates.
#print (fuzz.partial_ratio("Maturity", "i"))
for candidate in ("ii", "Matarit", "i", "     i   "):
    print (fuzzy_str_match("Maturity", candidate))

# Plain substring test for comparison.
print ("Maturity".lower() in "i".lower())