In [4]:
# Applies a table filter across all tables extracted from a project and calculates
# the net underwriter discount and
# the face value
# for a specific type of table (standard case: two columns with $-denominated key-value pairs).

from __future__ import print_function

import os
import sys
import glob
import codecs
import json

import string

sys.path.insert(0, os.path.pardir)

from backend import *
from data_query import *

# Folder layout, relative to the directory this notebook runs in.
UPLOAD_FOLDER = os.path.join('..', 'static', 'ug')        # <UPLOAD_FOLDER>/<PROJECT>/*.tables.json
FILTER_FOLDER = os.path.join('..', 'static', 'filters')   # <FILTER_FOLDER>/<FILTER>.json
PROJECT = 'muni_bonds_bulk_2'
FILTER = 'funds'

path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')
# glob.glob() yields matches in arbitrary (filesystem) order; sorting makes the
# order of every per-file result list reproducible from run to run.
table_files = sorted(glob.glob(path))

In [None]:
def clean_string(s):
    """Normalize a term so that spelling variants collapse onto one key.

    Non-ASCII characters are dropped, the remainder is lower-cased, all ASCII
    punctuation and digits are removed, and surrounding whitespace is stripped.

    Works on both Python 2 and Python 3: the earlier ``translate(None, ...)``
    form is only valid for Python 2 byte strings and raises ``TypeError`` on
    Python 3.

    :param s: text to normalize
    :return: the cleaned term as a native ``str``
    """
    ascii_text = s.encode('ascii', errors='ignore').decode('ascii').lower()
    unwanted = set(string.punctuation + string.digits)
    kept = ''.join(ch for ch in ascii_text if ch not in unwanted)
    # str() keeps the Python 2 return type a byte string, as before; the text
    # is pure ASCII at this point so the conversion cannot fail. No-op on Python 3.
    return str(kept).strip()
    
from collections import Counter

# Line items sufficient for an IRR estimate, keyed by role
# (query terms refined through TF analysis and annotation).
irr_estimate_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'],
                     'premium' : 'Issue Premium',
                     'discount': 'Issue Discount',
                     'underwriter_discount' : 'Underwriter Discount',
                     'cost_of_issuance' : 'Costs of Issuance'}

# First-column labels that flag a row as suspicious.
funny_values = ['NaN', 'Introduction', '']

# Counters: candidate tables per document, raw first-column terms, cleaned terms.
table_counter, terms_stripped, terms_lc_cleaned = Counter(), Counter(), Counter()

# Running totals and per-file bookkeeping, filled in by the scan loop.
tables_looked_at = 0
confidences, no_result_files = [], []
funny_tables, funny_rows = {}, {}

# Load the table filter definition that the scan applies to every file.
filter_file = os.path.join(FILTER_FOLDER, FILTER+'.json')
with codecs.open(filter_file, "r", "utf-8", errors="replace") as filter_fh:
    _filter = json.load(filter_fh)

print("Processing with filter %s" % str(_filter))

In [None]:
# Scan every table file: apply the filter, pick the best-matching table per
# document and collect the first-column terms of its dollar rows.
for file_idx, table_path in enumerate(table_files):

    # Only the JSON load needs the file handle; release it before processing.
    with codecs.open(table_path, 'r', 'utf-8') as tables_fh:
        tables = json.load(tables_fh)
    tables_looked_at += len(tables)

    # basename() instead of splitting on '/' so other path separators work too.
    filename = os.path.basename(table_path).replace('.tables.json', '')

    # Keep every candidate whose confidence ties or beats all candidates seen
    # so far. Each accepted entry is >= all earlier ones, so the last entry
    # always carries the running maximum (no need to rescan the list).
    filter_results = []
    for candidate in filter_tables(tables.values(), _filter):
        if not filter_results or candidate[0] >= filter_results[-1][0]:
            filter_results.append(candidate)

    table_counter[len(filter_results)] += 1
    if filter_results:
        # Highest confidence wins. Sorting by begin_line first makes max()
        # return the earliest table when several share the top confidence.
        confidence, table, _, _ = max(sorted(filter_results, key=lambda r: r[1]['begin_line']),
                                      key=lambda r: r[0])
        confidences.append(confidence)

        # Anything other than "two captions, second column in dollars" is unusual.
        if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':
            funny_tables[filename] = table['begin_line']

        for row in table['data']:
            # Only rows whose second cell is a dollar value carry usable data.
            if len(row) > 1 and 'subtype' in row[1] and row[1]['subtype'] == 'dollar':
                first_term = row[0]['value'].strip()
                if first_term in funny_values:
                    funny_rows.setdefault(filename, []).append(row)

                # Suspicious labels are still counted, exactly like regular ones.
                terms_stripped[first_term] += 1
                terms_lc_cleaned[clean_string(first_term)] += 1
            else:
                # Probably an interim caption (or a row from the TOC).
                funny_rows.setdefault(filename, []).append(row)
    else:
        no_result_files.append(filename)

    if (file_idx + 1) % 100 == 0:
        print("%i files and %i tables processed... with %i best matches and so far %i/%i unique terms" %
              (file_idx + 1, tables_looked_at, len(confidences), len(terms_stripped), len(terms_lc_cleaned)))

print(table_counter.most_common())
print(terms_lc_cleaned.most_common())

results = {'high_confidence_candidates' : table_counter.most_common(),
           'tables_looked_at' : tables_looked_at,
           'tables_canonical' : len(confidences),
           # None when no file produced a match (avoids ZeroDivisionError);
           # float() guards against integer division on Python 2.
           'confidence_mean' : (float(sum(confidences)) / len(confidences)) if confidences else None,
           'confidences' : confidences,
           'unique_raw_terms' : len(terms_stripped),
           'unique_cleaned_terms' : len(terms_lc_cleaned),
           'raw_term_freq' : terms_stripped,
           'clean_term_freq' : terms_lc_cleaned,
           'no_table_files' : no_result_files,
           'funny_tables' : funny_tables,
           'funny_rows' : funny_rows
          }

In [None]:
# Persist the aggregated results as JSON so the reporting cells can be re-run
# without scanning the table files again.
with codecs.open("IRR_estimate.results.json", "w", "utf-8") as results_out:
    results_out.write(json.dumps(results))

In [None]:
# Reload the previously saved results; from here on `results` holds plain JSON
# types (Counters come back as ordinary dicts).
with codecs.open("IRR_estimate.results.json", "r", "utf-8") as results_in:
    results = json.loads(results_in.read())

In [None]:
# Sanity check: each scanned file adds one entry to exactly one of these two
# lists, so the total should equal the number of table files.
sum(len(results[key]) for key in ("no_table_files", "confidences"))

In [None]:
import xlwt

# Cell style used for header rows.
bold = xlwt.Style.easyxf("font: bold on")

def write_table(sheet, keys, values, row, c_offset = 0, column_style = bold):
    """Write a header row followed by data rows into a worksheet.

    :param sheet: worksheet object exposing ``write(row, col, value[, style])``
    :param keys: header labels, written into ``row`` using ``column_style``
    :param values: iterable of rows, each an iterable of cell values
    :param row: index of the row that receives the header
    :param c_offset: index of the first column to write into
    :param column_style: style applied to the header cells
    :return: index of the first row after the written table
    """
    for col, header in enumerate(keys, c_offset):
        sheet.write(row, col, header, column_style)

    next_row = row + 1
    for record in values:
        for col, cell in enumerate(record, c_offset):
            sheet.write(next_row, col, cell)
        next_row += 1
    return next_row

In [None]:
# Prefix for the per-file links written into the report sheets; a file name
# (optionally followed by '#<table id>') is appended to it.
# NOTE(review): host and port are hard-coded — confirm this deployment is still current.
url_prefix = "http://tabularazr.eastus.cloudapp.azure.com:7081/show/"+PROJECT+'/'

In [None]:
# One workbook with one sheet per report section; sheets are added in this order.
wkb = xlwt.Workbook(encoding='utf-8')
s_summary, s_raw_tf, s_clean_tf, s_confidence, s_no_table, s_funny_tables, s_funny_rows = map(
    wkb.add_sheet,
    ['summary', 'raw_TF', 'clean_TF', 'confidence', 'no_table', 'funny_tables', 'funny_rows'])

In [None]:
# --- 'summary' sheet: filter used, candidate distribution, totals ---
i = 0
s_summary.write(i, 0, 'Filter used', bold)
s_summary.write(i, 1, str(_filter))
i += 2
s_summary.write(i, 0, 'Distribution of good table matches per document', bold)
i += 1
i = write_table(s_summary, ['Nr. of Table Candidates', 'Nr. of Documents'],
                results["high_confidence_candidates"], i)

i += 1
s_summary.write(i, 2, 'Total nr. of Table Candidates')
s_summary.write(i, 3, 'out of..')
i += 1
s_summary.write(i, 2, results['tables_canonical'])
s_summary.write(i, 3, results['tables_looked_at'])

# --- 'confidence' sheet: one confidence value per row ---
write_table(s_confidence, ['Confidence in best Table found'],
            ([c] for c in results['confidences']), 0)

# --- 'no_table' sheet: file name plus link ---
# The file name is written as a plain string; it used to be wrapped in a list,
# which is not a regular cell value for sheet.write().
write_table(s_no_table, ['Files with no suitable table found', 'URL'],
            ((c, url_prefix + c) for c in results['no_table_files']), 0)

# --- term-frequency sheets, most frequent term first ---
write_table(s_raw_tf, ['Term (raw)', 'Frequency'],
            Counter(results['raw_term_freq']).most_common(), 0)
write_table(s_clean_tf, ['Term (cleaned)', 'Frequency'],
            Counter(results['clean_term_freq']).most_common(), 0)

# --- 'funny_tables' sheet: tables that deviate from the standard two-column $ layout ---
# The note describes the check that flagged them: caption (column) count != 2,
# or a second column that is not a dollar column.
s_funny_tables.write(0, 4, "[as returned by filter but with <> 2 columns, and/or no $ value in the 2nd column]")
# .items() works on Python 2 and 3; sorting by file name keeps the sheet stable across runs.
write_table(s_funny_tables, ['Funny Tables in File', 'Table ID', 'URL'],
            ((f, t, url_prefix + f + '#' + str(t)) for f, t in sorted(results['funny_tables'].items())), 0)

# NOTE(review): s_funny_rows is created but nothing is written to it in this
# cell — confirm whether the 'funny_rows' sheet is meant to stay empty.

In [None]:
# Write the workbook to 'fund_filter_results.xls' (path is relative to the working directory).
wkb.save('fund_filter_results.xls')

In [None]:
# List each flagged file with the begin_line of its unusual table.
# .items() instead of .iteritems() so the cell runs on Python 2 and Python 3.
for f, t in results['funny_tables'].items():
    print (f, t)

In [None]:
# Display the raw mapping of file name -> begin_line of the flagged table.
results['funny_tables']