In [1]:
#2018 HMDA Edit Testing File Generator
from collections import OrderedDict
from io import StringIO
import json
import os
import pandas as pd
import random

#custom imports
import lar_constraints
import lar_generator
from rules_engine import rules_engine
!pwd

/Users/roellk/Desktop/HMDA/hmda-test-files/python


In [2]:
#2018 Filing Instruction Guide: https://www.consumerfinance.gov/data-research/hmda/static/for-filers/2018/2018-HMDA-FIG.pdf

#columns kept from the tract to CBSA file (a subset of cbsa_cols)
use_cols = [
    'name',
    'metDivName',
    'countyFips',
    'geoIdMsa',
    'metDivFp',
    'smallCounty',
    'tracts',
]
#names given to the file's columns, in file order (the file is read with header=None)
cbsa_cols = [
    'name', 'metDivName', 'state', 'countyFips', 'county', 'tracts',
    'geoIdMsa', 'metDivFp', 'smallCounty', 'stateCode', 'tractDecimal',
]
#load tract to CBSA data from platform file; every kept column is read as a string
cbsas = pd.read_csv(
    '../dependancies/tract_to_cbsa_2015.txt',
    delimiter='|',
    header=None,
    names=cbsa_cols,
    usecols=use_cols,
    dtype=str,
)
cbsas["tractFips"] = cbsas["countyFips"] + cbsas["tracts"] #county FIPS string followed by the tract string
counties = list(cbsas["countyFips"])
tracts = list(cbsas["tractFips"])

In [3]:
#load schemas for LAR and transmittal sheet
#each schema file is opened in a with-block so its handle is closed once the JSON has been parsed
with open("../schemas/lar_schema.json", "r") as lar_schema_file:
    lar_schema_df = pd.DataFrame(json.load(lar_schema_file))
with open("../schemas/ts_schema.json", "r") as ts_schema_file:
    ts_schema_df = pd.DataFrame(json.load(ts_schema_file))

In [4]:
#instantiate generator
lar_gen = lar_generator.lar_gen(
    lar_schema_df,
    ts_schema_df,
    counties=counties,
    tracts=tracts,
)
#instantiate constraints
lar_const = lar_constraints.lar_constraints(
    counties=counties,
    tracts=tracts,
)


In [8]:
#run constraint functions on row
file_length = 5 #set number of rows in test file
first = True #flag: no LAR row has been added to the output dataframe yet
lei = None #LEI shared by the generated rows; stays None until a row has been generated

def get_const_list():
    """Return the names of the constraint functions found on lar_const.

    An attribute name qualifies when it starts with "s" or "v" and the slice
    name[1:4] consists only of digits (for example "v610_const"). Names come
    back in the order produced by dir().
    """
    return [
        name
        for name in dir(lar_const)
        if name[:1] in ("s", "v") and name[1:4].isdigit()
    ]
            
def constraints_loop(constraints=None, row=None, row_base=None):
    """Apply each named constraint to a LAR row, in order, and return the row.

    Parameters:
        constraints: iterable of constraint function names (such as the list
            returned by get_const_list). None or an empty iterable leaves the
            row untouched.
        row: the LAR row to constrain; it is handed to apply_constraint once
            per name and replaced by that call's return value.
        row_base: accepted so existing callers keep working; it is not used.
            The original computed a per-constraint diff against it and threw
            the result away, which also made the None default crash.

    Returns:
        The row returned by the last constraint applied, or `row` itself when
        there are no constraints.
    """
    if constraints is None: #None replaces the shared mutable [] default
        constraints = ()
    for const in constraints:
        row = apply_constraint(row, const)
    return row

def apply_constraint(row, func):
    """Run one constraint function on a LAR row and report what it changed.

    `func` is the name of a method on lar_const. That method is called with the
    row and its return value becomes the new row. When the constraint produced
    any new (field, value) pairs, both sides of the diff are printed; the
    constraint name is printed afterwards in every case.

    Returns the row as returned by the constraint.
    """
    before = row.copy() #snapshot taken before the constraint runs, used for the diff
    constraint = getattr(lar_const, func)
    row = constraint(row) #apply constraint to row
    added, removed = get_diff(row, before)
    if added:
        print(added, "\n\n", removed)
    print(func)
    return row

def get_diff(row, row_base):
    """Compare a row with its baseline as sets of (field, value) pairs.

    Returns a 2-tuple of sets: the pairs present in `row` but not in
    `row_base`, then the pairs present in `row_base` but not in `row`.
    """
    baseline_pairs = set(row_base.items())
    current_pairs = set(row.items())
    added = current_pairs.difference(baseline_pairs) #what the constraint funcs introduced
    removed = baseline_pairs.difference(current_pairs) #what they replaced
    return added, removed

def validation(row):
    """Run every edit check of the rules engine against a single LAR row.

    A rules_engine is instantiated with the notebook's LAR and TS schemas, the
    tract and county lists, and `row` as its data_row. Every attribute of the
    engine whose name starts with "s" or "v" and whose slice name[1:4] is all
    digits is then called with no arguments.

    Parameters:
        row: the LAR row handed to the engine as data_row.

    Returns:
        The engine's `results` attribute after the edits have been called.
    """
    val = rules_engine(lar_schema=lar_schema_df, ts_schema=ts_schema_df, tracts=tracts,
                       counties=counties, data_row=row) #instantiate edits rules engine
    for func in dir(val):
        if func[:1] in ("s", "v") and func[1:4].isdigit():
            getattr(val, func)()
    return val.results



In [10]:
#Generate file_length LAR rows. Each row is run through every constraint function
#repeatedly until a full pass adds no new (field, value) pairs to it.
constraint_names = get_const_list() #the constraint list is the same for every row, so build it once
lar_rows = [] #finished rows; the dataframe is built once after the loop instead of concatenated per row
iters = 0 #defined before the loop so the final print also works when file_length is 0

for i in range(0, file_length): #loop over file length
    print("making new row\n\n")
    if lei:
        row = lar_gen.make_row(lei=lei) #reuse the LEI. The same LEI must be used for each row
    else:
        row = lar_gen.make_row() #create new row
    lei = row["lei"] #keep this row's LEI for the rows that follow
    iters = 1 #start iteration count for checking diff time
    stop = False

    while not stop:
        row_base = row.copy() #copy row to enable diff
        print("\nstarting constraints iteration {iter}".format(iter=iters))
        row = constraints_loop(constraint_names, row, row_base)
        diff_1, diff_2 = get_diff(row, row_base) #get difference between row and base row

        if len(diff_1) == 0:
            #TODO: optionally run validation(row) here and keep looping while any edit reports "failed"
            stop = True
        iters+=1

    lar_rows.append(row.copy()) #store a copy so later changes to row cannot alter a stored record
    print("finished row\n")

if lar_rows:
    first = False #kept in sync for any cell that still reads this flag
#Building the frame from the collected rows makes this cell safe to re-run: it no longer
#appends to a lar_frame left over from a previous execution. Column order is taken from
#the first generated row and the index runs from 0.
lar_frame = pd.DataFrame(lar_rows, columns=list(lar_rows[0]) if lar_rows else None)
print(iters)

making new row



starting constraints iteration 1
v610_const
v612_const
{('preapproval', '1')} 

 {('preapproval', '2')}
v613_const
{('preapproval', '2')} 

 {('preapproval', '1')}
v614_const
v615_const
v619_const
v622_const
v627_const
{('app_eth_5', '13'), ('app_eth_4', '1'), ('app_eth_3', '11')} 

 {('app_eth_3', ''), ('app_eth_4', ''), ('app_eth_5', '1')}
v628_const
{('app_eth_5', ''), ('app_eth_3', ''), ('app_eth_4', ''), ('app_eth_2', '')} 

 {('app_eth_5', '13'), ('app_eth_4', '1'), ('app_eth_2', '14'), ('app_eth_3', '11')}
v629_const
v630_const
{('co_app_eth_5', '13'), ('co_app_eth_3', '2'), ('co_app_eth_2', '11')} 

 {('co_app_eth_2', ''), ('co_app_eth_5', '11'), ('co_app_eth_3', '')}
v631_const
v632_const
{('co_app_eth_1', '4')} 

 {('co_app_eth_1', '14')}
v633_const
v634_const
{('app_race_3', '41'), ('app_race_5', '44'), ('app_race_2', '24'), ('app_race_4', '25')} 

 {('app_race_5', ''), ('app_race_2', '2'), ('app_race_4', '41'), ('app_race_3', '')}
v635_const
v636_const
{('

v644_const
v645_const
v647_const
v648_const
v649_const
v650_const
v651_const
v652_const
{('income', 'NA')} 

 {('income', '19142')}
v654_const
v655_const
{('purchaser_type', '0')} 

 {('purchaser_type', '6')}
v656_const
{('rate_spread', 'NA')} 

 {('rate_spread', '24.240000000000002')}
v657_const
v658_const
v661_const
{('app_score_code_8', '')} 

 {('app_score_code_8', '6E0TQBQJ0YKOJQLB686SAH53ILFYU2UHQ8F0CYPM5M18S')}
v662_const
v663_const
v664_const
v666_const
{('co_app_score_code_8', '')} 

 {('co_app_score_code_8', '7HQ052K9FWU2GWFAW7KTCRCWYE2RIZ40KFP6FYCKM3TSPCBWPYHQ0CDVFZWA3PLG9DCGLQVLNII2K3H1XYEG0BK65SD')}
v667_const
v668_const
{('denial_4', '6'), ('denial_3', '5')} 

 {('denial_3', '4'), ('denial_4', '')}
v669_const
v670_const
{('denial_code_9', '')} 

 {('denial_code_9', 'JKWQKWMQ93EB4NFDM0PPJFR94GTU7PASWWMG3Y40EE6JJ5DBA0JFZO3YUFERNRZFPLDGB350RQ05SZIJ09THCFPLWQAJ5CBG2CO5A4A3G7PK')}
v671_const
{('loan_costs', 'NA')} 

 {('loan_costs', '645')}
v672_const
{('points_fees', 'NA')} 


In [11]:
#Quality and Macro field interrelationship constraints:

In [12]:
#Create a sample TS row
#Note: this will need to be more robust to include other federal agencies
# mlo_id needs NA option
#set dummy values for TS row
ts_row_small = OrderedDict() #insertion order is the order in which the values are joined into the TS line
ts_row_small["record_id"]="1"
ts_row_small["inst_name"]="Ficus Bank"
ts_row_small["calendar_year"]=str(2018)
ts_row_small["calendar_quarter"]="4"
ts_row_small["contact_name"]="Mr. Smug Pockets"
ts_row_small["contact_tel"]="555-555-5555"
ts_row_small["contact_email"]="pockets@ficus.com"
ts_row_small["contact_street_address"]="1234 Ficus Lane"
ts_row_small["office_city"]="Ficusville"
ts_row_small["office_state"]="UT"
ts_row_small["office_zip"]="84096"
ts_row_small["federal_agency"]="9"
ts_row_small["lar_entries"]= str(len(lar_frame)) #number of LAR rows in the generated frame, as a string
#the TS LEI is copied from the first LAR row (index label 0);
#.at replaces DataFrame.get_value, which was removed in pandas 1.0
ts_row_small["lei"]=lar_frame.at[0, "lei"]

In [13]:
#join LAR and TS rows to make an output file
def write_file(ts_input=None, lar_input=None, directory="../edits_files/", name="passes_all.txt"):
    """Write a complete test file: one TS line followed by the LAR rows.

    The LAR dataframe is first written pipe-delimited (no header, no index) to
    <directory>/file_parts/lar_data.txt. That file is then read back line by
    line and appended after the TS line in <directory>/<name>.

    Parameters:
        ts_input: mapping of TS field name to value. The values are written in
            iteration order, joined with "|"; non-string values are converted
            with str().
        lar_input: pandas DataFrame holding the LAR rows.
        directory: output directory, created when missing. A trailing slash is
            optional because paths are built with os.path.join.
        name: file name of the combined output file.
    """
    #make directories for files if they do not exist
    parts_dir = os.path.join(directory, "file_parts")
    os.makedirs(parts_dir, exist_ok=True) #also creates `directory` itself; no exists/makedirs race

    #write LAR dataframe to file
    lar_path = os.path.join(parts_dir, "lar_data.txt")
    lar_input.to_csv(lar_path, sep="|", header=False, index=False)

    #load LAR data as file rows
    with open(lar_path, 'r') as lar_data:
        lar = lar_data.readlines()

    with open(os.path.join(directory, name), 'w') as final_file:
        final_file.write("|".join(str(value) for value in ts_input.values()) + "\n")
        final_file.writelines(lar) #lines keep their own newlines from readlines()

In [14]:
#modify file for testing


In [25]:
#write sample file to disk
write_file( #writes created file to disk
    ts_input=ts_row_small,
    lar_input=lar_frame,
    name="test.txt",
)
#validator engine uses the default: path="../edits_files/", data_file="passes_all.txt" for data files
#NOTE(review): the file written above is named "test.txt" while the engine is left on its
#defaults -- confirm the engine is reading the file that was just generated
validator = rules_engine( #instantiate edits rules engine
    lar_schema=lar_schema_df,
    ts_schema=ts_schema_df,
    tracts=tracts,
    counties=counties,
)

In [27]:
#split TS and LAR using validator function; the two return values are kept here as ts_df and lar_df
#validator creates class objects of each of these internally as well (later cells read validator.lar_df)
ts_df, lar_df = validator.split_ts_row()


In [28]:
#call every edit on the validator: attributes named "s" or "v" followed by digits
runnable_edits = filter(
    lambda attr: attr[:1] in ("s", "v") and attr[1:4].isdigit(),
    dir(validator),
)
for func in runnable_edits:
    getattr(validator, func)()


In [30]:
#display the validator's results list (per the output: one dict per edit with edit_name, fields, row_type and status)
validator.results

[{'edit_name': 's300_1',
  'fields': 'record_id',
  'row_type': 'TS',
  'status': 'passed'},
 {'edit_name': 's300_2',
  'fields': 'record_id',
  'row_type': 'LAR',
  'status': 'passed'},
 {'edit_name': 's301', 'fields': 'LEI', 'row_type': 'TS', 'status': 'passed'},
 {'edit_name': 's302',
  'fields': 'calendar_year',
  'row_type': 'TS',
  'status': 'passed'},
 {'edit_name': 's304',
  'fields': '',
  'row_type': 'TS/LAR',
  'status': {'lar_entries': 'passed'}},
 {'edit_name': 's305', 'fields': 'all', 'row_type': 'LAR', 'status': 'passed'},
 {'edit_name': 'v600', 'fields': 'LEI', 'row_type': 'LAR', 'status': 'passed'},
 {'edit_name': 'v601_1',
  'fields': 'inst_name',
  'row_type': 'TS',
  'status': 'passed'},
 {'edit_name': 'v601_2',
  'fields': 'contact_name',
  'row_type': 'TS',
  'status': 'passed'},
 {'edit_name': 'v601_3',
  'fields': 'contact_email',
  'row_type': 'TS',
  'status': 'passed'},
 {'edit_name': 'v601_4',
  'fields': 'contact_street_address',
  'row_type': 'TS',
  'stat

In [31]:
#look up selected fields for specific ULIs; fill the list with the ULIs to inspect
ulis = []
validator.lar_df.loc[
    validator.lar_df["uli"].isin(ulis),
    [
        "loan_purpose", "preapproval", "action_taken",
        "app_race_1", "app_race_2", "app_race_3", "app_race_4", "app_race_5",
        "uli",
    ],
]

Unnamed: 0,loan_purpose,preapproval,action_taken,app_race_1,app_race_2,app_race_3,app_race_4,app_race_5,uli


In [32]:
#load the validator's results list into a dataframe so it can be filtered by column
results_df = pd.DataFrame(validator.results)

In [33]:
#show the edits that failed.
#status is usually a string, but the results output shows s304 reporting a dict
#({'lar_entries': 'passed'}), which a plain =="failed" comparison can never match,
#so the values of a dict status are checked as well.
#NOTE(review): assumes a failing dict status carries the value "failed" -- confirm against rules_engine
failed_mask = results_df.status.apply(
    lambda status: "failed" in status.values() if isinstance(status, dict) else status == "failed"
).astype(bool) #explicit bool dtype so an empty result is still treated as a row mask
results_df[failed_mask]

Unnamed: 0,edit_name,fields,row_type,status


In [34]:
#spot-check a few fields of the validator's LAR rows
validator.lar_df[["action_taken", "preapproval", "interest_rate","mlo_id","uli"]]

Unnamed: 0,action_taken,preapproval,interest_rate,mlo_id,uli
0,6,2,11.11,82,DSO6IMAIL15CFVR0PZ81DL37NWYAZ27AFTZ3P3RW0H768
1,4,2,,QESO0I624MZEND2LC1,DSO6IMAIL15CFVR0PZ81DNY5T3IT2S4WCGSUFSINFU742
2,6,2,,NV,DSO6IMAIL15CFVR0PZ811BRSOMZGDLB7KQE6JN3MG9248
3,4,2,,J,DSO6IMAIL15CFVR0PZ81DOWGJSNJE1A552ZW907NG1F34
4,6,2,,PABCJ1URU,DSO6IMAIL15CFVR0PZ81CHNB0FO38IMNN2CAUAFKO9L61


In [35]:
#scratch check of the set-difference logic: force one known mismatch and confirm it is reported
set_row = set(row.items()) #(field, value) pairs of row, the variable left behind by the generation loop
row_base["mlo_id"] = "17" #NOTE: mutates row_base in place, so it no longer matches the loop's last snapshot
set_base = set(row_base.items()) #taken after the mutation above, so it contains ('mlo_id', '17')
diff = (set_row - set_base) #pairs only in row
diff2 = (set_base - set_row) #pairs only in the modified row_base
diff, diff2

({('mlo_id', 'PABCJ1URU')}, {('mlo_id', '17')})