In [2]:
#2018 HMDA Edit Testing File Generator
from collections import OrderedDict
from io import StringIO
import json
import os
import pandas as pd
import random

#custom imports
import lar_constraints
import lar_generator
from rules_engine import rules_engine


In [3]:
#2018 Filing Instruction Guide: https://www.consumerfinance.gov/data-research/hmda/static/for-filers/2018/2018-HMDA-FIG.pdf

use_cols = ['name', 'metDivName', 'countyFips', 'geoIdMsa', 'metDivFp', 'smallCounty', 'tracts']
cbsa_cols = ['name', 'metDivName', 'state', 'countyFips', 'county', 'tracts','geoIdMsa', 'metDivFp', 'smallCounty', 
             'stateCode', 'tractDecimal']
cbsas = pd.read_csv('../dependancies/tract_to_cbsa_2015.txt', usecols=use_cols, delimiter='|', 
                    header=None, names=cbsa_cols, dtype=str) #load tract to CBSA data from platform file
cbsas["tractFips"] = cbsas.countyFips + cbsas.tracts
counties = list(cbsas.countyFips)
tracts = list(cbsas.tractFips)



In [4]:
#load schemas for LAR and transmittal sheet
lar_schema_df = pd.DataFrame(json.load(open("../schemas/lar_schema.json", "r")))
ts_schema_df = pd.DataFrame(json.load(open("../schemas/ts_schema.json", "r")))


In [5]:
lar_gen = lar_generator.lar_gen(lar_schema_df, ts_schema_df, counties=counties, tracts=tracts) #instantiate generator
lar_const = lar_constraints.lar_constraints(counties=counties, tracts=tracts)#instantiate constraints
validator = rules_engine(lar_schema_df, ts_schema_df) #instantiate edits rules engine

In [6]:


#test = lar_generator.lar_gen(lar_schema_df, ts_schema_df)
row = lar_gen.make_row() #create test row

#run constraint functions on row

file_length = 10 #set number of rows in test file

first = True
for i in range(file_length):
    #print("creating row number: {num}".format(num=i))
    row = lar_gen.make_row() #create new row
    diff = [1] #initialize diff for loop
    while len(diff) >0:
        row_base = row.copy() #copy row to enable diff
        #apply constraint functions to LAR row
        for func in lar_const.constraint_funcs: 
            row = getattr(lar_const, func)(row)
            #convert initial and copied rows to sets for diff
            initial_row = set(row_base.items())
            changed_row = set(row.items())
            diff = (initial_row - changed_row)
#            print(len(initial_row), len(changed_row)) #check number of fields, should be 110
            #show readout of number of changes made to new LAR row
            #print("changes from {func}: {vals}".format(func=str(func), vals=len(diff)))
    
    #create first row of dataframe
    if first:
        lar_frame = pd.DataFrame(row, index=[1])
        first = False
    #add additional rows to dataframe
    else:
        #print("concating")
        new_lar = pd.DataFrame(row, index=[1])
        lar_frame = pd.concat([lar_frame, new_lar], axis=0)




In [7]:
#Quality and Macro field interrelationship constraints:

In [8]:
#Create a sample TS row
#Note: this will need to be more robust to include other federal agencies
# mlo_id needs NA option
#set dummy values for TS row
ts_row_small = OrderedDict()
ts_row_small["record_id"]="1"
ts_row_small["inst_name"]="Ficus Bank"
ts_row_small["calendar_year"]=str(2018)
ts_row_small["calendar_quarter"]="4"
ts_row_small["contact_name"]="Mr. Smug Pockets"
ts_row_small["contact_tel"]="555-555-5555"
ts_row_small["contact_email"]="pockets@ficus.com"
ts_row_small["contact_street_address"]="1234 Ficus Lane"
ts_row_small["office_city"]="Ficusville"
ts_row_small["office_state"]="UT"
ts_row_small["office_zip"]="84096"
ts_row_small["federal_agency"]="9"
ts_row_small["lar_entries"]="100"
ts_row_small["tax_id"]="01-0123456"
ts_row_small["lei"]="12345678901234567890"



In [9]:
#join LAR and TS rows to make an output file

def write_file(ts_input=None, lar_input=None, directory="../edits_files/", name="passes_all.txt"):
    """Takes a TS row as a dictionary and LAR data as a dataframe. Writes LAR data to file and 
    re-reads it to combine with TS data to make a full file."""
    #make directories for files if they do not exist
    if not os.path.exists(directory):
        os.makedirs(directory)

    #write LAR dataframe to file
    parts_dir = directory+"file_parts/"
    if not os.path.exists(parts_dir):
        os.makedirs(parts_dir)
    lar_input.to_csv(parts_dir + "lar_data.txt", sep="|", header=False, index=False)
    #load LAR data as file rows
    with open(parts_dir + "lar_data.txt", 'r') as lar_data:
        lar = lar_data.readlines()
    with open(directory + name, 'w') as final_file:
        final_file.write("|".join(ts_input.values())+"\n")
        for line in lar:
            final_file.write("{line}\n".format(line=line))
            
#load LAR and TS data as strings of file rows
#write file rows to same file

In [10]:
#write sample file to disk
write_file(ts_input=ts_row_small, lar_input=lar_frame)

In [11]:
ts_df, lar_df = validator.split_ts_row(path="../edits_files/", data_file="passes_all.txt")

In [13]:
lar_df

Unnamed: 0,record_id,lei,uli,app_date,loan_type,loan_purpose,preapproval,const_method,occ_type,loan_amount,...,aus_code_5,aus_result_1,aus_result_2,aus_result_3,aus_result_4,aus_result_5,aus_code_16,reverse_mortgage,open_end_credit,business_purpose
0,2,1L2I04KELRCJG26MZ3EE,1L2I04KELRCJG26MZ3EEROTJELM5580CS1CBJZJ8L86E0,20180615.0,1.0,1.0,2.0,2.0,2.0,8742.0,...,2VEO6EADLKM0X9U1MHTC1OAGKSZCQX75ZLULMLNI3441J6...,5.0,6.0,13.0,2.0,10.0,,1.0,1.0,2\n
1,\n,,,,,,,,,,...,,,,,,,,,,
2,2,U7D0PBVDK15U6E1LQXXQ,U7D0PBVDK15U6E1LQXXQT95PXY818IITP8GYD4HUTUWGC,,2.0,1.0,2.0,2.0,1.0,7225.0,...,B5DLOID767GB28O0K8PRHIE17E7,4.0,,1.0,,,,2.0,2.0,2\n
3,\n,,,,,,,,,,...,,,,,,,,,,
4,2,QUGKQZGKU0OUWBOZF8RM,QUGKQZGKU0OUWBOZF8RMZKLL1CUZIWOUMEXACXMGCQJQX,,4.0,5.0,2.0,2.0,1.0,4377.0,...,,7.0,6.0,,9.0,,,2.0,2.0,1\n
5,\n,,,,,,,,,,...,,,,,,,,,,
6,2,GP9QX2AFDZ974YL8B12G,GP9QX2AFDZ974YL8B12GCDAQC20MATW739WYPO34JIRMM,,1.0,1.0,2.0,2.0,2.0,8584.0,...,6JOX7ZBCZU9UQ4DRFS6SZID4XOQ33SKL1OYBKSA39CIZZJ...,2.0,4.0,,,2.0,,2.0,2.0,2\n
7,\n,,,,,,,,,,...,,,,,,,,,,
8,2,PJ0SCNJ66MT6OWJG1XG9,PJ0SCNJ66MT6OWJG1XG9GPUKY986K8EJSCWKBGOQFBOTB,20180925.0,2.0,1.0,2.0,2.0,1.0,1370.0,...,V5MFUBHRXP0GK64VJ6C61269YSUUM,2.0,,,15.0,11.0,,1.0,1.0,2\n
9,\n,,,,,,,,,,...,,,,,,,,,,
