In [1]:
import yaml
import json
import pandas as pd
import os
from os import listdir
from os.path import isfile, join

from rules_engine import rules_engine
import utils

# Let wide result frames show up to 999 columns instead of being truncated.
pd.set_option("display.max_columns", 999)

In [2]:
# Read the clean-file configuration. yaml.safe_load is used on purpose
# instead of yaml.load.
with open('configurations/clean_file_config.yaml') as config_file:
    data_map = yaml.safe_load(config_file)

# 'file_length' -> 'value' from the configuration; the batch check cell
# compares LAR fail counts against this number.
file_length = data_map['file_length']['value']


In [3]:
# Column names assigned to the pipe-delimited platform file, in file order
# (the file is read with header=None, so names are positional).
cbsa_cols = ['name', 'metDivName', 'state', 'countyFips', 'county', 'tracts','geoIdMsa', 'metDivFp', 'smallCounty',
             'stateCode', 'tractDecimal']
# Subset of cbsa_cols that is actually loaded.
use_cols = ['name', 'metDivName', 'countyFips', 'geoIdMsa', 'metDivFp', 'smallCounty', 'tracts', 'stateCode']

#load tract to CBSA data from platform file; every column is read as text
cbsas = pd.read_csv('../dependencies/tract_to_cbsa_2015.txt', usecols=use_cols, delimiter='|',
                    header=None, names=cbsa_cols, dtype=str)

# tractFips is the county FIPS string followed by the tract string.
cbsas["tractFips"] = cbsas.countyFips + cbsas.tracts
counties = list(cbsas.countyFips)
tracts = list(cbsas.tractFips)
# County FIPS values of the rows whose smallCounty flag is the string "1".
small_counties = list(cbsas.countyFips[cbsas.smallCounty=="1"])

#load schemas for LAR and transmittal sheet
#schemas contain valid enumerations, including NA values, for each field in the dataset
# The files are opened in `with` blocks so the handles are closed as soon as
# the JSON has been parsed (the previous json.load(open(...)) left them open).
with open("../schemas/lar_schema.json", "r") as lar_schema_file:
    lar_schema_df = pd.DataFrame(json.load(lar_schema_file))
with open("../schemas/ts_schema.json", "r") as ts_schema_file:
    ts_schema_df = pd.DataFrame(json.load(ts_schema_file))
!pwd

/Users/roellk/hmda/hmda-test-files/python


In [6]:

file = "v662_1.txt"
#file = "clean_file_100_rows.txt"

# The first character of the file name selects the edit category; any other
# first character means a clean file, which is read from the edits_files root.
file_type = {"v": "validity", "s": "syntax", "q": "quality"}.get(file[0], "clean")
path = "../edits_files/" if file_type == "clean" else "../edits_files/" + file_type + "/"

# Build the rules engine from the two schemas and the CBSA frame
# (the older tracts/counties/small_counties keywords are no longer passed).
checker = rules_engine(lar_schema=lar_schema_df, ts_schema=ts_schema_df, cbsa_data=cbsas)

# Read the transmittal sheet and LAR frames and hand them to the checker.
ts_df, lar_df = utils.read_data_file(path=path, data_file=file)
checker.load_data_frames(ts_df, lar_df)

# Run every checker attribute named like an edit: one of s/v/q followed by
# three digits (e.g. "v662").
for func in dir(checker):
    is_edit = func[:1] in ("s", "v", "q") and func[1:4].isdigit()
    if not is_edit:
        continue
    #print("applying:", func)
    getattr(checker, func)()

res_df = pd.DataFrame(checker.results)

# Display the failed result rows for the edit named by the file
# (file name without its 4-character extension).
res_df[(res_df.edit_name == file[:-4]) & (res_df.status == "failed")]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0,edit_name,fail_count,fields,row_ids,row_type,status


In [None]:
#get all edit test files in single list
val_path = "../edits_files/validity/"
syn_path = "../edits_files/syntax/"
# Only regular files with the expected prefix are kept; sorted() makes the run
# order deterministic because listdir returns entries in arbitrary order.
val_files = sorted(f for f in listdir(val_path) if isfile(join(val_path, f)) and f.startswith("v"))
syn_files = sorted(f for f in listdir(syn_path) if isfile(join(syn_path, f)) and f.startswith("s"))


def describe_edit_outcome(edit_rows, expected_fails):
    """Decide whether an edit test file triggered its own edit as expected.

    Parameters
    ----------
    edit_rows : pandas.DataFrame
        Checker results already filtered to the edit under test. The columns
        ``row_type``, ``status`` and ``fail_count`` of the first row are read.
    expected_fails : int
        Number of LAR rows expected to fail (the configured file length).

    Returns
    -------
    tuple of (bool, str)
        ``True`` plus a short note when the file behaves as expected,
        otherwise ``False`` plus the reason.

    A TS result is good when its status is "failed". A LAR result is good when
    its fail count equals ``expected_fails``. Missing results, non-numeric fail
    counts and unknown row types are reported as problems instead of raising.
    """
    if edit_rows.empty:
        return False, "no result was recorded for this edit"

    first = edit_rows.iloc[0]
    row_type = first["row_type"]

    if row_type == "TS":
        if first["status"] == "failed":
            return True, "TS edit failed as expected"
        return False, "TS edit status is {!r}, expected 'failed'".format(first["status"])

    if row_type == "LAR":
        try:
            # int() is applied for both file types so the comparison does not
            # depend on how the checker stores the count.
            fail_count = int(first["fail_count"])
        except (TypeError, ValueError):
            return False, "fail_count {!r} is not a whole number".format(first["fail_count"])
        if fail_count == expected_fails:
            return True, "all {} LAR rows failed".format(fail_count)
        return False, "{} LAR rows failed, expected {}".format(fail_count, expected_fails)

    return False, "unexpected row_type {!r}".format(row_type)


#check validity test files, then syntax test files, for error rate
for edit_path, edit_files in ((val_path, val_files), (syn_path, syn_files)):
    for file in edit_files:
        print(file)
        # Same constructor arguments as the single-file cell: the engine is
        # built from the schemas and the CBSA frame.
        checker = rules_engine(lar_schema=lar_schema_df, ts_schema=ts_schema_df, cbsa_data=cbsas)
        #load data to checker
        ts_df, lar_df = utils.read_data_file(path=edit_path, data_file=file)
        checker.load_data_frames(ts_df, lar_df)
        # Run every syntax/validity edit: attributes named s### or v###.
        for func in dir(checker):
            if func[:1] in ("s", "v") and func[1:4].isdigit():
                getattr(checker, func)()
        res_df = pd.DataFrame(checker.results)
        #set res_df for only the edit in the file name (name minus 4-character extension)
        if "edit_name" in res_df.columns:
            res_df = res_df[res_df.edit_name == file[:-4]]
        else:
            # No results at all: keep an empty frame so the helper reports it.
            res_df = res_df.iloc[0:0]

        file_ok, detail = describe_edit_outcome(res_df, file_length)
        if file_ok:
            print("file is good")
        else:
            print("WARNING BOOOOOP", detail)
        print()

In [None]:
#load CBSA data for geography testing edits
# Positional names for the columns of the header-less, pipe-delimited file.
cbsa_cols = [
    'name', 'metDivName', 'state', 'countyFips', 'county', 'tracts',
    'geoIdMsa', 'metDivFp', 'smallCounty', 'stateCode', 'tractDecimal',
]
# Columns kept from the file.
use_cols = [
    'name', 'metDivName', 'countyFips', 'geoIdMsa', 'metDivFp',
    'smallCounty', 'tracts',
]
#load tract to CBSA data from platform file
cbsa_data = pd.read_csv(
    '../dependencies/tract_to_cbsa_2015.txt',
    sep='|',
    header=None,
    names=cbsa_cols,
    usecols=use_cols,
    dtype=object,
)


In [None]:
# Count the LAR rows whose ULI starts with the LEI of the first LAR row
# (only the first 20 characters of each ULI are compared).
int((lar_df.lei.iloc[0] == lar_df.uli.apply(lambda uli: uli[:20])).sum())

In [None]:
# Gather the row_ids entries of the failed result(s) for the edit named by
# the current file (file name without its 4-character extension).
row_ids = [
    ids
    for ids in res_df.row_ids[(res_df.edit_name == file[:-4]) & (res_df.status == "failed")]
]
# Optional debugging aid, left disabled:
#for row_id in row_ids[0]:
#    print(row_id, len(row_id), row_id[:20]==lar_df.lei.iloc[0])

In [None]:
# Print the ULI (and its length) of every LAR row that the edit under test
# did NOT report as failing.
row_ids = list(res_df.row_ids[(res_df.status=="failed")&(res_df.edit_name==file[:-4])])
# When the edit has no failed result, row_ids is empty and row_ids[0] would
# raise IndexError; treat that as "no rows were flagged".
# NOTE(review): row_ids[0] is presumably a list of ULIs - confirm against rules_engine.
failed_ids = row_ids[0] if row_ids else []
# Iterate the uli column directly instead of building a row object per record.
for uli in lar_df.uli[~lar_df.uli.isin(failed_ids)]:
#    print(uli, len(uli), type(uli), uli[:20]==ts_df.lei.iloc[0])
    print(uli, len(uli))

In [None]:
# Show the LAR rows whose app_score_name is one of the values "NA", "-1"
# or the empty string.
lar_df.loc[
    lar_df.app_score_name.isin(["NA", "-1", ""])
]