# Ingesting Modified Loan Application Register Data

In [1]:
import pandas as pd
import sqlite3 as sql
import os

CFPB's schema for MLAR data:

In [10]:
# Scrape CFPB's published Modified LAR schema; the first HTML table on the
# page is the field-by-field layout.
schema_tables = pd.read_html('https://ffiec.cfpb.gov/documentation/publications/modified-lar/modified-lar-schema')
schema = schema_tables[0].set_index('Data Field Number')
# Blank out the index label so it does not show up as a header when displayed.
schema.index.name = ''
schema.head()

Unnamed: 0,Data Field Name,Data Field Type,Field Type Description,Valid Values,Descriptions and Examples,Data Point Name
,,,,,,
1.0,Activity Year,Numeric,Integer,2017 2018 2019 2020 2021 2022 2023,,
2.0,Legal Entity Identifier (LEI),Alphanumeric. Width is 20 characters,String,,Example: 12121212121212121212,Legal Entity Identifier (LEI)
3.0,Loan Type,Numeric,Integer,1 2 3 4,Descriptions: 1. Conventional (not insured or...,Loan Type
4.0,Loan Purpose,Numeric,Integer,1 2 31 32 4 5,Descriptions: 1. Home purchase 2. Home impro...,Loan Purpose
5.0,Preapproval,Numeric,Integer,1 2,Descriptions: 1. Preapproval requested 2. Pr...,Preapproval


Use schema to create SQLite table:

In [11]:
# Map each schema row to a SQLite column type, stored in a new 'row_type' column.
# Rows that match none of the rules below keep an empty type.
lei_field_type = 'Alphanumeric. Width is 20 characters'
field_type = schema['Data Field Type']
type_desc = schema['Field Type Description']

schema['row_type'] = ''
# The 20-character alphanumeric field gets a fixed-width text type.
schema.loc[field_type == lei_field_type, 'row_type'] = 'CHAR(20)'
# Integer fields that may also be blank / NA / Exempt are nullable integers.
schema.loc[type_desc.isin(['Integer or Blank', 'Integer or NA', 'Integer or NA or Exempt']), 'row_type'] = 'INT'
# Plain integer fields are required.
schema.loc[type_desc == 'Integer', 'row_type'] = 'INT NOT NULL'
# na=False: a blank description cell in the scraped table would otherwise put
# NaN in the mask, which .loc rejects with a ValueError.
schema.loc[type_desc.str.contains('Double', na=False), 'row_type'] = 'FLOAT'
# Every other string field is variable-width text.
schema.loc[(type_desc == 'String') & (field_type != lei_field_type), 'row_type'] = 'VARCHAR'

In [12]:
# Build the CREATE TABLE statement: one '"<field name>" <type>' line per schema row.
# Identifiers are wrapped in double quotes (the SQL identifier quote); single
# quotes denote string literals and SQLite only accepts them as identifiers
# for legacy compatibility. Embedded double quotes are escaped by doubling.
quoted_names = schema['Data Field Name'].astype(str).str.replace('"', '""', regex=False)
column_defs = '"' + quoted_names + '" ' + schema['row_type']
schema_str = "CREATE TABLE MLAR(\n" + ',\n'.join(column_defs) + ");"

In [5]:
# (Re)create an empty MLAR table from the generated CREATE TABLE statement.
connection = sql.connect('./cleaned/hmda.db')
try:
    cursor = connection.cursor()
    # Drop any previous copy so re-running the notebook starts from a clean table.
    cursor.execute("DROP TABLE IF EXISTS MLAR")
    cursor.execute(schema_str)
    connection.commit()
finally:
    # Close even if a statement fails, so the database file is not left open.
    connection.close()

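Raw data were downloaded from the [FFIEC/CFPB Modified LAR data publication page](https://ffiec.cfpb.gov/data-publication/modified-lar/2023). Save each file in `./raw`; the loading cell below reads every file in that directory whose name contains `mlar`.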

Read data and save to SQLite:

In [6]:
# Load every Modified LAR file found in ./raw and append its rows to the MLAR table.
# sorted() makes the load order deterministic; os.listdir() returns entries in
# arbitrary order.
for filename in sorted(os.listdir('./raw')):
    if 'mlar' not in filename:
        continue
    filepath = f'./raw/{filename}'

    # Read the file into a pandas DataFrame. It is parsed as pipe-delimited with
    # no header row, so column names come from the schema; 'NA' and 'Exempt'
    # are read as missing values.
    # The progress label is the first four characters of the filename
    # (the year, judging by the recorded output — confirm the naming convention).
    print(f'Reading {filename[0:4]}...')
    df = pd.read_csv(
        filepath,
        delimiter='|',
        na_values=['NA', 'Exempt'],
        header=None,
        names=schema['Data Field Name'],
        # These columns are kept as text rather than letting pandas infer a type.
        # NOTE(review): presumably they hold codes/ranges rather than plain
        # numbers — confirm against the schema.
        dtype={
            'State': 'str',
            'Census Tract': 'str',
            'Age of Applicant or Borrower': 'str',
            'Age of Applicant >= 62': 'str',
            'Age of Co-Applicant or Co-Borrower': 'str',
            'Age of Co-Applicant >= 62': 'str',
            'Debt-to-Income Ratio': 'str',
            'Total Units': 'str',
        },
    )

    # Append to the SQLite table.
    print('Saving to SQLite...')
    connection = sql.connect('./cleaned/hmda.db')
    try:
        # `with connection` only commits or rolls back the transaction; it does
        # not close the connection, hence the explicit close() below.
        with connection:
            df.to_sql(name='MLAR', con=connection, index=False, if_exists='append')
    finally:
        connection.close()

    # Drop the DataFrame before the next file is read to free memory.
    del df

print('Done.')
        


Reading 2022...
Saving to SQLite...
Reading 2023...
Saving to SQLite...
Done.


In [18]:
# Read the 2022 public "ts" file (pipe-delimited, with a header row); each row
# carries an institution's LEI, name, address fields and LAR count.
# ZIP codes are identifiers, not numbers: force text so pandas can never infer
# an integer column, which would drop leading zeros and break later string handling.
ts = pd.read_csv(
    './raw/2022_public_ts_pipe.txt',
    delimiter='|',
    dtype={'respondent_zip_code': 'str'},
)
ts.head()

Unnamed: 0,activity_year,calendar_quarter,lei,tax_id,agency_code,respondent_name,respondent_state,respondent_city,respondent_zip_code,lar_count
0,2022,4,549300BI43WZOK3XF350,43-1894848,3,Bank of Franklin County,MO,Washington,63090,264
1,2022,4,549300HH2N9Q3HH7XL57,30-0999121,7,"CCM FINANCE, LLC",MN,MINNEAPOLIS,55439,97
2,2022,4,549300EHQ0Y7SP41BR91,20-8803449,7,"Athas Capital Group, Inc.",CA,Calabasas,91301,6064
3,2022,4,549300HN58ONH5KNJJ12,45-4156623,7,"Thompson Kane & Company, Inc",WI,Madison,53717,1278
4,2022,4,549300M0I314MIP2FT97,66-0690568,7,ACTUAL MORTGAGE BANKERS,PR,BAYAMON,960,355


In [19]:
# Reduce the sheet to one row per LEI, keeping only the identifying columns.
# NOTE(review): groupby().max() takes the maximum of each column independently,
# so if an LEI has several differing rows the result can mix values from
# different rows — confirm LEIs are unique in the raw file or pick one row explicitly.
ts = ts.drop(columns=['calendar_quarter', 'tax_id', 'agency_code', 'lar_count']).groupby('lei').max().reset_index()
# Normalise city capitalisation (e.g. 'MINNEAPOLIS' -> 'Minneapolis').
ts['respondent_city'] = ts['respondent_city'].str.title()
# Keep the 5-digit ZIP: truncate longer values, then left-pad short ones with
# zeros (the raw file has e.g. '960' for Bayamon, PR, which is ZIP 00960).
ts['respondent_zip_code'] = ts['respondent_zip_code'].str[:5].str.zfill(5)
ts

Unnamed: 0,lei,activity_year,respondent_name,respondent_state,respondent_city,respondent_zip_code
0,01ERPZV3DOLNXY2MLB90,2022,Libertyville Bank & Trust NA,IL,Libertyville,60048
1,01J4SO3XTWZF4PP38209,2022,TRUSTMARK NATIONAL BANK,MS,Jackson,39201
2,01KWVG908KE7RKPTNP46,2022,HomeStreet Bank,WA,Seattle,98101
3,03D0JEWFDFUS0SEEKG89,2022,TD Bank,NJ,Mount Laurel,08054
4,0K2D5AK28E3O5CC06E35,2022,Silicon Valley Bank,CA,Santa Clara,95054
...,...,...,...,...,...,...
4455,YQI2CPR3Z44KAR0HG822,2022,The Park Bank,WI,Madison,53713
4456,YWC0TIKBQM2JV8L4IV08,2022,FIRST REPUBLIC BANK,CA,San Francisco,94111
4457,Z867SNMO7WGY8TTGQG78,2022,"SUMMIT COMMUNITY BANK, INC",WV,Moorefield,26836
4458,ZF85QS7OXKPBG52R7N18,2022,Associated Bank NA,WI,Green Bay,54301


In [20]:
# (Re)create the Institutions table and load the cleaned `ts` frame into it.
connection = sql.connect('./cleaned/hmda.db')
try:
    # `with connection` commits on success and rolls back on error, but does
    # not close the connection; the finally block does that.
    with connection:
        cursor = connection.cursor()
        cursor.execute("DROP TABLE IF EXISTS Institutions")
        # Column names use double quotes (SQL identifier quoting); single quotes
        # denote string literals and are only tolerated by SQLite for legacy reasons.
        cursor.execute("""
CREATE TABLE Institutions(
"lei" VARCHAR(20) NOT NULL,
"activity_year" INT NOT NULL,
"respondent_name" VARCHAR(70) NOT NULL,
"respondent_state" CHAR(2),
"respondent_city" VARCHAR(25),
"respondent_zip_code" CHAR(5)
);
""")
        ts.to_sql(name='Institutions', con=connection, index=False, if_exists='append')
finally:
    # The connection is closed here; any later use must open its own, as the
    # other cells in this notebook do.
    connection.close()