# Imports and Paths

In [44]:
import re

import pandas as pd
# Set the maximum number of columns to None (unlimited)
pd.set_option('display.max_columns', None)

# Set the display option to show all rows
pd.set_option('display.max_rows', None)
# 1. Set options to show all columns and full column content
pd.set_option('display.max_columns', None)      # Show all columns
pd.set_option('display.max_colwidth', None)     # Show full width of cell content
pd.set_option('display.expand_frame_repr', False) # Don't wrap to multiple lines

import numpy as np

# Post Processing

In [45]:
final_df = pd.read_csv('master_output1.csv')
final_df.head()

Unnamed: 0,source_pdf,row_in_pdf,raw_text,lender,clsing_dt,endrsmt_dt,tmntn_dt,Borr_Age,Coborr_Age,Borr_Cnt,es_status,int_rt,int_rt_10yr,hecm_margin,pd_stmln_flg,rt_typ,arm_indx_typ,arm_prdc_typ,max_clm_amt,init_prncpl_lmt,hecm_orgntn_fees,prop_addr_zip_cd,loan_typ
0,100,1,"""GMFS LLC"" 12/31/2012 2/25/2013 2/16/2017 64 1 “Terminated 5.06 5.06 oR uo 117000 74529 0 ‘36108 *02""",GMFS LLC,12/31/2012,2/25/2013,2/16/2017,64.0,,1.0,Terminated,5.06,5.06,,,,,,117000.0,74529.0,0,36108.0,2.0
1,100,2,"""ONE REVERSE MORTGAGE LLC’ 12/15/2012 1/25/2013 23 73 2 ‘Assigned 4.5 4.5 o'N’ r uo uo 200000 136200 o ""36312 ""02""",ONE REVERSE MORTGAGE LLC,12/15/2012,1/25/2013,,,,,,4.5,4.5,,O,N,R,,200000.0,136200.0,0,36312.0,2.0
2,100,3,"""SUN WEST MORTGAGE CO_INC’ 1/11/2013, 3/4/2013 ca 1 ‘Assigned 5.3 5.3 oN i i a 130000 86710 0 '36571 ""02""",SUN WEST MORTGAGE CO_INC,1/11/2013,3/4/2013,,,,,,5.3,5.3,,I,I,A,,130000.0,86710.0,0,36571.0,2.0
3,100,4,"“MSR_ASSET VEHICLE LLC’ 2/20/2013 7/25/2013 76 1 ‘Assigned 5.06 5.06 o IN’ Fr uo 300000 207900 5000 ""36078 ""02""",“MSR_ASSET VEHICLE LLC,2/20/2013,7/25/2013,,,,,,5.06,5.06,,O,,,,300000.0,207900.0,0,36078.0,2.0
4,100,5,"""SUN WEST MORTGAGE CO_TNC’ 1/26/2013 4/22/2013 2 1_‘Endorsed 4,99 4.99 oN’ - — uo 240000 161040 2500 ""35966 ""02""",SUN WEST MORTGAGE CO_TNC,1/26/2013,4/22/2013,,,,,,4.99,,,,,,,240000.0,161040.0,0,35966.0,2.0


In [46]:
# Calculate the percentage of NaNs for each column
(final_df.isna().sum() / len(final_df)) * 100

source_pdf           0.000000
row_in_pdf           0.000000
raw_text             0.000000
lender               1.759782
clsing_dt            5.839955
endrsmt_dt          11.366537
tmntn_dt            55.488900
Borr_Age            66.719638
Coborr_Age          91.327870
Borr_Cnt            66.719638
es_status           49.990929
int_rt              10.424033
int_rt_10yr         25.335427
hecm_margin         62.993682
pd_stmln_flg        12.884058
rt_typ              39.815480
arm_indx_typ        79.943711
arm_prdc_typ        98.766929
max_clm_amt          3.121809
init_prncpl_lmt      9.901365
hecm_orgntn_fees     0.000000
prop_addr_zip_cd    21.657674
loan_typ             1.311129
dtype: float64

In [47]:
final_df.isna().sum()

source_pdf               0
row_in_pdf               0
raw_text                 0
lender               16976
clsing_dt            56336
endrsmt_dt          109649
tmntn_dt            535282
Borr_Age            643621
Coborr_Age          881008
Borr_Cnt            643621
es_status           482245
int_rt              100557
int_rt_10yr         244402
hecm_margin         607678
pd_stmln_flg        124288
rt_typ              384086
arm_indx_typ        771189
arm_prdc_typ        952770
max_clm_amt          30115
init_prncpl_lmt      95515
hecm_orgntn_fees         0
prop_addr_zip_cd    208924
loan_typ             12648
dtype: int64

## es_status

If raw_text column contains any of the strings in (erminate, ssigne, ndorse), es_status must be given the values Terminated, Assigned, Endorsed respectively.

In [48]:
# Count the number of NaN values in the 'es_status' column
print('Before Operation: ')
nan_count = final_df['es_status'].isna().sum()
print(f"Number of NaN values in es_status: {nan_count}")
print(list(final_df['es_status'].unique()))

# Define the mapping for CamelCase results
status_map = {
    'rminate': 'Terminated',
    'ssigne': 'Assigned',
    'ndorse': 'Endorsed',
    'rninate': 'Terminated',
    'dorssed': 'Endorsed',
    'dosed': 'Endorsed',
    'dorsod': 'Endorsed',
    'andrea' : 'Endorsed',
    'erinate': 'Terminated',
    'vminate': 'Terminated',
    'adored' : 'Endorsed',
    'Assizned' : 'Assigned',
    'Temnaed' : 'Terminated',
    'Temnted' : 'Terminated',
    'Teminsed' : 'Terminated',
    'Temrated' : 'Terminated',
    'Tememed' : 'Terminated',
    'Temnated' : 'Terminated',
    'Teminated' : 'Terminated',
    'Temnised' : 'Terminated',
    'Temesed': 'Terminated',
    'Temmmed': 'Terminated',
    'Teminaied': 'Terminated',
    'Temrated': 'Terminated',
    'Tememed': 'Terminated',
    'Temas': 'Terminated',
    'Temeses': 'Terminated',
    'Teminaied':'Terminated',
    'Temes':'Terminated',
    'Teminaed':'Terminated',
    'Teminaed':'Terminated',
    'Temeated':'Terminated',
    'Temurated':'Terminated',
    'Temnated':'Terminated',
    'Temiewtes':'Terminated',
    'Temnwed':'Terminated',
    'Temes':'Terminated',
    'Teminsted':'Terminated',
    'Temrated':'Terminated',
    'Temnsed': 'Terminated',
    'Temmates':'Terminated',
    'Temnwed':'Terminated',
    'Tememes':'Terminated',
    'Teminatas':'Terminated',
    'Asuged':'Assigned',
    'Asogred':'Assigned',
    'Asugred':'Assigned',
    'Enmres':'Endorsed',
    'Exess':'Endorsed',
    'Enaones': 'Endorsed',
    'Temnaed' : 'Terminated',
    'Temrated' : 'Terminated',
    'Teminised' : 'Terminated',
    'Tememned' : 'Terminated',
    'Temnated' : 'Terminated',
    'Temeses' : 'Terminated',
    'Temmesed' : 'Terminated',
    'Temmmed' : 'Terminated',
    'Teminaied' : 'Terminated',
    'Temesed' : 'Terminated',
    'Teminaed' : 'Terminated',
    'Temwetse' : 'Terminated',
    'Teminast' : 'Terminated',
    'Temiewtes' : 'Terminated',
    'Temned' : 'Terminated',
    'Teminsted' : 'Terminated',
    'Teminas' : 'Terminated',
    'Teminatas' : 'Terminated',
    'Asuged' : 'Assigned',
    'Asogred' : 'Assigned',
    'Asugred' : 'Assigned',
    'Enmres' : 'Endorsed',
    'Enaones' : 'Endorsed',
    'Enaoses' : 'Endorsed'
}

# Create a regex pattern from the keys (terminated|assigned|endorsed)
pattern = '|'.join(status_map.keys())

# 1. Identify rows where 'es_status' is NaN
mask_is_nan = final_df['es_status'].isna()

# 2. Extract the matching string from 'raw_text' (case-insensitive)
# This returns the lowercase version found in the text
extracted = final_df['raw_text'].str.extract(f'({pattern})', expand=False, flags=re.IGNORECASE).str.lower()

# 3. Apply the update
# Only update if es_status is NaN AND a match was found in raw_text
final_df.loc[mask_is_nan, 'es_status'] = extracted.map(status_map)

# Count the number of NaN values in the 'es_status' column
print('After Operation: ')
nan_count = final_df['es_status'].isna().sum()
print(f"Number of NaN values in es_status: {nan_count}")
print(list(final_df['es_status'].unique()))

Before Operation: 
Number of NaN values in es_status: 482245
['Terminated', nan]
After Operation: 
Number of NaN values in es_status: 563
['Terminated', 'Assigned', 'Endorsed', nan]


In [49]:
# 1. Set the option to show unlimited column width
pd.set_option('display.max_colwidth', None)

# 2. Define your columns and filter (as you did in your screenshot)
cols_to_show = ['source_pdf', 'row_in_pdf', 'raw_text']
nan_status_df = final_df[final_df['es_status'].isna()]

# 3. Display the first 20 rows of these columns
# Using .head(20) prevents the notebook from lagging if there are many rows
nan_status_df[cols_to_show]

Unnamed: 0,source_pdf,row_in_pdf,raw_text
234,1,1,"""Lender"" ""clsng dt’ ‘endrsmnt_dt' trmtn_dt' ‘Horr Age’ 'Coborr_Age’ ‘Horr Cnt' ‘cs status’ ‘int _rt' ‘int_rt lyr’ ‘hecm margin’ ‘pd_strmln fig’ 'rt_typ' ‘arm indx typ’ ‘arm prde typ’ ""max clm amt’ “init prncpl_Imt’ ‘hecm orgntn fees’ ""prop _addr_zip_cd’ loan_typ"
37852,10506,23,"['WELLS FARGO BANK NAY uyaoo5} Psa} sa af fof fut ay 2000 7zasi| of '98360 ""02"""
70957,10955,4,[TRINANCIAL FREEDOM SENIOR FUNDING corp’ | 11/20/2002] i/a2/ao0sf |] Assizned TP .01f 58 fin aye 60000 oraao.24[ 0 [98685 Fon
72856,10980,24,"[TRINANCE OF AMERICA REVERSE LLC’ | 8/15/2012] 9/io/aorz{ ff Assizned [5.06 506 one is00 [7750.5] 200 [""2655a ""02"" |"
111150,11495,10,"""WORLD ALLIANCE FINANCIAL CORP. ' 10/27/2008 12/22/2008 69 68 2 5. 681 5.67 1.5 'N ae 'L! ‘iM! 157000 156999 3140 '62431 | '02"""
116737,1157,23,"""PINANCIAL FREEDOM SENIOR FUNDING CORP"" 1/24/2006 9/19/2006 76 1 5.93 5.86 1.5 'N' Ww T ny 312000 221520 0 ""95822 ""02"""
120780,11623,22,"""FINANCIAL FREEDOM SENIOR FUNDING CORP"" 1/8/2007 __ 8/17/2009 a 1 6.49 6.17 1.5 'N' w 'T ‘iy! 155500 97498. 5 0 ""60620 ' ""02"""
125235,11683,49,"""WORLD ALLIANCE FINANCIAL CORP. ' 7/23/2008 __ 9/30/2008 68 1 3.7L 5.36 1.5 'N' ww 'T! ‘iw! 160000 159999 0 ‘60419 ""02"""
131998,11774,26,"""PINANCE OF AMERICA REVERSE LLC’ 11/23/2009 2/11/2010 84 83 2 5. 56 5. 56 o'N' ""Fr ut i 300000 216300 2254 '60630 | 02"""
133549,11795,35,"""WELLS FARGO BANK NA"" 10/8/2010 11/30/2010 67 1 4.99 4.99 oN rt a co 265000 160325 0 ""60630 ' ""02"""


## lender

See if in a row where lender is NaN and the raw text contains a string that is already there in the lender column

In [50]:
# 1. Get unique non-null lender values
lender_values = final_df["lender"].dropna().unique()

# 2. Create a regex pattern from lender values (escaped for safety)
pattern = r'(' + '|'.join(map(re.escape, lender_values)) + r')'

# 3. Only update rows where lender is NaN
mask = final_df["lender"].isna()

# 4. Extract matching lender from raw_text
final_df.loc[mask, "lender"] = (
    final_df.loc[mask, "raw_text"]
    .str.extract(pattern, expand=False)
)

Remove all non alphabet characters from the beginning or the end

In [51]:
nan_count = final_df['lender'].isna().sum()
print(f"Number of NaN values in lender: {nan_count}")

Number of NaN values in lender: 787


Remove all special characters from beginning or end

In [52]:
# Total non unique values in the column lender
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Remove all the special characters and spaces from the beginning or the end
final_df['lender'] = final_df['lender'].str.replace(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', regex=True)

# Total non unique values in the column lender
print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
48399
After Operation: 
42787


Make all names uppercase

In [53]:
# Total non unique values in the column lender
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Turn all the characters to UPPER STRING
final_df['lender'] = final_df['lender'].str.upper()

# Total non unique values in the column lender
print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
42787
After Operation: 
40299


Most Frequent lenders

In [54]:
# Returns a Series where the index is the Lender and the value is the count
final_df['lender'].value_counts()

lender
WELLS FARGO BANK NA                                                                                                                   99314
AMERICAN ADVISORS GROUP                                                                                                               72899
FINANCIAL FREEDOM SENIOR FUNDING CORP                                                                                                 39340
PINANCIAL FREEDOM SENIOR FUNDING CORP                                                                                                 35416
BANK OF AMERICA NA CHARLOTTE                                                                                                          35193
SEATTLE MORTGAGE COMPANY                                                                                                              32458
METLIFE BANK, NATIONAL ASSOCIATION                                                                                                    28326
ONE REVERSE M

### FARGO

In [55]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "RGO B"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['WELLS FARGO BANK NA',
 'WELLS FARGO BANK NAT',
 'WELLS BARGO BANK NA',
 'WELLS FARGO BANK NAY',
 'TWELLS FARGO BANK NAT   72172006',
 'TWELES FARGO BANK NAT',
 'WELLS FARGO BANK NAY 27',
 'WELLS FARGO BANK NAT A /EDA006',
 'WELLS FARGO BANK NAY AS FO007',
 "WELLS FARGO BANK NA' /YA007",
 'WELLS FARGO BANK NAT 11 /A',
 'TWELLS FARGO BANK NAT',
 'WELLS FARGO BANK NAY TI /2006',
 'WELLS FARGO BANK NAT 142006',
 "WELLS FARGO BANK NA' /T/2006",
 'WELLS FARGO BANK NAT AS /A/2006',
 "WELLS FARGO BANK NA'   T",
 "WELLS FARGO BANK NA'   T/T",
 'WELLS FARGO BANK NAT A2 /2006',
 "WELLS FARGO BANK NA' A/T6Y2006",
 'WELLS FARGO BANK NAY ASF 2006',
 'WELLS FARGO BANK NAT AA /AAO06',
 'WELLS FARGO BANK NAY   17',
 'WELLS FARGO BANK NAT APA /A006',
 'WELLS FARGO BANK NAY   APE /E006',
 'WELLS FARGO BANK NAT A2 /A006',
 'TELLS FARGO BANK NAT',
 'WELLS FARGO BANK NAT AP/I9F2006 A/I',
 'WELLS FARGO BANK NAT AF FO007',
 'WELLS FARGO BANK NAT   P/AA/O006',
 'WELLS FARGO BANK NAT   A/APZOO07',
 'WELLS FAR

In [56]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('RGO B', na=False), 'lender'] = 'WELLS FARGO BANK NA'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
40299
After Operation: 
35202


### AMERICAN ADVISORS GROUP

In [57]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "ISOR"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['AMERICAN ADVISORS GROUP',
 'AMERICAN ADVISORS GROUP\'   A/TAZYAONO] GENNI]   WA]   NDORSED "  5.625] 5.625] ANY TE G60',
 "AMERICAN ADVISORS GROUP'   A/TZAON0",
 'TAMERICAN ADVISORS GROUP™   87',
 "AMERICAN ADVISORS GROUP' _   2010",
 "AMERICAN ADVISORS GROUP'   T0IGY20N0",
 'TAMBRICAN ADVISORS GROUP',
 "TAMERICAN ADVISORS GROUP'   2Z/2Z0NN",
 "AMERICAN ADVISORS GROUP'   TI2Z0NI",
 "AMERICAN ADVISORS GROUP'   AON",
 "TAMERICAN ADVISORS GROUP'   A",
 'TAMERICAN ADVISORS GROUP   87',
 "AMERICAN ADVISORS GROUP'   A5 FO0NI",
 "AMERICAN ADVISORS GROUP'   A/ZA0NI",
 'TAMERICAN ADVISORS GROUP AVZI/AONN',
 "AMERICAN ADVISORS GROUP' 5 /TFA0NI",
 "AMERICAN ADVISORS GROUP'   AZAR",
 "AMERICAN ADVISORS GROUP'   A/ZZ/A0NI  5 /A",
 "AMERICAN ADVISORS GROUP'   W",
 'TAMERICAN ADVISORS GROUT   Z',
 "AMERICAN ADVISORS GROUP' 6A FO0NI",
 "AMERICAN ADVISORS GROUP'   ATIZ0NI",
 'TAMERICAN ADVISORS GROUP',
 'AMERICAN ADVISORS GROUT',
 "AMERICAN ADVISORS GROUP' _   G4Y2OUI  G/A",
 "AMERICAN ADVISORS GROUP

In [58]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('ADV', na=False), 'lender'] = 'AMERICAN ADVISORS GROUP'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
35202
After Operation: 
30168


### FINANCIAL FREEDOM SENIOR FUNDING CORP

In [59]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "R FU"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['PINANCIAL FREEDOM SENIOR FUNDING CORP',
 'FINANCIAL FREEDOM SENIOR FUNDING CORP',
 'PINANCTAL FREEDOM SENTOR FUNDING CORP',
 'RINANCTAL FREEDOM SENTOR FUNDING CORP',
 'BINANCIAL FREEDOM SENIOR FUNDING CORP',
 'RINANCIAL FREEDOM SENIOR FUNDING CORP',
 'PINANCIAL FREEDOM SENTOR FUNDING CORP',
 'TRINANCIAL FREEDOM SENIOR FUNDING CORP',
 'FINANCTAL FREEDOM SENIOR FUNDING CORP',
 'PINANCTAL FREEDOM SENIOR FUNDING CORP',
 "RINANCTAL FREEDOM SENIOR FUNDING CORP'   AZ/A/2006",
 'RINANCTAL FREEDOM SENIOR FUNDING CORP',
 "FINANCIAL FREEDOM SENIOR FUNDING CORP' _____I",
 'TRINANCTAL FREEDOM SENIOR FUNDING CORP',
 'FINANCIAL FREEDOM SENTOR FUNDING CORP',
 'FINANCIAL FREEDOM SENJOR FUNDING CORP',
 'BINANCTAL FREEDOM SENIOR FUNDING CORP',
 'FINANCIAL FREEDON SENIOR FUNDING CORP',
 'TFINANCIAL FREEDOM SENIOR FUNDING CORP',
 'TBINANCIAL FREEDOM SENIOR FUNDING CORP',
 'TFINANCTAL FREEDOM SENIOR FUNDING CORP',
 'TFINANCIAL FREEDOM SENTOR FUNDING CORP',
 "RINANCTAL FREEDOM SENIOR FUNDING CORP'   A/AZ/2

In [60]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('R FU', na=False), 'lender'] = 'FINANCIAL FREEDOM SENIOR FUNDING CORP'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
30168
After Operation: 
29745


### BANK OF AMERICA NA CHARLOTTE

In [61]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "A C"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['BANK OF AMERICA NA CHARLOTTE',
 'BANK OF _ANERICA NA CHARLOTTE',
 "BANK OF AMERICA NA CHARLOTTE'   N",
 'BANK OF AWERICA NA CUARLORTE',
 'BANK OF AWERICA NA CHARLOTTE',
 'BANK OP AMERICA NA CHARLOTTE',
 'TBANK OF AMERICA NA CHARLOTTE',
 "BANK OP AMERICA NA CHARLOTTE'   Z/T",
 'BANK OF ANERICA NA CHARLOTTE',
 'BANK OF AWERTCA NA CUARLORTE',
 "BANK OF AMERICA NA CHARLOTTE'   AN/A",
 'BANK OF AMERICA NA CHARLOTTE   AA /2I 2007',
 "BANK OF AMERICA NA CHARLOTTE'   AS/EIY2O07",
 "BANK OF AMBRICA NA CHARLOTTE'   AN",
 "BANK OF AMERICA NA CHARLOTTE'   17",
 "BANK OF AMERICA NA CHARLOTTE'   AP/A",
 "BANK OP AMERICA NA CHARLOTTE'   142008",
 "BANK OF AMBRICA NA CHARLOTTE'   AZ/AI/20O7",
 'BANK OF AMERICA NA CUARLOTTE',
 "BANK OF AMERICA NA CHARLOTTE'   P/AA/2007",
 "BANK OF AMERICA NA CHARLOTTE'   ATI 2Z008",
 "BANK OF AMERICA NA CHARLOTTE'   AZ /ZOO8",
 "BANK OP AMERICA NA CHARLOTTE'   A7TGY2008",
 "BANK OP AMERICA NA CHARLOTTE'   A/TI/2O08",
 "BANK OF AMERICA NA CHARLOTTE'   47",
 'BANK OF A

In [62]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('A C', na=False), 'lender'] = 'BANK OF AMERICA NA CHARLOTTE'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
29745
After Operation: 
28530


### SEATTLE MORTGAGE COMPANY

In [63]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "TLE M"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['SEATTLE MORTGAGE COMPANY',
 'TSEATTLE MORTGAGE COMPANY',
 "SEATTLE MORTGAGE CONPANY'   T",
 'SEATTLE MORTGAGE COWPANY',
 'SEATTLE MORTGAGE CONPANY',
 "TSEATTLE MORTGAGE COMPANY' 17",
 "SEATTLE MORTGAGE COMPANY'   A",
 "SEATTLE MORTGAGE COMPANY'   AZ/T/A003",
 "SEATTLE MORTGAGE COMPANY'   T",
 "TSEATTLE MORTGAGE COMPANY'   17",
 "SEATTLE MORTGAGE COMPANY'   TSI 2006",
 "SEATTLE MORTGAGE COMPANY'   AAA /2006",
 "SEATTLE MORTGAGE COMPANY' __ 27",
 "SEATTLE MORTGAGE COMPANY' __   2007",
 "SEATTLE MORTGAGE COMPANY'   ATZFA007",
 "SEATTLE MORTGAGE COMPANY'   A/A/A007",
 "SEATTLE MORTGAGE COMPANY'   2 FA007",
 "TSEATTLE MORTGAGE COMPANY'   27",
 "SEATTLE MORTGAGE COMPANY'  G T/2007",
 "SEATTLE MORTGAGE COMPANY'   ATIIGOA    GYZAVIGPA] TF   TERMINATED'   7A] A] EEE TATE YT TOGS",
 "SEATTLE MORTGAGE COMPANY'   A/AYIG95",
 "SEATTLE MORTGAGE COMPANY'   87",
 "SEATTLE MORTGAGE COMPANY'   A/TAYI995",
 "SEATTLE MORTGAGE COMPANY'   T0ISI995",
 "SEATTLE MORTGAGE COMPANY'   /AYIG95",
 "SEATTLE MORTGA

In [64]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('TLE M', na=False), 'lender'] = 'SEATTLE MORTGAGE COMPANY'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
28530
After Operation: 
28093


### METLIFE BANK, NATIONAL ASSOCIATION

In [65]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "K,"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['METLIFE BANK, NATIONAL ASSOCIATION',
 'WETLIFE BANK, NATIONAL ASSOCTATION',
 'ETLIFE BANK, NATIONAL ASSOCTATION',
 'WETLIFE BANK, NATIONAL ASSOCIATION',
 'METLIFE BANK, NATIONAL ASSOCTATION',
 'ETLIFE BANK, NATIONAL ASSOCIATION',
 'METLIFE BANK, NATIONAL ASSOCRATION',
 'METLIFE BANK, NATIONAL ASSOCATION',
 'IMBTLIFE BANK, NATIONAL ASSOCIATION',
 "METLIFE BANK, NATIONAL ASSOCRATION'   T",
 "METLIFE BANK, NATIONAL ASSOCRATION'   AZ/I/2008",
 "METLIFE BANK, NATIONAL ASSOCIATION'   N",
 "METLIFE BANK, NATIONAL ASSOCRATION'   TO/AI/2008",
 "METLIFE BANK, NATIONAL ASSOCIATION' __   TO",
 'METLIPE BANK, NATIONAL ASSOCIATION',
 'METLIBE BANK, NATIONAL ASSOCRATION',
 'WETLLFE BANK, NATIONAL ASSOCIATION',
 "METLIFE BANK, NATIONAL ASSOCIATION'   UU/A/2008",
 "METLIFE BANK, NATIONAL ASSOCIATION'   A",
 'METLIBE BANK, NATIONAL ASSOCIATION',
 'METLIFE BANK, NATIONAL ASSOCRATRON',
 "METLIFE BANK, NATIONAL ASSOCRATION'   UU",
 "WETLIFE BANK, NATIONAL ASSOCRATRON'   I",
 'METLLIFE BANK, NATIONAL ASSO

In [66]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('K,', na=False), 'lender'] = 'METLIFE BANK, NATIONAL ASSOCIATION'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
28093
After Operation: 
26956


### ONE REVERSE MORTGAGE LLC

In [67]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "NE R"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['ONE REVERSE MORTGAGE LLC',
 'ONE REVERSE MORTGAGE LIC',
 'ONE REVERSE WORTGAGE LLC',
 "ONE REVERSE MORTGAGE LLC'   TO/IGY2007",
 'ONE REVERSE WORTGAGE LUC',
 'TONE REVERSE MORTGAGE LLC',
 'TONE REVERSE MORTGAGE LIC',
 'TONE REVERSE WORTGAGE LLC',
 'ONE REVERSE ORTGAGE LIC',
 'ONE REVERSE WORTGAGE LIC',
 'ONE REVERSE ORTGAGE LUC',
 'ONE REVERSE MORTGAGE LUC',
 "TONE REVERSE MORTGAGE LLC' A2 /A008",
 "TONE REVERSE MORTGAGE LLC'   A/2D/A008",
 "ONE REVERSE MORTGAGE LIC' __—   TOI /2008",
 "ONE REVERSE MORTGAGE LIC' __   /9Y2008",
 "ONE REVERSE MORTGAGE LLC'   TZ /2008",
 "TONE REVERSE MORTGAGE LLC' TA",
 "TONE REVERSE MORTGAGE LLC'   A/Z2009",
 "TONE REVERSE MORTGAGE LLC'   AA",
 "TONE REVERSE MORTGAGE LLC'   A",
 'TONE REVERSE WORTGAGE LUC',
 'ONE REVERSE ORTGAGE LLC',
 "ONE REVERSE MORTGAGE LLC'   A/I/2009",
 "TONE REVERSE MORTGAGE LLC'   FT /O009",
 "TONE REVERSE WORTGAGE LLC'   27",
 "TONE REVERSE MORTGAGE LLC'   A/T",
 'LONE REVERSE MORTGAGE LLC',
 "TONE REVERSE MORTGAGE LLC'   AT 

In [68]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('NE R', na=False), 'lender'] = 'ONE REVERSE MORTGAGE LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
26956
After Operation: 
24609


### GENERATION MORTGAGE COMPANY

In [69]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "ON MO"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['GENERATION MORTGAGE COMPANY',
 'GENERATION MORTGAGE COWPANY',
 'TGENERATION MORTGAGE COWPANY',
 "TGENERATION MORTGAGE COMPANY'   27",
 "GENERATION MORTGAGE COWPANY'   A",
 'GENERATION MORTGAGE COVPANY',
 "GENERATION MORTGAGE COMPANY'   A",
 'TGENERATION MORTGAGE COMPANY',
 "GENERATION MORTGAGE COWPANY'   A/I/2008",
 "GENERATION MORTGAGE COMPANY'   A/ADYAO08",
 "GENERATION MORTGAGE COMPANY'   ATA/Z008",
 "GENERATION MORTGAGE COMPANY'   AVZ 2008] S/ZO/ZOOS]  ] TA  TINDORSED § {AF AF ATT AY 26",
 'GENERATION MORTGAGE COMANY',
 'GENERATION MORTGAGE COMPAANY',
 "GENERATION MORTGAGE COMPANY'   617 /2008",
 'GENERATTON MORTGAGE COMPANY',
 "GENERATION MORTGAGE COMPANY'   G",
 'GENERATION MORTGAGE COWPAANY',
 "GENERATION MORTGAGE COMPANY' _—   G",
 "GENERATION MORTGAGE COMPANY'   T/3 2008] A/AN/EOO9  SVAS/EOZI  GS]     TERMINATED'   ANF SAG] TNT TAY 35",
 "GENERATION MORTGAGE COMPANY'   T0I",
 "GENERATION MORTGAGE COMPANY'   1A/T",
 "GENERATION MORTGAGE COWPANY'   A/4Y2009",
 "GENERATION MORT

In [70]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('ON MO', na=False), 'lender'] = 'GENERATION MORTGAGE COMPANY'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
24609
After Operation: 
23239


### REVERSE MORTGAGE FUNDING LLC

In [71]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "GE FU"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['REVERSE MORTGAGE FUNDING LCT   A',
 'REVERSE MORTGAGE FUNDING LLC',
 'REVERSE MORTGAGE FUNDING LLCT   A/APZYAOIALA/II/OOIA  PO NDORSED “  506] 06 FUT 63000] GIGS2S2',
 'REVERSE MORTGAGE FUNDING LUC',
 'REVERSE MORTGAGE FUNDING LLCT',
 'REVERSE MORTGAGE FUNDING LIC',
 'REVERSE MORTGAGE FUNDING LLCT   A/I/AOIA',
 "REVERSE MORTGAGE FUNDING LUC'   AT6Y20I4",
 'REVERSE MORTGAGE FUNDING LCT',
 "TREVERSE MORTGAGE FUNDING LLC'   7S /A0IA",
 'REVERSE MORTGAGE FUNDING LLC)   3ISYAOIA',
 'TREVERSE MORTGAGE FUNDING LLC',
 "TREVERSE MORTGAGE FUNDING LLC'   47",
 "REVERSE MORTGAGE FUNDING LLC'   A/TG/AOIAL",
 "REVERSE MORTGAGE FUNDING LUC'   AT2OIA",
 'REVERSE MORTGAGE FUNDING LUCY   A',
 "REVERSE MORTGAGE FUNDING LUC'   AO/A0IA  5",
 "REVERSE MORTGAGE FUNDING LIC'   A//Z0IA  5Y",
 "REVERSE MORTGAGE FUNDING LLC'   A//A0I4",
 "REVERSE MORTGAGE FUNDING LC'   A/T0Y2014",
 'REVERSE MORTGAGE FUNDING LLCT   A',
 'REVERSE MORTGAGE FUNDING LUCT',
 'IREVERSE MORTGAGE FUNDING LIC',
 "REVERSE MORTGAGE FUNDIN

In [72]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('GE FU', na=False), 'lender'] = 'REVERSE MORTGAGE FUNDING LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
23239
After Operation: 
22204


### JAMES B NUTTER AND COMPANY

In [73]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = " B "

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['JAMES B NUTTER AND COWPANY',
 'JAMES B NUTTER AND COMPANY',
 "VJAMES B NUTTER AND COWPANY'   N/I/2006",
 "JAMES B NUTTER AND COWPANY'   T",
 "JAMES B NUTTER AND COMPANY'   A/A/2007",
 "JAMES B NUTTER AND COMPANY'   AI /2007",
 'JAWES B NUTTER AND COWPANY',
 "JAMES B NUTTER AND COMPANY'   A",
 "JAMES B NUTTER AND COMPANY'  I YA007",
 'TJAMES B NUTTER AND COPANY',
 "VJAMES B NUTTER AND COMPANY'   A/AI 2007",
 'VJAMES B NUTTER AND COWPANY',
 'VJAMES B NUTTER AND COMPANY',
 "JAWES B NUTTER AND COWPANY'   87",
 "JAMES B NUTTER AND COMPANY'   A//2007",
 "JAMES B NUTTER AND COMPANY'   ATE Y2007  _S/2007    SF   ASSIGNED *] 5.94] 3) A RY YT YT   300",
 "JAMES B NUTTER AND COWPANY'   A/TA/2007",
 "JAMES B NUTTER AND COWPANY'   85",
 'TJAMES B NUTTER AND COWPANY',
 'TAMES B NUTTER AND COPANY',
 'JAMES B NUTTER AND COPANY',
 "JAMES B NUTTER AND COMPANY'   E/A /A007",
 "JAMES B NUTTER AND COWPANY'   A",
 "JAMES B NUTTER AND COMPANY'   /T",
 "JAMES B NUTTER AND COWPANY'   N",
 "VAMES B NUTTER AND

In [74]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains(' B ', na=False), 'lender'] = 'JAMES B NUTTER AND COMPANY'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
22204
After Operation: 
21767


### MSR ASSET VEHICLE LLC

In [75]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "R AS"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['MSR ASSET VEHICLE LLC',
 'WSR ASSET VEHICLE LLC',
 'WSR ASSET VENICLE LIC',
 "TSR ASSET VEHICLE LLC' 47",
 "MSR ASSET VEHICLE LLC'   A /2009",
 'WSR ASSET VENIGLE LIC',
 "MSR ASSET VEHICLE LLC'   A",
 "TSR ASSET VEHICLE LLC' 17",
 'MSR ASSET VEHICLE LC',
 "TSR ASSET VENTGLE LLC' 17",
 'ISR ASSET VENIGLE LLC',
 "TSR ASSET VEHICLE LLC' NT 72000",
 "L'MSR ASSET VEHICLE LLC",
 'MSR ASSET VEMICLE LLC',
 "L'MSR ASSET VEHICLE LLC'   A",
 'TSR ASSET VENTGLE LLC',
 'MSR ASSET VEWIGLE LIC',
 'MSR ASSET VEHICLE LUC',
 "MSR ASSET VEHICLE LLC' A8 /2009",
 "ISR ASSET VEHICLE LLC' 47",
 'SR ASSET VEWICLE LIC',
 'SR ASSET VIWICLE LUC',
 "MSR ASSET VENICLE LUC' __ 57",
 'WSR ASSET VEHICLE LIC',
 'TSR ASSET VENIGLE LLC',
 "MSR ASSET VEHICLE LLC' 822010] 9VAI/20I0",
 "MSR ASSET VEHICLE LLC'   Z/T",
 'SR ASSET VENICLE LUC',
 "SR ASSET VENIGLE LUC' T5200",
 'MSR ASSET VENIGLE LIC',
 "MSR ASSET VEHICLE LLC'   AFAPZOONOL",
 "MSR ASSET VEHICLE LUC'   5",
 "MSR ASSET VEHICLE LLC'   NF /2009",
 "MSR ASSET VEH

In [76]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('R AS', na=False), 'lender'] = 'MSR ASSET VEHICLE LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
21767
After Operation: 
18364


### FINANCE OF AMERICA REVERSE LLC

In [77]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "CA R"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['PINANCE OF AMERICA REVERSE LLC',
 'PINANCE OF AMERTCA REVERSE LLC',
 'FINANCE OF AMERICA REVERSE LLC',
 'RINANCE OF AMERTCA REVERSE LLC',
 'PINANCE OF AWERTCA REVERSE_LLC',
 'BINANCE OF AMERICA REVERSE LLC',
 'EINANCE OF AMERICA REVERSE LLC',
 'TRINANCE OF AMERICA REVERSE LLC',
 'FINANCE OF AMERTCA REVERSE LLC',
 'RINANCE OF AMERICA REVERSE LLC',
 'FINANCE OF AMERICA REVERSE_LLC',
 'TFINANCE OF AMERICA REVERSE LLC',
 'RINANCE OF AWERTCA REVERSE LLC',
 'RINANCE OF AWERTCA REVERSE_LLC',
 'RINANCE OF AWERTCA REVERSE_L1LC',
 'EINANCE OF AMPRICA REVERSE LLC',
 "PINANCE OF AMERICA REVERSE LLC'   T",
 'TBINANCE OF AMERICA REVERSE LLC',
 "FINANCE OF AMERICA REVERSE LIC'   G",
 'TFINANCE OF AWERTCA REVERSE LC',
 "PINANCE OF AMERICA REVERSE LLC'   AUA/ZONI  5",
 'FINANCE OF AMERICA REVERSE LIC',
 "FINANCE OF AMERICA REVERSE LLC'   A/ASYAONI",
 'TPINANCE OF AWERTCA REVERSE LLC)   A',
 "TRINANCE OF AMERICA REVERSE LLC'   A/AI/AONI",
 "PINANCE OF AMERICA REVERSE LLC'   A",
 "FINANCE OF AMERICA RE

In [78]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('CA R', na=False), 'lender'] = 'FINANCE OF AMERICA REVERSE LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
18364
After Operation: 
16333


### REVERSE MORTGAGE SOLUTIONS INC

In [79]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "GE SO"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['REVERSE MORTGAGE SOLUTIONS INC',
 "REVERSE MORTGAGE SOLUTIONS INC'   A",
 'TREVERSE MORTGAGE SOLUTIONS INC',
 "TREVERSE MORTGAGE SOLUTIONS INC'   A/Z",
 "REVERSE MORTGAGE SOLUTIONS INC'S",
 "REVERSE MORTGAGE SOLUTIONS INC'   A/2YZ0I3  EA/AOIS}_—   A]     ENDORSED '   IGO) 50A TT   2300",
 "REVERSE MORTGAGE SOLUTIONS INC'   8IZY20IA E/IS/AOIS}_—   TS]   ASSIGNED ‘  5.06) 5.06] OTT TH SZ0",
 "REVERSE MORTGAGE SOLUTIONS INC'   T",
 "REVERSE MORTGAGE SOLUTIONS INC'   @/TI/A0I3",
 'REVERSE MORTGAGE SOLUTIONS TINCT',
 "REVERSE MORTGAGE SOLUTIONS INC'   ZIA0IS  _S/E/AOIS}   SA]    TINDORSED   TOS} A6   ST AI S0",
 "REVERSE MORTGAGE SOLUTIONS INC'   /T/20I3  G",
 'REVERSE NORTGAGE SOLUTIONS INC',
 "REVERSE MORTGAGE SOLUTIONS INC'   6IGY2013",
 "REVERSE MORTGAGE SOLUTIONS INC'   A/TAY20IS",
 "REVERSE MORTGAGE SOLUTIONS INC'   R/AOIS",
 "REVERSE MORTGAGE SOLUTIONS INC'   8I20I3",
 'REVERSE MORTGAGE SOLUTIONS INCT',
 "REVERSE MORTGAGE SOLUTIONS INC'   /SIVAOIS",
 "REVERSE MORTGAGE SOLUTIONS INC

In [80]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('GE SO', na=False), 'lender'] = 'REVERSE MORTGAGE SOLUTIONS INC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
16333
After Operation: 
15783


### MSR_ASSET VEHICLE LLC

In [81]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('MSR_ASSET VEHICLE LLC', na=False), 'lender'] = 'MSR ASSET VEHICLE LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
15783
After Operation: 
15696


### WORLD ALLIANCE FINANCIAL CORP

In [82]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "D AL"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['WORLD ALLIANCE FINANCIAL COP',
 'WORLD ALLIANCE FINANCIAL CORP',
 'WORLD ALLTANCE PINANCIAL CORP',
 'TWORLD ALLTANCE FINANCIAL CORP,"   17',
 'WORLD ALLIANCE FINANCIAL CORP."   A/AIA007',
 'TWORLD ALLTANCE FINANCIAL CORP',
 'WORLD ALLIANCE FINANCIAL CRP',
 "WORLD ALLIANCE FINANCIAL CRP.'   N",
 'WORLD ALLIANCE FINANCIAL ORP',
 'WORLD ALLIANCE FINANCIAL CORP."   A',
 'WORLD ALLTANCE FINANCIAL CORP',
 'WORLD ALLIANCE FINANCIAL CORP."   N',
 "WORLD ALLIANCE FINANCIAL CORP.'   T",
 "WORLD ALLIANCE FINANCIAL CORP.'   TO/I",
 'WORLD ALLTANCE PINANCIAL CORP,"   U',
 "WORLD ALLIANCE FINANCIAL CORP.'   TO/TI2007",
 "WORLD ALLIANCE FINANCIAL CORP.' ___—   G",
 "WORLD ALLIANCE FINANCIAL CRP.'   /A/2007",
 "WORLD ALLIANCE FINANCIAL CORP.'   U",
 "WORLD ALLIANCE FINANCIAL CORP,'   S/I/2O07",
 'WORLD ALLIANCE FINANCIAL CORP."   A/A/2007 IZ/E',
 "WORLD ALLIANCE FINANCIAL CORP.'   Z/I",
 'WORLD ALLIANCE FINANCIAL CORP."   AZ/A',
 'WORLD ALLIANCE FINANCIAL CORP." __   US/2008',
 'WORLD ALLTANCE PINAN

In [83]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('D AL', na=False), 'lender'] = 'WORLD ALLIANCE FINANCIAL CORP'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
15696
After Operation: 
15538


### MUTUAL OF OMAHA MORTGAGE, INC

In [84]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "AHA M"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['MUTUAL. OF OMAHA MORTGAGE, INC',
 'MUTUAL OF OMAHA MORTGAGE, INC."   A/ZAYA0I6',
 "MUTUAL OF OMAHA MORTGAGE, INC.'   A/2AOI6",
 'MUTUAL OF OMAHA MORTGAGE, INC."   @/A4YA0I6',
 'MUTUAL OF OMAHA MORTGAGE, INC',
 "MUTUAL. OF OMAHA MORTGAGE, INC.'   AVTAYAOI7",
 "MUTUAL OF OMAHA MORTGAGE, INC.'   AVTZYA0I7",
 "MUTUAL OF OMAHA MORTGAGE, INC.'   A",
 'MUTUAL OF OMAHA MORTGAGE, INC."   A/AYE0I7',
 'MUTUAL OF OMAHA MORTGAGE, INC."   @/ZI/A0I7',
 "MUTUAL OF OMAHA MORTGAGE, INC.'  E /T20I7  7",
 'MUTUAL OF OMAHA MORTGAGE, INC.\'   Z/AGYAOI7 G/IS/2OI7 _    ]   ENDORSED "  A.9S6] 5.38] 3.25  \'NY TAH YT A S0',
 "MUTUAL OF OMAHA MORTGAGE, INC.'   TS /2017  TO/IS/AOIZ ]   TO]     ENDORSED '   5.06) 5.06] OFT TH 25",
 'MUTUAL OF OMAHA MORTGAGE, ING',
 "MUTUAL. OF OMAHA MORTGAGE, INC.'   T",
 "MUTUAL OF OMAHA MORTGAGE, INC.'   TO",
 "MUTUAL. OF OMAHA MORTGAGE, INC.'   AN/3GYAOI7",
 "IMUTUAL OF OMAHA MORTGAGE, INC.'   TO/A",
 'MUTUAL OF OMAHA MORTGAGE, INC."   A/AA/A0IS  O/I',
 "MUTUAL OF OMAHA MORTG

In [85]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('AHA M', na=False), 'lender'] = 'MUTUAL OF OMAHA MORTGAGE, INC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
15538
After Operation: 
15317


### LIVE WELL FINANCIAL INC

In [86]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "LIVE W"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['LIVE WELL FINANCIAL INC',
 "LIVE WELL FINANCIAL INC'   7",
 "TLIVE WELL FINANCIAL ING'   17",
 "LIVE WELL FINANCIAL INC'   AZ/4YA006",
 "LIVE WELL FINANCIAL INC'   /T",
 "LIVE WELL FINANCIAL INC'   AFAO07",
 "TLIVE WELL FINANCIAL INC'   A/I/2007",
 "LIVE WELL FINANCIAL INC'   A7 /2007",
 'TLIVE WELL FINANCIAL INC',
 "TLIVE WELL FINANCIAL INC'   ATA /A007",
 'LIVE WELL FINANCIAL IC',
 "TLIVE WELL FINANCIAL INC'   5A /A007",
 'LIVE WELL FINANCIAL INCÉ   A',
 'TLIVE WELL FINANCIAL INGT',
 "LIVE WELL FINANCIAL IC'   T",
 'LIVE WELL FINANCIAL INCH',
 "LIVE WELL FINANCIAL IC'   Z/AYZOO7  _A/G/ZOOS    EA]   ENDORSED ' {A 75] TA] TNT TAY   T530",
 "LIVE WELL FINANCIAL INC'   N/T",
 "LIVE WELL FINANCIAL INC'   T",
 "LIVE WELL FINANCIAL INC'   /2007",
 "LIVE WELL FINANCIAL INC'   AA/2 E007",
 "TLIVE WELL FINANCIAL INC'   A",
 "L'LIVE WELL FINANCIAL INC",
 'LIVE WELL FINANCIAL INCT   AZ/AYA007 /A',
 "LIVE WELL FINANCIAL INC' AZ",
 'TLIVE WELL FINANCIAL INCT   AP/AYEO07',
 "LIVE WELL FINANCIAL I

In [87]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('LIVE W', na=False), 'lender'] = 'LIVE WELL FINANCIAL INC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
15317
After Operation: 
14417


### COUNTRYWIDE BANK FSB

In [88]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "IDE BA"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['COUNTRYWIDE BANK FSB',
 "COUNTRYWIDE BANK FSB' 6",
 "COUNTRYWIDE BANK FSB'   9A /2007",
 "COUNTRYWIDE BANK FSB' ANF /2007",
 'NCOUNTRYWIDE BANK FSB',
 'NCOUNTRYWIDE BANK FSBY   T',
 'COUNTRYWIDE BANK FSBY',
 'VCOUNTRYWIDE BANK FSB',
 "COUNTRYWIDE BANK PSB'   GT FAZ007",
 'TCOUNTRYWIDE BANK FSB',
 'COUNTRYWIDE BANK PSB',
 "TCOUNTRYWIDE BANK FSB'   1071572007",
 "COUNTRYWIDE BANK FSB'   N",
 "COUNTRYWIDE BANK FSB'   T",
 "COUNTRYWIDE BANK FSB'   A7 A007",
 'TCOUNTRYWIDE BANK FSBT',
 "COUNTRYWIDE BANK FSB'   AA",
 "COUNTRYWIDE BANK FSB' TY /Z007  G",
 "COUNTRYWIDE BANK FSB' APF FOO07",
 "COUNTRYWIDE BANK FSB' AZZ YAO07",
 "COUNTRYWIDE BANK FSB'   /T",
 "COUNTRYWIDE BANK FSB'   AF A007",
 "COUNTRYWIDE BANK FSB' TZ/IZ/2007",
 'YCOUNTRYWIDE BANK FSB',
 "COUNTRYWIDE BANK FSB' A /2D/2008",
 "COUNTRYWIDE BANK FSB'   AFOO08",
 "TCOUNTRYWIDE BANK FSB' 17",
 "COUNTRYWIDE BANK FSB' AA FA008",
 "COUNTRYWIDE BANK FSB'   ASY2Z008",
 "VCOUNTRYWIDE BANK FSB'   T",
 "COUNTRYWIDE BANK FSB' 87",
 "COUNTR

In [89]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('IDE BA', na=False), 'lender'] = 'COUNTRYWIDE BANK FSB'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
14417
After Operation: 
14136


### DITECH MORTGAGE CORP

In [90]:
# Returns unique values that contain a particular string
# 1. Define your search variable
search_string = "CH M"

# 2. Filter the column and get unique values
# na=False handles rows with missing data so the code doesn't error out
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['DITECH MORTGAGE CORP',
 'DITCH MORTGAGE CORP',
 "PRTECH MORTGACE CORP'   AEZ0UN  G",
 "RTECH MORTGACE COP' 5 AZOUI",
 "DITECH MORTGAGE CORP'   9 /20NI",
 "DITECH MORTGAGE CORP'   TY2OIE  AFIE/AOIZ    88]   ASSIGNED @   5.06) 5.06] OUTS TE 75",
 "DITECH MORTGAGE COP' ATZYA0IE",
 "DITECH MORTGAGE COP'   AZZOUN  A/UI/AOIZ  AVIVEOZT  I]     TERMINATED'   AGO] AGO OT TH STOO",
 "DITECH MORTGAGE CORP' ___   AZO",
 'DETECH MORTGAGE CORP',
 "DITECH MORTGAGE CORP'   882012",
 "DITECH MORTGAGE CORP' AAN",
 'DITECH MORTGAGE COP',
 "DRTECH MORTGAGE CORP' 6A FO0I2",
 "DITECH MORTGAGE COP'   7A20I2",
 "DITECH MORTGAGE CORP'   82012",
 "DETECH MORTGAGE CORP' 9202012] AAIFEOREF  ] PA RNDORSED PE 7AF A SAFE FEE A T60000[T02720  A2",
 "DITECH MORTGAGE CORP'   T",
 "DITECH MORTGAGE CORP'   TAYZ0NZ",
 "DITECH MORTGAGE CORP'   942I] TO/IS/EOIZ   E]   ASSIGNED ‘  OOF AGO OTT TE 960",
 'DITECH MORTGAGE CORP\'__   G0A/20N2  TIFEO/ZOI2    OF   ASSIGNED "  4G9F AGO OFTHE 3570',
 "DITECH MORTGAGE CORP' AZAR] A

In [91]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('CH M', na=False), 'lender'] = 'DITECH MORTGAGE CORP'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
14136
After Operation: 
13860


### SUN WEST MORTGAGE CO INC

In [92]:
search_string = "UN WE"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['SUN WEST MORTGAGE CO_INC',
 'SUN WEST MORTGAGE CO_TNC',
 'SUN WEST MORTGAGE CO INC',
 "SUN WEST MORTGAGE CO INC' ZZ 2007",
 "SUN WEST MORTGAGE CO INC'   NA/AZA2007",
 "SUN WEST MORTGAGE CO INC'   /A2007  TI/EO/2007  IIVAO/EOR7  OZ]     TERMINATED'   A7] 589 TNT AY 50",
 "SUN WEST MORTGAGE CO INC'   APA A007",
 'SUN WEST ORTGAGE CO INC',
 "SUN WEST WORTGAGE COIN'   AAD YA008",
 'SUN WEST MORTGAGE CO TNC',
 "SUN WEST WORTGAGE COINC' 47",
 'SUN WEST NORTGAGE CO INC',
 'SUN WEST ORTGAGE COIN',
 'SUN WEST WORTGAGE CO INC',
 "SUN WEST WORTGAGE CO INC'   A",
 'SUN WEST MORTGAGE CONC',
 'SUN WEST MORTGAGE COINC',
 "SUN WEST MORTGAGE CO INC'   A",
 'SUN WEST WORTGAGE GO INC',
 'SUN WEST MORTGAGE GO INC',
 'SUN WEST WORTGAGE CONC\'   /2009  _AVA/Z0I0        ASSIGNED "  5.56] 556] OTH TE G600',
 "SUN WEST MORTGAGE CO INC' 67",
 "SUN WEST ORTGAGE CO INC'   A",
 'SUN WEST WORTGAGE COINC',
 'SUN WEST MORTGAGE CO_INCY',
 "SUN WEST MORTGAGE CO INC' 97",
 "SUN WEST WORTGAGE CO INC'   1073072000",
 "S

In [93]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains('UN WE', na=False), 'lender'] = 'SUN WEST MORTGAGE CO INC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
13860
After Operation: 
13481


### EVERBANK REVERSE MORTGAGE LLC

In [94]:
search_string = "VERB"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['TEVERBANK REVERSE WORTGAGE LC',
 'TEVERBANK REVERSE WORTGAGE LLC',
 'TBVERBANK REVERSE MORTGAGE LLC',
 'EVERBANK REVERSE WORTEAGE LC',
 'BVERBANK REVERSE MORTGAGE LLC',
 'EVERBANK REVERSE MORTGAGE LLC',
 'EVERBANK REVERSE MORTGAGE LIC',
 'LTEVERBANK REVERSE MORTGAGE LC',
 'TEVERBANK REVERSE MORTGAGE LLC',
 'TEVERBANK REVERSE MORTGAGE LIC',
 'TEVERBANK REVERSE MORTGAGE LUC',
 'EVERBANK REVERSE WORTGAGE LLC',
 "EVERBANK REVERSE MORTGAGE LLC'   ATI /A008",
 "TEVERBANK REVERSE WORTGAGE LLC'   17",
 "EVERBANK REVERSE MORTGAGE LUC'   N/T /2007",
 "TEVERBANK REVERSE MORTGAGE LLC'   ATAY2Z008",
 "TEVERBANK REVERSE WORTGAGE LLC'   N/5 2007",
 'EVERBANK REVERSE MORTGAGE LUC',
 "EVERBANK REVERSE MORTGAGE LUC'   N/AZY2007 AZ",
 "EVERBANK REVERSE WORTGAGE LLC'   UN/T",
 'LTBVERBANK REVERSE MORTGAGE LLC',
 "EVERBANK REVERSE MORTGAGE LUC'   //2007",
 "TEVERBANK REVERSE MORTGAGE LLC'   /A/2007",
 'EVERBANK REVERSE WORTGAGE LLC\' __—   TZ/2I/2007 I/EA/EOOS    2] 3] "ENDORSED § {AE} AE} TNT AYE 70',
 

In [95]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'EVERBANK REVERSE MORTGAGE LLC'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
13481
After Operation: 
13147


### WENDOVER FINANCIAL SERVICES CORP

In [96]:
search_string = "VER F"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['WENDOVER FINANCTAL SERVICES CORP',
 'WENDOVER FINANCIAL SERVICES CORP',
 "WENDOVER FINANCIAL SERVICES CORP'   TO",
 "WENDOVER FINANCIAL SERVICES CORP'   A/I",
 "WENDOVER FINANCIAL SERVICES CORP'   T",
 "WENDOVER FINANCIAL SERVICES CORP'   A",
 "WENDOVER FINANCTAL SERVICES CORP'   T",
 "WENDOVER FINANCIAL SERVICES CORP'   ATATGO7    A",
 "WENDOVER FINANCIAL SERVICES CORP'   A/2I/IG97",
 'WENDOVER FINANCIAL SERVICES COP',
 "WENDOVER FINANCTAL SERVICES CORP'   A",
 'TWENDOVER FINANCIAL SERVICES COP',
 "WENDOVER FINANCIAL SERVICES CORP'   /2I/IG97",
 "WENDOVER FINANCTAL SERVICES CORP'   UOV",
 "WENDOVER FINANCIAL SERVICES CORP'   //I997",
 "WENDOVER FINANCIAL SERVICES CORP'   UN/I",
 "WENDOVER FINANCIAL SERVICES COP'   1I",
 "WENDOVER FINANCIAL SERVICES CORP'   A/T",
 "WENDOVER FINANCTAL SERVICES CORP'   N",
 "WENDOVER FINANCIAL SERVICES CORP'   1Z",
 "WENDOVER FINANCTAL SERVICES CORP'   27",
 "WENDOVER FINANCIAL SERVICES CORP'   A/2I/I999",
 "WENDOVER FINANCIAL SERVICES COP'   17",
 "WE

In [97]:
print('Before Operation: ')
print(final_df['lender'].nunique(dropna=False))

# Use .loc to find rows where 'lender' contains 'GMFS' and update the 'lender' column
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'WENDOVER FINANCIAL SERVICES CORP'

print('After Operation: ')
print(final_df['lender'].nunique(dropna=False))

Before Operation: 
13147
After Operation: 
12983


### LONGBRIDGE FINANCIAL LLC

In [98]:
search_string = "GB"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['LONGBRIDGE PINANCTAL LLC',
 'LONGBRIDGE FINANCIAL LIC',
 "LONGBRIDGE FINANCIAL LIC'   AF /2016",
 'LONGBRIDGE FINANCIAL LUC',
 "TLONGBRIDGE FINANCIAL LUC'   A/T",
 'LONGBRIDGE FINANCIAL LIC\'   TOZIYAOI6  AVAS/ZOR7    TF   ENDORSED "  4.336] A 35] 75 \'Y TAH UY\'   GS0',
 "LONGBRIDGE FINANCIAL LUC' 87",
 "LONGBRIDGE FINANCIAL LIC'   AS /Y20I6] 2VAI/AOI7 _   TF   ENDORSED *] AGO A9} ATH TT A050",
 "LONGBRIDGE FINANCIAL LIC'   N",
 'TLONGBRIDGE FINANCIAL LLC',
 "LONGBRIDGE FINANCIAL LUC'   N",
 'LONGBRIDGE FINANCIAL LLC',
 "TLONGBRIDGE FINANCIAL LLC'   N/A 72016] A/SI/AORR{ RAF INDORSED TY A S7AF SOF FIN RA AYE 75",
 'LONGBRTDGE PINANCTAL LLC',
 "TLONGBRIDGE FINANCIAL LLC'   AA /A0I7   A/IO/Z0I7 A",
 "LONGBRIDGE FINANCIAL LLC'   A",
 "LONGBRIDGE FINANCIAL LUC'   A/Z",
 "LONGBRIDGE PINANCTAL LLC'   142016",
 'LONGBRIDGE PINANCTAL LIC',
 'TLONGBRIDGE FINANCIAL LUC',
 'LONGBRIDGE FINANCIAL LLC\'   8 FA0N7   S/IO/AOI7    TAF NDORSED "  AAAN 5.005[ 625 NY TA IY AS00',
 'TLONGBRIDGE PINANCTA

In [99]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'LONGBRIDGE FINANCIAL LLC'
print(final_df['lender'].nunique(dropna=False))

12983
12559


### HIGH TECH LENDING INC

In [100]:
search_string = "H L"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['HIGH TECH LENDING 1NC',
 'HIGH TECH LENDING INC',
 'THIGH TECH LENDING INC',
 "THIGH TECH LENDING INC' AZ/IGYA0I6",
 "GH TECH LENDING INC' AA/ZAOIE A/AI/EOUZ  EA]   BNDORSED 1] A 52] A T05[ 75 FTNT RAT TAY   2250",
 "IG TECH LENDING INC' AZZZOIS",
 "THIGH TECH LENDING INC'   AZAYOOUS AAI/ZOIG        ENDORSED *] 52} AF ATH TAT YT   P200",
 'WIIG TECH LENDING INC',
 "INIGIL TECH LENDING INC' 8282019} TO/IO/ZOIPF   OO]     TINDORSED 1] AZO) 3.23 ZS TN TAT TAY   ZS000  SS00",
 'THIGH THCH LENDING INC',
 "HIGH TECH LENDING INC' G",
 'HIGH TECH LENDING INCY',
 'HIGH TECH LENDING TNC',
 'THIGH TRCH LENDING INC',
 "THIGH TECH LENDING INC' A/AIOOIZ A/II/EOIS",
 "THIGH TECH LENDING INC' N/A",
 'TIIGH TECH LENDING INC',
 "THIGH TECH LENDING INC' AZ/ZZYA0NA",
 "THIGH TECH LENDING INC'   AZ/AOOIAL",
 'WG TECH LENDING INC',
 "HGH TECH LENDING INC' N",
 "HIGH TECH LENDING INC' TYZ0IS",
 "WLGH TECH LENDING INC' ZZYA0IS",
 "THIGH TECH LENDING INC'   A",
 'WWGH TECH LENDING INC',
 'GH TECH LENDING INC

In [101]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'HIGH TECH LENDING INC'
print(final_df['lender'].nunique(dropna=False))

12559
12170


### OPEN MORTGAGE LLC

In [102]:
search_string = "EN MO"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['OPEN MORTGAGE LLC',
 'YOPEN MORTGAGE LIC',
 "OPEN MOWTGAGE LIC' OT ZNI9  TOVEA/EOIOF  ] ENDORSED 1] 3.690) AS] 75 F?NÉ RAT AY   E250",
 "YOPEN MOTGAGE LIC'   ATAZ0IS",
 'YOPEN MORTEAGE LIC',
 'YOPEN MOWTGAGE LIC',
 "OPEN MORTGAGE LIC' 7462015",
 'TOPEN MORTGAGE LLC) RZ 2015',
 "YOPEN MOTGAGE LIC' TT Z0IS",
 "YOPEN MORTEAGE LIC' SAZ",
 "YOPEN MOWTGAGE LIC'   ATTZ0I6  E/ZOUS   ] 3]   ENDORSED 1] A ZIS} A] TT RAT TAY   S980",
 'OPEN MORTGAGE LIC',
 "YOPEN MORTEAGE LIC'   T/OYZ0I7",
 'TOPEN MORTGAGE LLC',
 "YOPEN MORTEAGE LIC'   AZ/AZOIS  I/AI/ZOI9  GVI/ZOZ0  5]     TERMINATED'   5.495] 5.475 75  TNÉ TAT TT AY   3350",
 "YOPEN MORTGAGE LIC' 22020",
 'OPEN MOWTGAGE LIC',
 'YOPEN MOTGAGE LIC',
 "OPEN MOWTGAGE LUC' 8132020] R/E/ZOZOF  ] 83]  TNDORSED   3.266) 72] TY RAT TAY   T5",
 'OPEN MOWTGAGE LUC',
 "YOPEN MORTGAGE LIC' 87",
 "YOPEN MORTGAGE LIC'   T",
 "YOPEN MOTGAGE LIC' 07",
 "YOPEN MOWTGAGE LIC' _— 87",
 'YOPEN MOTEAGE LIC',
 "YOPEN MORTEAGE LIC'   8T",
 "OPEN MORTGAGE LIC'   AZ/20I

In [103]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'OPEN MORTGAGE LLC'
print(final_df['lender'].nunique(dropna=False))

12170
12124


### CHERRY CREEK MORTGAGE, LLC

In [104]:
search_string = "RY C"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['CHERRY CREEK MORTGAGE, LLC',
 'CHERRY CREEK MORTGAGE, LLC V/I/2010',
 "CHERRY CREEK MORTGAGE, LLC'   N",
 'CHERRY CREEK WORTGAGE, LLC',
 'CHERRY CREEK MOWTEAGE, LUC',
 "CHERRY CREEK MORTGAGE, LLC'   A",
 'CHERRY CREEK MORTGAGE, LUC',
 'TCHERRY CREEK WORTGAGE, LUC',
 'CHERRY CREEK WORTGAGE, LUC',
 'TCHERRY CREEK WORTGAGE, LC',
 "CHERRY CREEK MORTGAGE, LLC'   Z/Z 2014 /AI/2OI4",
 "CHERRY CREEK MORTGAGE, LUC'   T29Y2OIA",
 'CHBRRY CREEK WORTGAGE, LLC',
 "TCHERRY CREEK WORTGAGE, LUC'   TAA0IS",
 "CHERRY CREEK MORTGAGE, LLC'   A/Z/20IS",
 'TCHBRRY CREEK WORTGAGE, LC',
 "CHERRY CREEK MORTGAGE, LLC'   T/T 2012",
 'TCHERRY CREEK WORTGAGE, LLC',
 "CHERRY CREEK WORTGAGE, LLC'   UY20I3  _S",
 "TCHERRY CREEK WORTGAGE, LLC' 97",
 "CHERRY CREEK WORTGAGE, LLC'   7152013",
 "CHERRY CREEK MORTGAGE, LUC'   TO/T",
 "CUERRY CREEK WORTGAGE, LUC'   TO/3I20IS  I",
 "CHERRY CREEK MORTGAGE, LLC'   T",
 "CHERRY CREEK MORTGAGE, LLC'   AN/A",
 "CHERRY CREEK WORTEAGE, LLC'   ISZYZOIA",
 "CHERRY CREEK MORTGAGE, L

In [105]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'CHERRY CREEK MORTGAGE, LLC'
print(final_df['lender'].nunique(dropna=False))

12124
11806


### JAMES B_NUTTER AND COMPANY

In [106]:
search_string = "B_N"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['JAMES B_NUTTER AND COWPANY',
 'TJAMES B_NUTTER AND COPANY',
 "TAMES B_NUTTER AND COWPANY'   27",
 'TJAMES B_NUTTER AND COWPANY',
 'TAMES B_NUTTER AND COWPANY',
 'JAMES B_NUTTER AND COMPANY',
 "TJAMES B_NUTTER AND COPANY'   1073072007",
 "JAMES B_NUTTER AND COWPANY'   T2008",
 'TJAMES B_NUTTER AND COPANY   17',
 'JAWES B_NUTTER AND COWPANY',
 'JAMES B_NUTTER_AND COMPANY',
 'JAMES B_NUPTER AND COMPANY',
 'IAMES B_NUTTER AND COMPANY',
 'HAMES B_NUTTER AND COMPANY',
 'AMES B_NUTTER AND COMPANY',
 'VJAMES B_NUTTER AND COWPANY',
 "TAMES B_NUTTER AND COWPANY'   10",
 "TJAMES B_NUTTER AND COWPANY'   87",
 "TJAMES B_NUTTER AND COWPANY'   1172572008",
 "JAMES B_NUTTER AND COWPANY'   6",
 "TAMES B_NUTTER AND COWPANY'   47",
 'TJAMES B_NUTTER AND COPANY   1171372008] I']

In [107]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'JAMES B NUTTER AND COMPANY'
print(final_df['lender'].nunique(dropna=False))

11806
11784


### NATIONWIDE EQUITIES CORPORATION

In [108]:
search_string = "DE E"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

["NATLONNIDE EQUITIES CORPORATION'   6282013",
 'NATIONWIDE EQUITIES CORPORATION',
 "NATIONWIDE EQUITIES CORPORATION'   AP/ZAYAOIS A/I",
 "NATIONWIDE EQUITIES CORPORATION'   A",
 'TNATIONWIDE EQUITIES CORPORATION',
 'NATIONNIDE EQUITIES CORPORATION',
 "NATIONWIDE EQUITIES CORPORATION'   A/A",
 "NATIONWIDE EQUITIES CORPORATION'   T",
 "NATIONWIDE EQUITIES CORPORATION'   TI/TOY2016  I",
 "NATIONWIDE EQUITIES CORPORATION'   23Y20I7 S/IE/AOUR   E]     TNDORSED {A TONF ASA] ST RAT AYE S230",
 "NATIONWIDE EQUITIES CONPORATION'   A/2IZOI9  IZ/AE/AOIG}   SA]   ENDORSED 1] ATER] A7] TNT RAT AY   G060",
 "NATIONWIDE EQUITIES CONPORATION'   A",
 "NATIONWIDE EQUITIES CORPORATION'   A/2AY20I6  3AI/ZOIE    GIF   ENDORSED *   3. G06] A6] SR) TAH YT 625500) ASS957",
 "NATIONWIDE EQUITIES CORPORATION'   N0UY20I7] AI/E/AOUZ  5]   BNDORSED @] 5.313) 5.03 8 FE) RAT AY   S620",
 "TNATIONWIDE EQUITIES CORPORATION'   A/T",
 'NATIONWIDE EQUITIES CORORATION',
 "NATIONWIDE EQUITIES CORPORATION'   TI/TSY20IS I/A

In [109]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'NATIONWIDE EQUITIES CORPORATION'
print(final_df['lender'].nunique(dropna=False))

11784
11430


### DITECH

In [110]:
search_string = "DITE"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['DITECH MORTGAGE CORP',
 'DITECH WORTGAGE CORP',
 "DITECH NORTGAGE CORP'   RT 2ZOUI  G/AI/2OI1  SVIZ/AOIE] SA]     TERMINATED'   5.06] 5.06] YH TT 56500) IS027",
 "DITECH NORTGAGE CORP' ATAU",
 "DITECH NORTGAGE CORP' __   AZ/20I2",
 'DITECH NORTGAGE CORP',
 "DITECH WORTGAGE CORP'   A/NDFO0IZ",
 "DITECH WORTGAGE CORP' 5A FO0IZ",
 "DITECH NORTGAGE CORP'   GTYAMRZ",
 "DITECH NORTGAGE COP' 812012",
 "DITECH WORTGAGE COR'   T",
 "DITECH NORTGAGE CORP'  G 2012",
 "DITECH NORTGAGE COR'   OZIAOIE  AI/IS/EOUZ   ] 8] ASSIGNED § {A 75] A5] OFT TE 350",
 "DITECH NORTGAGE CORP'   FYAMUZ IVE/AOIA _—       ASSIGNED *] 5.06] 5.06] ATH TT Z95",
 "DITECH NORTGAGE CORP' A",
 'DITECH NORTGAGE COP',
 'DITECH WORTGAGE CORP\'   A/ZOY2012 I/EAZOI3    GF   ASSIGNED "  4.G9F AGO OTH TE G50',
 "DITECH NORTGAGE CORP'__   T",
 "DITECH WORTGAGE CORP' AA /A",
 "DITECH NORTGAGE CORP' __—   ZZYAOUZ A/AO/ZOI3}_       ASSIGNED *   475] A5] ATH TT 300000) AS75",
 "DITECH WORTGAGE CORP' AA/AOOIZ   AV",
 'DITECH NORTGAGE 

In [111]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'DITECH MORTGAGE CORP'
print(final_df['lender'].nunique(dropna=False))

11430
11268


### SEATTLE

In [112]:
search_string = "EATT"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['SEATTLE MORTGAGE COMPANY',
 'SEATTLE WORTGAGE COWPANY',
 'SEATTLE NORTGAGE CONPANY',
 'SEATTLE NORTGAGE COWPANY',
 'SEATTLE NORTGAGE CONANY',
 'SEATTLE NORTGAGE COPAY',
 'SEATTLE NORTGAGE COMPANY',
 'TSEATTLE WORTGAGE COMPANY',
 "SEATTLE NORTGAGE COWPANY'   AT /2006",
 'SEATTLE WORTGAGE CONPANY',
 "SEATTLE NORTGAGE COWPANY'   T",
 "SEATTLE WORTGAGE CONPANY'   AT",
 'SEATTLE NORTGAGE COWANY',
 "SEATTLE NORTGAGE COMPANY'   8",
 "SEATTLE NORTGAGE COWPANY'   ATAZ/TG95",
 "SEATTLE WORTGAGE COWPANY' 85",
 "SEATTLE NORTGAGE COWPANY'   TGVTGO5    TVAIFIG95  GE]   TERMINATED'   A] TA] 05 TNT TAY   TS0",
 "SEATTLE WORTGAGE COWPANY'   97",
 "SEATTLE NORTGAGE COWPANY'   27",
 "SEATTLE WORTGAGE COWPANY' 87",
 "SEATTLE NORTGAGE COWPANY'   97",
 "SEATTLE WORTGAGE COWPANY' 27",
 "TSEATTLE WORTGAGE COMPANY'   27A /TGO7 A/AS/AGOR",
 "SEATTLE NORTGAGE CONPANY'   T",
 "SEATTLE WORTGAGE CONANY'   A//TGO7",
 "SEATTLE NORTGAGE COWPANY'   T/TGO7",
 "SEATTLE WORTGAGE COWPANY'   8",
 "SEATTLE WORTGAGE COWPANY

In [113]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'SEATTLE MORTGAGE COMPANY'
print(final_df['lender'].nunique(dropna=False))

11268
11142


### FINANCIAL FREEDOM ACQUISITION LLC

In [114]:
search_string = "M AC"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['BINANCIAL FREEDOM ACQUISITION LLC',
 'PINANCTAL FREEDOM ACQUISITION LLC',
 'PINANCIAL FREEDOM ACQUISITION LLC',
 'TRINANCTAL FREEDOM ACQUISITION LLC',
 'FINANCIAL FREEDOM ACQUISITION LLC',
 'FINANCTAL FREEDOM ACQUISITION LLC',
 'RINANCTAL FREEDOM ACQUISITION LLC',
 'TPINANCIAL FREEDOM ACQUISTTION LLC',
 'TFINANCIAL FREEDOM ACQUISITION LLC',
 'TPINANCTAL FREEDOM ACQUISITION LLC',
 'TBINANCTAL FREEDOM ACQUISITION LLC',
 'TRINANCIAL FREEDOM ACQUISITION LLC',
 'TFINANCTAL FREEDOM ACQUISITION LLC',
 'TFINANCIAL FREEDOM ACQUISTTION LLC',
 "TRINANCTAL FREEDOM ACQUISITION LLC'   ASF 2009",
 "FINANCIAL FREEDOM ACQUISITION LLC'   TS",
 "FINANCIAL FREEDOM ACQUISITION LLC'   TA/ISY2009",
 "FINANCIAL FREEDOM ACQUISITION LLC'   TO/AI/2009",
 'TPINANCTAL FREEDOM ACQUISITION LUC',
 "TRINANCTAL FREEDOM ACQUISITION LLC'   1 /A/2009",
 "TFINANCIAL FREEDOM ACQUISITION LLC'   T",
 "FINANCIAL FREEDOM ACQUISITION LLC'   TA/T",
 "TRINANCTAL FREEDOM ACQUISITION LLC'   A",
 "FINANCIAL FREEDOM ACQUISITION LLC'

In [115]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'FINANCIAL FREEDOM ACQUISITION LLC'
print(final_df['lender'].nunique(dropna=False))

11142
10937


### UNITY MORTGAGE CORP

In [116]:
search_string = "NITY W"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['UNITY WORTGAGE COR',
 "UNITY WORTGAGE CORE' 6",
 'UNITY WORTGAGE CORR',
 'UNITY WORTGAGE CORP',
 'UNITY WORTGAGE COPY',
 'UNITY WORTÉAGE CORP',
 'TUNITY WORTGAGE COP',
 "UNITY WORTGAGE CORP' 7",
 'UNITY WORTGAGE COPY 0',
 'TUNITY WORTGAGE CORP',
 "UNITY WORTGAGE CORP' 7S /TGO7  AVA/IGOS",
 'UNITY WORTGAGE CORE',
 "UNITY WORTGAGE CORE'   AVT/T998",
 "UNITY WORTGAGE CORP' A5 1999] S/O/I999  67",
 'UNITY WORTÉAGE CORR',
 'UNITY WORTÉNGE CORP"   Z713T999',
 'UNITY WORTGAGE CORP™   A1 /2000',
 'TUNITY WORTGAGE CORR',
 'TUNITY WORTGAGE CORPT   NI /TG05',
 "UNITY WORTGAGE CORP'   A",
 "UNITY WORTGAGE CORP' AVIS/IG06  A/UI/AGO7",
 'UNITY WORTGAGE COP',
 'TUNITY WORTGAGE CORPY 97',
 'UNITY WORTGNGE CORP',
 "UNITY WORTGAGE CORP'   A/IIGOO",
 "UNITY WORTGAGE COR'   I",
 "UNITY WORTGAGE CORR'   ATAZ/TG99",
 "UNITY WORTGAGE CORP'   87",
 "TUNITY WORTGAGE CORP'   Z/T",
 'TUNITY WORTGAGE CORPY',
 'UNITY WORTÉNGE CORP',
 "UNITY WORTGAGE CORR' 87",
 "TUNITY WORTGAGE CORP' 27",
 'TUNITY WORTGAGE CORP™

In [117]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'UNITY MORTGAGE CORP'
print(final_df['lender'].nunique(dropna=False))

10937
10689


### PU MORTGAGE

In [118]:
search_string = "PH M"
list(final_df[final_df['lender'].str.contains(search_string, na=False)]['lender'].unique())

['PH MORTGAGE CORPORATION',
 'IPH MORTGAGE CORPORATION',
 "PH MORTGAGE CORPORATION'   A/T7 72020",
 "PH MORTGAGE CORPORATION'   TZY2020",
 'PH MORTGAGE CONPORATION',
 "PH MORTGAGE CORPORATION' __—   ZVOYAO20  A/IG/ZOZI     ]   ENDORSED *] 2] 92} ATH YT YT 635000) AAAI7O] 20",
 "PH MORTGAGE CORPORATION'   GT",
 "PH MORTGAGE CORPORATION'   A",
 'PH MORTGAGE CORPORATION\'   GIGY2020  OV/IS/2ON0}   TF   ENDORSED "  2.961   SR) TAH IY\'   A850',
 "PH MORTGAGE CORPORATION'   AA /TNY2020",
 "PH MORTGAGE CORPORATION'   A72I/AOAI",
 "PH MORTGAGE CORPORATION'   T",
 'PH MORTGAGE CORPORATION\'   TO2IYAZ020] TI/IZ/2020)   TAF     ENDORSED "  3.625] A5] OR) TT G500',
 "PH MORTGAGE CORPORATION'   A /TOY2Z020",
 "PH MORTGAGE CORPORATION'   TOZIYZ020  1I/I",
 'PH MORTGAGE CORPORATION\'   AUGYAZOZI  2VAE/AOAI    OF    ENDORSED "  2.205] 895] 75  \'RY TA) UY\'   ZE00',
 "PH MORTGAGE CORPORATION'   AZVISYAO20",
 "PH MORTGAGE CORPORATION'   A7A/A0AN",
 "PH MORTGAGE CORPORATION'   77",
 "PH MORTGAGE CORPOR

In [119]:
print(final_df['lender'].nunique(dropna=False))
final_df.loc[final_df['lender'].str.contains(search_string, na=False), 'lender'] = 'PU MORTGAGE CORPORATION'
print(final_df['lender'].nunique(dropna=False))

10689
10664


In [120]:
# Returns a Series where the index is the Lender and the value is the count
final_df['lender'].value_counts()

lender
FINANCIAL FREEDOM SENIOR FUNDING CORP                                                                                               142601
WELLS FARGO BANK NA                                                                                                                 133125
AMERICAN ADVISORS GROUP                                                                                                              84100
FINANCE OF AMERICA REVERSE LLC                                                                                                       64622
MSR ASSET VEHICLE LLC                                                                                                                50107
BANK OF AMERICA NA CHARLOTTE                                                                                                         41651
METLIFE BANK, NATIONAL ASSOCIATION                                                                                                   39889
SEATTLE MORTGAGE COM

## clsing_dt

In [121]:
# Create a filter for NaN values in clsing_dt
mask_nan_date = final_df['clsing_dt'].isna()

# Display only the two columns for those specific rows
final_df.loc[mask_nan_date, ['source_pdf', 'row_in_pdf', 'raw_text', 'lender','clsing_dt']].head(70)

Unnamed: 0,source_pdf,row_in_pdf,raw_text,lender,clsing_dt
234,1,1,"""Lender"" ""clsng dt’ ‘endrsmnt_dt' trmtn_dt' ‘Horr Age’ 'Coborr_Age’ ‘Horr Cnt' ‘cs status’ ‘int _rt' ‘int_rt lyr’ ‘hecm margin’ ‘pd_strmln fig’ 'rt_typ' ‘arm indx typ’ ‘arm prde typ’ ""max clm amt’ “init prncpl_Imt’ ‘hecm orgntn fees’ ""prop _addr_zip_cd’ loan_typ",,
315,10002,4,"[TWELLS FARGO BANK NAT | 72172006] 12/2072006[ ea] || tindorsed TP 653) fin aye 355000 testa [0 [22407 ""02""",WELLS FARGO BANK NA,
318,10002,7,"[TFIRST MARINER BANK’ | t/t0/2006| 29/zoo7| | 3] | | Endorsed ' | 5] a] tt ay t90000/ tistso| of '235is ""02""",TFIRST MARINER BANK' T/T,
324,10002,13,"[TIDEWATER WOME FUNDING’ | 9/2006] ti/a7/2006| | | | | Endorsed *] 65] a] sth yh yt 96500) togs4z.5/ o|'23703 02""",TIDEWATER WOME FUNDING,
340,10002,29,"['WELLS FARGO BANK NAY | 22/2007] 2/uevaoor|] | 3] | Assigned ‘| 6.53) 28] tnt wt tay | 52z000| ossaa| of '23tea ""02""",WELLS FARGO BANK NA,
353,10002,42,"[WELLS FARGO BANK NAT apa ao06|a/i/ao07| | wat a Rndorsed ""| 645] 99 tnt tw rays a20000[azgodo[ a f'23456 03""",WELLS FARGO BANK NA,
356,10002,45,"['WELLS FARGO BANK NAY 2715/2006] a/ig/aoor] | ta] ef |tndorsed § {at 5.98] es tnt ay tza000] eres] io f'23513 ""02""",WELLS FARGO BANK NA,
363,10002,52,"[WELLS FARGO BANK NAT a /eda006| 12/ai/2006| | ts] | | Assigned *]és3f Pe teh way teso00 | ta4o7a[ of '2a572 02""",WELLS FARGO BANK NA,
365,10002,54,"['WELLS FARGO BANK NAY as fo007|3/ia/z007| | tf Assigned “| bor] ee] anh way 0000062400 of'2a701 fon""",WELLS FARGO BANK NA,
384,10001,7,"[U.S FINANCIAL MORTGAGE Corporation’ | 12/2007] 2/ai/aoor|_ | to] | Assigned 1] 6.48) 5.99 tnt tay 57000[ tazez] of '23701 ""02""",U.S FINANCIAL MORTGAGE CORPORATION,


In [122]:
# Assuming df is your combined dataframe from all PDFs
# 1. Group by the source filename
# 2. Sum the occurrences where 'clsing_dt' is null
nan_counts = final_df[final_df['clsing_dt'].isna()].groupby('source_pdf').size()

# 3. Sort in descending order
sorted_nan_counts = nan_counts.sort_values(ascending=False)

sorted_nan_counts

source_pdf
17257    154
17256    154
17258    153
10346     59
6137      56
6138      55
10321     55
6223      54
10952     54
6402      54
6101      53
10322     53
6398      53
10324     53
11150     53
5737      52
6104      52
5779      52
6017      52
10949     52
5969      52
6392      52
5519      52
6121      52
10330     52
10211     52
10196     51
5760      51
10950     51
5740      51
6486      51
10220     51
10994     51
5561      50
6408      50
6123      50
6015      50
6403      50
5761      50
5739      50
5738      50
6238      50
10900     49
10188     49
6482      49
5991      49
5762      49
6102      49
5763      49
10245     49
11145     49
10335     49
10337     49
11155     48
10948     48
11191     48
10189     48
6085      48
6016      48
6019      48
5981      48
5746      48
6635      48
10323     48
6122      48
5781      48
10195     48
10197     48
5972      48
6564      48
6640      48
11157     47
11151     47
6489      47
10248     47
5521      47
5

In [123]:
# Sort by the character count of the 'clsing_dt' column
# na_position='last' ensures any NaN values stay at the bottom
sorted_df = final_df.sort_values(
    by='clsing_dt', 
    key=lambda x: x.str.len(), 
    ascending=False, 
    na_position='last'
)

# Display only the relevant columns for the top 10 longest strings
sorted_df[['raw_text', 'clsing_dt']].head(10)

Unnamed: 0,raw_text,clsing_dt
8,"ONE REVERSE MORTGAGE LLC’ 12/22/2012 1/31/2013 62 1'Endorsed _' 4.99 4.99 o'N' uo 160000 99040 2500 ""35053"" ""02""",12/22/2012
762495,"AMERICAN ADVISORS GROUP"" 10/23/2019 11/25/2019 69 65 2 ‘Endorsed ' 3.617 3.31 1.63. 'N' zy Lt ""y"" 390000 213330 0 ""32837 *02""",10/23/2019
762498,"""MSR ASSET VEHICLE LLC"" 11/25/2019 2/5/2020 80 81 2 ‘Endorsed 4.336 4.145 2.375 'N"" a 1 ny"" 515000 302820 1995 ""32963 ""02""",11/25/2019
762467,"""AMERICAN ADVISORS GROUP"" 10/23/2019 11/19/2019 72 72 2 ‘Endorsed 5,237 4.93 3.25. 'N"" zy 1 ny 170000 80580 3400_'32935 ""02""",10/23/2019
762528,"""MCM HOLDINGS INC’ 10/11/2019 3/13/2020 2 1 ""Endorsed 3.728 3. 385 1.875 'N’ "" 1 ny! 220000 123860 4200 ""32962! ""02""",10/11/2019
762531,"""AMERICAN ADVISORS GROUP"" 10/28/2019 11/25/2019 u 69 2 ‘Endorsed ' 3.487 3.18 1.5 'N vy Lt “y 245000 141120 4450 ""32835 ""02""",10/28/2019
762547,"""AMERICAN ADVISORS GROUP"" 10/11/2019 11/7/2019 80 1 ‘Endorsed 5,653 5.31 3.8 'N' vy LI “y 129000 67338 2580 ""32114 | *02""",10/11/2019
762549,"""MIDWEST LOAN SOLUTIONS INC 10/21/2019 12/20/2019 79 ia 2 ""Endorsed _' 4.456 4.03 2.5 'N' zy i ny! 185000 103970 3700 ""32822 ""02""",10/21/2019
762551,"""AMERICAN ADVISORS GROUP"" 10/23/2019 11/26/2019 69 1 ‘Endorsed 5. 367 5.06 3.36 'N' zy I ny"" 168000 T7448 3360 ""32724! ""02""",10/23/2019
762258,"""BANK OF AMERICA NA CHARLOTTE” 12/11/2009 1/22/2010 62 64 2 ‘Assigned 5.56 5.56 o IN’ rt ut uo 350000 221550 5500 ""85338 | ""02""",12/11/2009


## arm_prdc_typ

In [124]:
list(final_df['arm_prdc_typ'].unique())

[nan,
 '6A',
 '08P',
 '06P',
 '68P',
 '2B',
 '66P',
 '7P',
 '3P',
 '5P',
 '6P',
 '2N',
 '65P',
 '67P',
 '30N',
 '260000F',
 '1K',
 '20N',
 '5X',
 '8L',
 '28P',
 '36P',
 '5A',
 '27K',
 '25P',
 '53P',
 '7K',
 '8P',
 '8A',
 '7T',
 '2K',
 '1H',
 '9A',
 '7R',
 '3N',
 '9S',
 '77P',
 '1L',
 '390000T',
 '0N',
 '4L',
 '87P',
 '85P',
 '8G',
 '83P',
 '75P',
 '86P',
 '73P',
 '2P',
 '72P',
 '96P',
 '05P',
 '6B',
 '88P',
 '79P',
 '8B',
 '562P',
 '625P',
 '39P',
 '300000T',
 '35P',
 '1T',
 '9P',
 '3H',
 '1A',
 '7A',
 '7M',
 '7L',
 '58P',
 '8S',
 '2A',
 '63P',
 '603P',
 '33P',
 '98P',
 '55P',
 '62P',
 '4A',
 '2H',
 '3L',
 '20H',
 '5S',
 '30H',
 '201T',
 '1G',
 '15H',
 '1B',
 '5L',
 '199T',
 '3A',
 '2T',
 '1M',
 '7B',
 '3B',
 '1N',
 '3S',
 '20K',
 '9561S',
 '31H',
 '9171L',
 '22H',
 '3K',
 '25H',
 '21H',
 '0L',
 '201L',
 '5R',
 '7111L',
 '2181L',
 '1O',
 '20R',
 '1I',
 '74T',
 '3M',
 '6411L',
 '12N',
 '7V',
 '1S',
 '2735S',
 '7S',
 '15N',
 '16N',
 '4877S',
 '80S',
 '8M',
 '2132S',
 '868A',
 '4Q',
 '6S'

In [126]:
final_df[['source_pdf', 'row_in_pdf', 'raw_text', 'arm_indx_typ', 'arm_prdc_typ', 'max_clm_amt']].head(20)

Unnamed: 0,source_pdf,row_in_pdf,raw_text,arm_indx_typ,arm_prdc_typ,max_clm_amt
0,100,1,"""GMFS LLC"" 12/31/2012 2/25/2013 2/16/2017 64 1 “Terminated 5.06 5.06 oR uo 117000 74529 0 ‘36108 *02""",,,117000.0
1,100,2,"""ONE REVERSE MORTGAGE LLC’ 12/15/2012 1/25/2013 23 73 2 ‘Assigned 4.5 4.5 o'N’ r uo uo 200000 136200 o ""36312 ""02""",R,,200000.0
2,100,3,"""SUN WEST MORTGAGE CO_INC’ 1/11/2013, 3/4/2013 ca 1 ‘Assigned 5.3 5.3 oN i i a 130000 86710 0 '36571 ""02""",A,,130000.0
3,100,4,"“MSR_ASSET VEHICLE LLC’ 2/20/2013 7/25/2013 76 1 ‘Assigned 5.06 5.06 o IN’ Fr uo 300000 207900 5000 ""36078 ""02""",,,300000.0
4,100,5,"""SUN WEST MORTGAGE CO_TNC’ 1/26/2013 4/22/2013 2 1_‘Endorsed 4,99 4.99 oN’ - — uo 240000 161040 2500 ""35966 ""02""",,,240000.0
5,100,6,"""AMERICAN ADVISORS GROUP"" 1/31/2013 3/27/2013 66 74 2 ‘Assigned 5.06 5.06 oN’ Fr uo 122000 74 2500 ""36544 *02""",,,122000.0
6,100,7,"""MSR ASSET VEHICLE LLC"" 2/1/2013 3/1/2013 15 1 ‘Endorsed 5.06 5.06 oN Fr a oo 150000 103350 3000 ""35222 ""02""",,,150000.0
7,100,8,"""DITECH MORTGAGE CORP"" 1/31/2013 3/14/2013 79 1 ""Assigned ' 3.99 3.99 oN’ r uo 155000 109430 3100 ""36207 | ""02""",,,155000.0
8,100,9,"ONE REVERSE MORTGAGE LLC’ 12/22/2012 1/31/2013 62 1'Endorsed _' 4.99 4.99 o'N' uo 160000 99040 2500 ""35053"" ""02""",,,160000.0
9,100,10,"""ONE REVERSE MORTGAGE LLC’ 1/25/2013 2/25/2013 68 1 ‘Endorsed 2.83 4.345 2.625 'N’ uw 1 ay 23000 14996, 2500 ""35211 ' ""02""",,,23000.0


## hecm_margin

In [127]:
# Create a filter for NaN values in clsing_dt
mask_nan_date = final_df['hecm_margin'].isna()

# Display only the two columns for those specific rows
final_df.loc[mask_nan_date, ['source_pdf', 'row_in_pdf', 'raw_text', 'hecm_margin']].head(70)

Unnamed: 0,source_pdf,row_in_pdf,raw_text,hecm_margin
0,100,1,"""GMFS LLC"" 12/31/2012 2/25/2013 2/16/2017 64 1 “Terminated 5.06 5.06 oR uo 117000 74529 0 ‘36108 *02""",
1,100,2,"""ONE REVERSE MORTGAGE LLC’ 12/15/2012 1/25/2013 23 73 2 ‘Assigned 4.5 4.5 o'N’ r uo uo 200000 136200 o ""36312 ""02""",
2,100,3,"""SUN WEST MORTGAGE CO_INC’ 1/11/2013, 3/4/2013 ca 1 ‘Assigned 5.3 5.3 oN i i a 130000 86710 0 '36571 ""02""",
3,100,4,"“MSR_ASSET VEHICLE LLC’ 2/20/2013 7/25/2013 76 1 ‘Assigned 5.06 5.06 o IN’ Fr uo 300000 207900 5000 ""36078 ""02""",
4,100,5,"""SUN WEST MORTGAGE CO_TNC’ 1/26/2013 4/22/2013 2 1_‘Endorsed 4,99 4.99 oN’ - — uo 240000 161040 2500 ""35966 ""02""",
5,100,6,"""AMERICAN ADVISORS GROUP"" 1/31/2013 3/27/2013 66 74 2 ‘Assigned 5.06 5.06 oN’ Fr uo 122000 74 2500 ""36544 *02""",
6,100,7,"""MSR ASSET VEHICLE LLC"" 2/1/2013 3/1/2013 15 1 ‘Endorsed 5.06 5.06 oN Fr a oo 150000 103350 3000 ""35222 ""02""",
7,100,8,"""DITECH MORTGAGE CORP"" 1/31/2013 3/14/2013 79 1 ""Assigned ' 3.99 3.99 oN’ r uo 155000 109430 3100 ""36207 | ""02""",
8,100,9,"ONE REVERSE MORTGAGE LLC’ 12/22/2012 1/31/2013 62 1'Endorsed _' 4.99 4.99 o'N' uo 160000 99040 2500 ""35053"" ""02""",
10,100,11,"""DITECH MORTGAGE CORP"" 1/11/2013, 2/26/2013 70 70 2 ‘Assigned 4.99 4.99 oN’ 7 uo 45000 29835 0 ""35950 *02""",


In [128]:
# 1. Identify the counts of NaN in 'hecm_margin' per 'source_pdf'
nan_counts = final_df[final_df['hecm_margin'].isna()]['source_pdf'].value_counts()

# 2. Get the name of the source_pdf with the most NaNs
most_nan_pdf = nan_counts.idxmax()
print(f"The source_pdf with the most NaNs is: {most_nan_pdf}")

# 3. Create a sorting index based on the counts (descending)
# This ensures final_df is ordered by the frequency of NaNs in each PDF
final_df['nan_rank'] = final_df['source_pdf'].map(nan_counts).fillna(0)

# 4. Sort the original dataframe and filter for only NaN values
final_df_sorted_filtered = final_df[final_df['hecm_margin'].isna()].sort_values(
    by='nan_rank', 
    ascending=False
).drop(columns=['nan_rank']) # Drop helper column after sorting

# Display results
final_df_sorted_filtered.head(30)

The source_pdf with the most NaNs is: 17258


Unnamed: 0,source_pdf,row_in_pdf,raw_text,lender,clsing_dt,endrsmt_dt,tmntn_dt,Borr_Age,Coborr_Age,Borr_Cnt,es_status,int_rt,int_rt_10yr,hecm_margin,pd_stmln_flg,rt_typ,arm_indx_typ,arm_prdc_typ,max_clm_amt,init_prncpl_lmt,hecm_orgntn_fees,prop_addr_zip_cd,loan_typ
541587,17256,47,UNITY MORTGAGE CORP aso erates 447008 1 Teme $85 885 166 aT v 7500 2050 9 35150 2,UNITY MORTGAGE CORP,,,,,,,,,,,V,,,,447008.0,7500.0,0,35150.0,2.0
541800,17257,104,FINANCIAL FREEDOM SENIOR FUNDING CORP 11/1372001 32172002 azOIt n 2 Temewted 587 0 S87 15.6 wot v 52500527075 © 36575 a,FINANCIAL FREEDOM SENIOR FUNDING CORP,,,,,,,,15.6,,,N,V,A,,1372001.0,32172000.0,0,,36575.0
541801,17257,105,FINANCIAL FREEDOM SENIOR FUNDING CORP 707002 §/167002 1278700376 1 Temnated 682 0 682 15.N MoT v 7000 «zn 0 38606 2,FINANCIAL FREEDOM SENIOR FUNDING CORP,,,,,,,,,,,N,V,,,707002.0,167002.0,0,38606.0,2.0
541802,17257,106,EVERHOME MORTGAGE COMPANY LLC vimzo0g sane Tonaor 1 Temnated 654 0 654 1S.N wor v 1443369600027 38601 oo,EVERHOME MORTGAGE COMPANY LLC,,,,,,,,,,,N,V,,1S,1443370000000.0,38601.0,0,,38601.0
541803,17257,107,WELLS FARGO BANK NA ago raze? sans 1 Temesed 685 655. 1S.N wot v ‘40000 25320 © 35020 a,WELLS FARGO BANK NA,,,,,,,,,,,N,V,A,1S,40000.0,25320.0,0,25320.0,35020.0
541804,17257,108,WELLS FARGO BANK NA zrsooe zane 1118200575 a 2 Teminstes 642 0 642 15.N wot v 144335 94073.08 © 36542 i,WELLS FARGO BANK NA,,,,,,,,94073.08,,,A,N,V,,1118201000.0,144335.0,0,,36542.0
541805,17257,109,"FINANCIAL FREEDOM SENIOR FUNDING CORP 1277001 32172002 7naz008 = &S SS 2 Temated 642 B42 15N wT v 1176000 62756 0 35173, 2",FINANCIAL FREEDOM SENIOR FUNDING CORP,,,,,,,,,,,S,V,,15N,1277001.0,32172000.0,0,35173.0,2.0
541806,17257,110,WELLS FARGO BANK NA ‘vawzong 22002 m 7S 2 Tememes 655 655 SN wor v 144336 93529.73 © 3603 2,WELLS FARGO BANK NA,,,,,,,,93529.73,,,M,V,,7S,22002.0,144336.0,0,,2.0
541807,17257,111,WELLS FARGO BANK NA zisgooz riezoez 2aqor | @ 1 Temnted 642 542 15 wor v 1131000 rss2 0 35226 a,WELLS FARGO BANK NA,,,,,,,,,,,V,A,,,1131000.0,35226.0,0,,35226.0
541808,17257,112,"FINANCIAL FREEDOM SENIOR FUNDING CORP 12792001 57152002 6282017 77 1 Teminwied 427 0 726 206 AT v 5000 40105 © 35023, 2",FINANCIAL FREEDOM SENIOR FUNDING CORP,,,,,,,,,,,V,,,,12792000.0,57152000.0,0,35023.0,2.0


In [129]:
# Update the hecm_margin column to 1.5 for rows where source_pdf is 17258
final_df.loc[final_df['source_pdf'] == 17258, 'hecm_margin'] = 1.5

# Display a summary of the change
updated_rows = (final_df['source_pdf'] == 17258).sum()
print(f"Updated {updated_rows} rows for source_pdf 17258.")

Updated 156 rows for source_pdf 17258.


## prop_addr_zip_cd

In [130]:
final_df[final_df['prop_addr_zip_cd'].isna()].head()

Unnamed: 0,source_pdf,row_in_pdf,raw_text,lender,clsing_dt,endrsmt_dt,tmntn_dt,Borr_Age,Coborr_Age,Borr_Cnt,es_status,int_rt,int_rt_10yr,hecm_margin,pd_stmln_flg,rt_typ,arm_indx_typ,arm_prdc_typ,max_clm_amt,init_prncpl_lmt,hecm_orgntn_fees,prop_addr_zip_cd,loan_typ,nan_rank
14,100,15,"""AMERICAN ADVISORS GROUP"" 1/29/2013, 3/29/2013 2/1/2021 80 76 2 ‘Terminated’ 5.06 5.06 oN’ i oo 190000 131670 035650 ""02""",AMERICAN ADVISORS GROUP,1/29/2013,3/29/2013,2/1/2021,80.0,76.0,2.0,Terminated,5.06,5.06,,I,,,,190000.0,131670.0,0,,2.0,71.0
16,100,17,"""NETWORK FUNDING LP"" 1/21/2013 2/12/2013 70 66 2 ‘Assigned 5.06 5.06 oN r i a 400000 254800 o'3511 ""02""",NETWORK FUNDING LP,1/21/2013,2/12/2013,,,,,Assigned,5.06,5.06,,R,I,A,,400000.0,254800.0,0,,2.0,71.0
24,100,25,"""GENERATION MORTGAGE COMPANY” 1/18/2013 2/21/2013 10/1/2014 63 65 2 “Terminated 5.06 5.06 o'N' uo 625500 390937. 5 036532 ""02""",GENERATION MORTGAGE COMPANY,1/18/2013,2/21/2013,10/1/2014,63.0,65.0,2.0,Terminated,5.06,5.06,,O,N,,,625500.0,390937.0,0,,2.0,71.0
29,100,30,"""PINANCE OF AMERICA REVERSE LLC” 1/18/2013 4/2/2013, 72 a4 2 ‘Assigned 4.5 45 oN’ Fr ot uo 375000 251625 0 135146 ""02""",FINANCE OF AMERICA REVERSE LLC,1/18/2013,4/2/2013,,,,,Assigned,4.5,,,,,,,375000.0,251625.0,0,,2.0,71.0
30,100,31,"""AMERICAN ADVISORS GROUP"" 1/15/2013, 3/14/2013 10/31/2017 63 1 ‘Terminated’ 5.06 5.06. oN’ - — uo 80000 50000 036268 ""02""",AMERICAN ADVISORS GROUP,1/15/2013,3/14/2013,10/31/2017,63.0,,1.0,Terminated,5.06,5.06,,,,,,80000.0,50000.0,0,,2.0,71.0


In [131]:
# Define the list of loan types to exclude
exclude_loans = [1.0, 2.0, 5.0, 0.0, 6.0, 3.0]

# Filter rows where:
# 1. loan_type is NOT in our exclusion list
# 2. loan_type is NOT NaN (since the prompt says not in (..., NaN))
# 3. prop_addr_zip_cd IS NaN
filtered_df = final_df[
    (~final_df['loan_typ'].isin(exclude_loans)) & 
    (final_df['loan_typ'].notna()) & 
    (final_df['prop_addr_zip_cd'].isna())
]

# Display the result
print(f"Found {len(filtered_df)} rows matching the criteria.")
filtered_df[['source_pdf', 'row_in_pdf','raw_text', 'prop_addr_zip_cd', 'loan_typ']].head(50)

Found 55201 rows matching the criteria.


Unnamed: 0,source_pdf,row_in_pdf,raw_text,prop_addr_zip_cd,loan_typ
103,1000,26,"""BANK OF AMERICA NA CHARLOTTE"" 1/14/2010 2/11/2010 1/30/2012 92 1 ‘Terminated’ 2.483 5.71 2.25 'N' uw 1 ae 455000 351322, 49 2500 ""95118 | ""ot""",,95118.0
111,1000,34,"""BINANCIAL FREEDOM ACQUISITION LLC 1/22/2010 2/26/2010 10/21/2014 85 a 2 ""Terminated"" 2.733 5.92 2.5 'R uw a uM 625500 444105 5500 ""94403 ""ot""",,94403.0
119,1000,42,"""BANK OF AMERICA NA CHARLOTTE’ 1/14/2010 2/11/2010 7/12/2019 5 1 ‘Terminated’ 2.733 5.94 2.5 'N’ uw me oo 515000 319299. 81 6000 '94080 “ou”",,94080.0
133,1000,56,"""SUN WEST MORTGAGE CO INC’ 7/1/2010 8/2/2010 11/25/2020 85 1 “Terminated 3.097 5.95 2.75 'N' uw 1 ae 535000 380920 6000 ""94602! ""oa""",,94602.0
154,1000,77,"""WELLS FARGO BANK NA"" 2/13/2010 3/18/2010 2/13/2019 ul 1 ‘Terminated’ 2.728 6.09 2.5 'N' Ww 1 iM 625500 358389. 43 6000 ""94121 ""ot""",,94121.0
189,10,34,"""PINANCIAL FREEDOM SENIOR FUNDING CORP” 3/9/2005 3/24/2005 63 1 ""Endorsed"" 47 5.87 15 'N’ uw T ny 172632 101334, 98 0 ""36216 ""ot""",,36216.0
199,10,44,"""PINANCIAL FREEDOM SENIOR FUNDING CORP” 5/6/2005 7/29/2005 1/18/2017 ca it 2 ""Terminated"" 4,83 5.74 15 'N’ mw T ny! 72000 52473.01 0 135634 ""ot""",,135634.0
208,10,53,"""WELLS FARGO BANK NA"" 5/24/2005 6/22/2005 11/3/2020 it 1 ""Terminated"" 4,82 5.61 1.5 'N uw T ny! 74000 54760 0 36005 ""ou""",,36005.0
219,10,64,"""PINANCIAL FREEDOM SENIOR FUNDING CORP” 9/29/2005 11/30/2005 5/23/2018 81 1 ""Terminated 5.38 5.73 15 'N' uw T ny 53000 42241 0 ‘36451 ‘ot",,36451.0
240,1,7,"TUNITY MORTGAGE CORP"" 8/10/1992 10/5/1992 4/3/2003 73 a 2 “Terminated 8.27 8.27 1.6 'N' zy T ny"" 102700 47550.1 0 ""35216 ‘ol’",,35216.0


In [132]:
# 1. Define the exclusion list (ensuring numeric types match the dataframe)
exclude_loans = [1.0, 2.0, 5.0, 0.0, 6.0, 3.0]

# 2. Define the mask based on your criteria:
# - loan_typ NOT in exclude_loans
# - loan_typ IS NOT NaN
# - prop_addr_zip_cd IS NaN
# 1. Perform your logic to move the values
mask = (
    (~final_df['loan_typ'].isin(exclude_loans)) & 
    (final_df['loan_typ'].notna()) & 
    (final_df['prop_addr_zip_cd'].isna())
)

# Convert loan_typ to int first to drop decimals, then assign
final_df.loc[mask, 'prop_addr_zip_cd'] = final_df.loc[mask, 'loan_typ'].astype(int)

# 2. THE CRITICAL STEP: Convert the entire column to pandas' nullable integer type
final_df['prop_addr_zip_cd'] = pd.to_numeric(final_df['prop_addr_zip_cd'], errors='coerce').astype('Int64')

# 3. Verify the display
final_df[mask][['prop_addr_zip_cd', 'loan_typ']].head()

Unnamed: 0,prop_addr_zip_cd,loan_typ
103,95118,95118.0
111,94403,94403.0
119,94080,94080.0
133,94602,94602.0
154,94121,94121.0


In [133]:
import pandas as pd
import re

def extract_zip_from_raw_text(raw_text):
    """
    Extract the second-to-last value from raw_text, which should be a 5-digit zip code.
    """
    if pd.isna(raw_text) or not isinstance(raw_text, str):
        return None
    
    # Split by whitespace to get all tokens
    tokens = raw_text.split()
    
    if len(tokens) < 2:
        return None
    
    # Get the second-to-last token
    second_last = tokens[-2]
    
    # Clean the token: remove common punctuation that might be attached
    # Keep only digits and decimal points initially
    cleaned = re.sub(r'[^\d.]', '', second_last)
    
    # Extract all sequences of digits
    digit_sequences = re.findall(r'\d+', cleaned)
    
    if not digit_sequences:
        return None
    
    # Find the first 5-digit sequence (most likely to be the zip code)
    for seq in digit_sequences:
        if len(seq) == 5:
            return seq
    
    # If no exact 5-digit sequence, try to extract 5 digits from the longest sequence
    longest = max(digit_sequences, key=len)
    if len(longest) >= 5:
        # Try to extract last 5 digits (zip codes often appear at the end)
        return longest[-5:]
    
    # If still no match, pad with leading zeros if we have fewer than 5 digits
    if len(longest) < 5:
        return longest.zfill(5)
    
    return None

# Apply the function to fill missing prop_addr_zip_cd values
final_df.loc[final_df['prop_addr_zip_cd'].isna(), 'prop_addr_zip_cd'] = \
    final_df.loc[final_df['prop_addr_zip_cd'].isna(), 'raw_text'].apply(extract_zip_from_raw_text)

# Optional: Convert to string and ensure 5 digits with leading zeros
final_df['prop_addr_zip_cd'] = final_df['prop_addr_zip_cd'].apply(
    lambda x: str(int(float(x))).zfill(5) if pd.notna(x) else x
)

print("Updated zip codes:")
final_df[['raw_text', 'prop_addr_zip_cd']].head(20)

Updated zip codes:


Unnamed: 0,raw_text,prop_addr_zip_cd
0,"""GMFS LLC"" 12/31/2012 2/25/2013 2/16/2017 64 1 “Terminated 5.06 5.06 oR uo 117000 74529 0 ‘36108 *02""",36108
1,"""ONE REVERSE MORTGAGE LLC’ 12/15/2012 1/25/2013 23 73 2 ‘Assigned 4.5 4.5 o'N’ r uo uo 200000 136200 o ""36312 ""02""",36312
2,"""SUN WEST MORTGAGE CO_INC’ 1/11/2013, 3/4/2013 ca 1 ‘Assigned 5.3 5.3 oN i i a 130000 86710 0 '36571 ""02""",36571
3,"“MSR_ASSET VEHICLE LLC’ 2/20/2013 7/25/2013 76 1 ‘Assigned 5.06 5.06 o IN’ Fr uo 300000 207900 5000 ""36078 ""02""",36078
4,"""SUN WEST MORTGAGE CO_TNC’ 1/26/2013 4/22/2013 2 1_‘Endorsed 4,99 4.99 oN’ - — uo 240000 161040 2500 ""35966 ""02""",35966
5,"""AMERICAN ADVISORS GROUP"" 1/31/2013 3/27/2013 66 74 2 ‘Assigned 5.06 5.06 oN’ Fr uo 122000 74 2500 ""36544 *02""",36544
6,"""MSR ASSET VEHICLE LLC"" 2/1/2013 3/1/2013 15 1 ‘Endorsed 5.06 5.06 oN Fr a oo 150000 103350 3000 ""35222 ""02""",35222
7,"""DITECH MORTGAGE CORP"" 1/31/2013 3/14/2013 79 1 ""Assigned ' 3.99 3.99 oN’ r uo 155000 109430 3100 ""36207 | ""02""",36207
8,"ONE REVERSE MORTGAGE LLC’ 12/22/2012 1/31/2013 62 1'Endorsed _' 4.99 4.99 o'N' uo 160000 99040 2500 ""35053"" ""02""",35053
9,"""ONE REVERSE MORTGAGE LLC’ 1/25/2013 2/25/2013 68 1 ‘Endorsed 2.83 4.345 2.625 'N’ uw 1 ay 23000 14996, 2500 ""35211 ' ""02""",35211


## Stats post processing

In [134]:
# Calculate the percentage of NaNs for each column
(final_df.isna().sum() / len(final_df)) * 100

source_pdf           0.000000
row_in_pdf           0.000000
raw_text             0.000000
lender               0.081583
clsing_dt            5.839955
endrsmt_dt          11.366537
tmntn_dt            55.488900
Borr_Age            66.719638
Coborr_Age          91.327870
Borr_Cnt            66.719638
es_status            0.058362
int_rt              10.424033
int_rt_10yr         25.335427
hecm_margin         62.977510
pd_stmln_flg        12.884058
rt_typ              39.815480
arm_indx_typ        79.943711
arm_prdc_typ        98.766929
max_clm_amt          3.121809
init_prncpl_lmt      9.901365
hecm_orgntn_fees     0.000000
prop_addr_zip_cd     5.306920
loan_typ             1.311129
nan_rank             0.000000
dtype: float64

In [135]:
final_df.isna().sum()

source_pdf               0
row_in_pdf               0
raw_text                 0
lender                 787
clsing_dt            56336
endrsmt_dt          109649
tmntn_dt            535282
Borr_Age            643621
Coborr_Age          881008
Borr_Cnt            643621
es_status              563
int_rt              100557
int_rt_10yr         244402
hecm_margin         607522
pd_stmln_flg        124288
rt_typ              384086
arm_indx_typ        771189
arm_prdc_typ        952770
max_clm_amt          30115
init_prncpl_lmt      95515
hecm_orgntn_fees         0
prop_addr_zip_cd     51194
loan_typ             12648
nan_rank                 0
dtype: int64

# Save

In [136]:
# delete column nan_rank from final_df
final_df = final_df.drop(columns=["nan_rank"], errors="ignore")
final_df.head()

Unnamed: 0,source_pdf,row_in_pdf,raw_text,lender,clsing_dt,endrsmt_dt,tmntn_dt,Borr_Age,Coborr_Age,Borr_Cnt,es_status,int_rt,int_rt_10yr,hecm_margin,pd_stmln_flg,rt_typ,arm_indx_typ,arm_prdc_typ,max_clm_amt,init_prncpl_lmt,hecm_orgntn_fees,prop_addr_zip_cd,loan_typ
0,100,1,"""GMFS LLC"" 12/31/2012 2/25/2013 2/16/2017 64 1 “Terminated 5.06 5.06 oR uo 117000 74529 0 ‘36108 *02""",GMFS LLC,12/31/2012,2/25/2013,2/16/2017,64.0,,1.0,Terminated,5.06,5.06,,,,,,117000.0,74529.0,0,36108,2.0
1,100,2,"""ONE REVERSE MORTGAGE LLC’ 12/15/2012 1/25/2013 23 73 2 ‘Assigned 4.5 4.5 o'N’ r uo uo 200000 136200 o ""36312 ""02""",ONE REVERSE MORTGAGE LLC,12/15/2012,1/25/2013,,,,,Assigned,4.5,4.5,,O,N,R,,200000.0,136200.0,0,36312,2.0
2,100,3,"""SUN WEST MORTGAGE CO_INC’ 1/11/2013, 3/4/2013 ca 1 ‘Assigned 5.3 5.3 oN i i a 130000 86710 0 '36571 ""02""",SUN WEST MORTGAGE CO INC,1/11/2013,3/4/2013,,,,,Assigned,5.3,5.3,,I,I,A,,130000.0,86710.0,0,36571,2.0
3,100,4,"“MSR_ASSET VEHICLE LLC’ 2/20/2013 7/25/2013 76 1 ‘Assigned 5.06 5.06 o IN’ Fr uo 300000 207900 5000 ""36078 ""02""",MSR ASSET VEHICLE LLC,2/20/2013,7/25/2013,,,,,Assigned,5.06,5.06,,O,,,,300000.0,207900.0,0,36078,2.0
4,100,5,"""SUN WEST MORTGAGE CO_TNC’ 1/26/2013 4/22/2013 2 1_‘Endorsed 4,99 4.99 oN’ - — uo 240000 161040 2500 ""35966 ""02""",SUN WEST MORTGAGE CO INC,1/26/2013,4/22/2013,,,,,Endorsed,4.99,,,,,,,240000.0,161040.0,0,35966,2.0


In [137]:
final_df.to_csv(r'processed_output_V1.csv', index=False)