# Data Cleaning - Geospatial Data

In [1]:
import pandas as pd
import os
import geopandas as gpd
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim


In [2]:
# Directory containing the quarterly financials CSV files.
directory = '../../data/quarterly_financials'

# Collect one dataframe per CSV file.
dfs = []

# os.listdir() returns names in arbitrary order, so sort them to make the
# row order of the combined frame reproducible across machines and runs.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.csv'):
        # Read the CSV file into a dataframe and keep it for concatenation.
        path = os.path.join(directory, filename)
        df = pd.read_csv(path)
        dfs.append(df)

# Fail with a message that names the directory instead of pandas' generic
# "No objects to concatenate" error when nothing was found.
if not dfs:
    raise ValueError(f"No CSV files found in {directory!r}")

# Stack everything into a single frame with a fresh 0..n-1 index.
quarterly_financials = pd.concat(dfs, ignore_index=True)


In [3]:
# Show the size of the combined frame, followed by its first rows.
display(quarterly_financials.shape, quarterly_financials.head())

(971541, 15)

Unnamed: 0,ZIP,BKCLASS,REPDTE,DEPDOM,ASSET,STNAME,EQ,NAME,CITY,ADDRESS,ENDEFYMD,CERT,ESTYMD,LIAB,ID
0,2110,SM,19960630,7531683.0,28840879,MASSACHUSETTS,1556358.0,STATE STREET BANK&TRUST CO,BOSTON,225 FRANKLIN ST.,99991231.0,14,17920101,27284521.0,14_19960630
1,1970,NM,19960630,117100.0,138177,MASSACHUSETTS,17019.0,EASTERN BANK&TRUST CO,SALEM,217 ESSEX ST.,20021122.0,28,18030101,121158.0,28_19960630
2,36830,SM,19960630,188495.0,227366,ALABAMA,19350.0,AUBURNBANK,AUBURN,100 N. GAY ST.,99991231.0,35,19070103,208016.0,35_19960630
3,36732,NM,19960630,111234.0,124124,ALABAMA,11965.0,ROBERTSON BANKING CO,DEMOPOLIS,216 N. WALNUT AVE.,99991231.0,39,18700101,112159.0,39_19960630
4,36867,NM,19960630,70753.0,79828,ALABAMA,8738.0,PHENIX-GIRARD BANK,PHENIX CITY,8TH AVE. &13TH ST.,99991231.0,41,19040504,71090.0,41_19960630


In [4]:
# Distinct BKCLASS codes: show how many there are, then the codes themselves.
bank_class_codes = quarterly_financials['BKCLASS'].unique()
display(len(bank_class_codes), bank_class_codes)


7

array(['SM', 'NM', 'N', 'SB', 'SI', 'SL', 'OI'], dtype=object)

## ZIP coordinates

In [5]:
# Load the tab-separated gazetteer file, strip whitespace from the column
# names so they can be referenced reliably, and discard the land/water area
# columns that are not needed here.
zip_coordinates = (
    pd.read_csv('../../data/coordinate_data/2022_Gaz_zcta_national.txt', sep='\t')
    .rename(columns=str.strip)
    .drop(columns=['ALAND', 'ALAND_SQMI', 'AWATER', 'AWATER_SQMI'])
)

# Sanity check: size and first rows.
display(zip_coordinates.shape)
display(zip_coordinates.head())

(33791, 3)

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.457399,-67.124867
3,606,18.158327,-66.932928
4,610,18.29396,-67.127182


In [6]:
# Number of distinct ZIP values; dropna=False counts a missing value as one
# distinct entry, the same way len(Series.unique()) does.
display(quarterly_financials['ZIP'].nunique(dropna=False))


10754

## Combine financials with ZIP coordinates

In [7]:
# Give the gazetteer key the same name as the key in 'quarterly_financials'.
zip_coordinates = zip_coordinates.rename(columns={'GEOID': 'ZIP'})

# Left-join so every financial row is kept; rows whose ZIP has no match get
# NaN coordinates. validate='many_to_one' makes pandas raise a MergeError if a
# ZIP appears more than once in 'zip_coordinates', which would otherwise
# silently duplicate financial rows.
merged_df = quarterly_financials.merge(
    zip_coordinates, on='ZIP', how='left', validate='many_to_one'
)

# Pair 'INTPTLAT' and 'INTPTLONG' into one (lat, lon) tuple per row, stored in
# the 'zip_coordinates' column. Unmatched rows become (nan, nan) tuples, which
# isna() does not flag as missing.
merged_df['zip_coordinates'] = list(zip(merged_df['INTPTLAT'], merged_df['INTPTLONG']))

# Continue the pipeline under the 'quarterly_financials' name.
quarterly_financials = merged_df

# Show the size and first rows of the merged frame.
display(quarterly_financials.shape)
display(quarterly_financials.head())

(971541, 18)

Unnamed: 0,ZIP,BKCLASS,REPDTE,DEPDOM,ASSET,STNAME,EQ,NAME,CITY,ADDRESS,ENDEFYMD,CERT,ESTYMD,LIAB,ID,INTPTLAT,INTPTLONG,zip_coordinates
0,2110,SM,19960630,7531683.0,28840879,MASSACHUSETTS,1556358.0,STATE STREET BANK&TRUST CO,BOSTON,225 FRANKLIN ST.,99991231.0,14,17920101,27284521.0,14_19960630,42.358254,-71.051927,"(42.358254, -71.051927)"
1,1970,NM,19960630,117100.0,138177,MASSACHUSETTS,17019.0,EASTERN BANK&TRUST CO,SALEM,217 ESSEX ST.,20021122.0,28,18030101,121158.0,28_19960630,42.529248,-70.868017,"(42.529248, -70.868017)"
2,36830,SM,19960630,188495.0,227366,ALABAMA,19350.0,AUBURNBANK,AUBURN,100 N. GAY ST.,99991231.0,35,19070103,208016.0,35_19960630,32.534872,-85.493755,"(32.534872, -85.493755)"
3,36732,NM,19960630,111234.0,124124,ALABAMA,11965.0,ROBERTSON BANKING CO,DEMOPOLIS,216 N. WALNUT AVE.,99991231.0,39,18700101,112159.0,39_19960630,32.417456,-87.892213,"(32.417456, -87.892213)"
4,36867,NM,19960630,70753.0,79828,ALABAMA,8738.0,PHENIX-GIRARD BANK,PHENIX CITY,8TH AVE. &13TH ST.,99991231.0,41,19040504,71090.0,41_19960630,32.498054,-85.02359,"(32.498054, -85.02359)"


In [8]:
# Remove rows whose ZIP found no coordinate match, drop the columns that are
# no longer needed, and switch to snake_case column names. The steps are
# chained and the result is reassigned instead of using inplace=True, so the
# frame is not mutated behind any other name that refers to the same object.
quarterly_financials = (
    quarterly_financials
    .dropna(subset=['INTPTLAT'])
    .drop(columns=[
        'INTPTLAT', 'INTPTLONG', 'ENDEFYMD', 'CERT', 'ESTYMD',
        'LIAB', 'ID', 'ADDRESS', 'EQ', 'ZIP',
    ])
    .rename(columns={
        'BKCLASS': 'bank_class',
        'REPDTE': 'report_date',
        'DEPDOM': 'deposits',
        'ASSET': 'assets',
        'STNAME': 'state',
        'NAME': 'name',
        'CITY': 'city',
    })
)

display(quarterly_financials.head())

Unnamed: 0,bank_class,report_date,deposits,assets,state,name,city,zip_coordinates
0,SM,19960630,7531683.0,28840879,MASSACHUSETTS,STATE STREET BANK&TRUST CO,BOSTON,"(42.358254, -71.051927)"
1,NM,19960630,117100.0,138177,MASSACHUSETTS,EASTERN BANK&TRUST CO,SALEM,"(42.529248, -70.868017)"
2,SM,19960630,188495.0,227366,ALABAMA,AUBURNBANK,AUBURN,"(32.534872, -85.493755)"
3,NM,19960630,111234.0,124124,ALABAMA,ROBERTSON BANKING CO,DEMOPOLIS,"(32.417456, -87.892213)"
4,NM,19960630,70753.0,79828,ALABAMA,PHENIX-GIRARD BANK,PHENIX CITY,"(32.498054, -85.02359)"


In [9]:
# Title-case the text columns, one column at a time.
for text_col in ('state', 'name', 'city'):
    quarterly_financials[text_col] = quarterly_financials[text_col].str.title()

# Parse the YYYYMMDD report date into a datetime.
quarterly_financials['report_date'] = pd.to_datetime(
    quarterly_financials['report_date'], format='%Y%m%d'
)

# Store deposits and assets as floats.
for amount_col in ('deposits', 'assets'):
    quarterly_financials[amount_col] = quarterly_financials[amount_col].astype(float)

display(quarterly_financials.head())


Unnamed: 0,bank_class,report_date,deposits,assets,state,name,city,zip_coordinates
0,SM,1996-06-30,7531683.0,28840879.0,Massachusetts,State Street Bank&Trust Co,Boston,"(42.358254, -71.051927)"
1,NM,1996-06-30,117100.0,138177.0,Massachusetts,Eastern Bank&Trust Co,Salem,"(42.529248, -70.868017)"
2,SM,1996-06-30,188495.0,227366.0,Alabama,Auburnbank,Auburn,"(32.534872, -85.493755)"
3,NM,1996-06-30,111234.0,124124.0,Alabama,Robertson Banking Co,Demopolis,"(32.417456, -87.892213)"
4,NM,1996-06-30,70753.0,79828.0,Alabama,Phenix-Girard Bank,Phenix City,"(32.498054, -85.02359)"


In [10]:
# Description to substitute for each bank class code.
bank_class_labels = {
    'N':  'Commercial bank, national charter, Fed member',
    'NM': 'Commercial bank, state charter, Fed non-member',
    'OI': 'Insured U.S. branch of a foreign chartered institution',
    'SB': 'Federal savings banks',
    'SI': 'State chartered stock savings banks',
    'SL': 'State chartered stock savings and loan association',
    'SM': 'Commercial bank, state charter, Fed member',
    'NC': 'Noninsured non-deposit commercial bank',
    'NS': 'Noninsured stock savings bank',
    'CU': 'State or federally chartered credit union',
}

# Swap each code for its description; values without an entry stay unchanged.
quarterly_financials['bank_class'] = quarterly_financials['bank_class'].replace(bank_class_labels)

display(quarterly_financials.shape)
display(quarterly_financials.head())



(937293, 8)

Unnamed: 0,bank_class,report_date,deposits,assets,state,name,city,zip_coordinates
0,"Commercial bank, state charter, Fed member",1996-06-30,7531683.0,28840879.0,Massachusetts,State Street Bank&Trust Co,Boston,"(42.358254, -71.051927)"
1,"Commercial bank, state charter, Fed non-member",1996-06-30,117100.0,138177.0,Massachusetts,Eastern Bank&Trust Co,Salem,"(42.529248, -70.868017)"
2,"Commercial bank, state charter, Fed member",1996-06-30,188495.0,227366.0,Alabama,Auburnbank,Auburn,"(32.534872, -85.493755)"
3,"Commercial bank, state charter, Fed non-member",1996-06-30,111234.0,124124.0,Alabama,Robertson Banking Co,Demopolis,"(32.417456, -87.892213)"
4,"Commercial bank, state charter, Fed non-member",1996-06-30,70753.0,79828.0,Alabama,Phenix-Girard Bank,Phenix City,"(32.498054, -85.02359)"


In [11]:
# Count rows with a missing latitude or longitude. Each 'zip_coordinates'
# value is a (lat, lon) tuple, and isna() never flags a tuple -- not even
# (nan, nan) -- so calling isna() on the column itself always reports 0.
# Unpack the tuples into two columns and check the components instead.
pd.DataFrame(
    quarterly_financials['zip_coordinates'].tolist(),
    index=quarterly_financials.index,
).isna().any(axis=1).sum()


0

## Plot

In [23]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display


def plot_bank_assets_by_date(date):
    """Plot the bank locations reported on a single report date.

    Reads the notebook-level GeoDataFrame ``gdf``, keeps the rows whose
    ``report_date`` equals ``date`` and draws them as red points on a new
    figure.

    Parameters
    ----------
    date : datetime-like
        Report date to show; compared for equality with ``gdf['report_date']``.
    """
    # Keep only the rows reported on the selected date.
    filtered_gdf = gdf[gdf['report_date'] == date]

    # A numpy datetime64 prints with a nanosecond time part when interpolated
    # directly, so format the value as a plain calendar date for the title.
    date_label = pd.Timestamp(date).strftime('%Y-%m-%d')

    # Plot the points on a map
    fig, ax = plt.subplots(figsize=(10, 6))
    filtered_gdf.plot(ax=ax, markersize=50, color='red', marker='o', label="Bank Locations")
    ax.set_title(f"Bank Assets Locations for {date_label}")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend()
    plt.show()

# Build the GeoDataFrame that plot_bank_assets_by_date reads, so this cell
# also works on a freshly restarted kernel. 'zip_coordinates' holds
# (lat, lon) tuples; split them into columns to create the point geometries.
coords = pd.DataFrame(
    quarterly_financials['zip_coordinates'].tolist(),
    index=quarterly_financials.index,
    columns=['latitude', 'longitude'],
)
gdf = gpd.GeoDataFrame(
    quarterly_financials.copy(),  # copy so the source frame is left untouched
    # points_from_xy expects x (longitude) first, then y (latitude)
    geometry=gpd.points_from_xy(coords['longitude'], coords['latitude']),
)

# Get the unique report dates sorted
unique_dates = sorted(quarterly_financials['report_date'].unique())

# Create the date slider. Each option is a (label, value) pair: the label is
# the readable calendar date shown next to the slider, the value is the
# original date object handed to the plotting function.
date_slider = widgets.SelectionSlider(
    options=[(pd.Timestamp(d).strftime('%Y-%m-%d'), d) for d in unique_dates],
    description='Report Date:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

# Display the date slider and plot the data based on the selected date.
# The trailing semicolon keeps the function repr out of the cell output.
widgets.interact(plot_bank_assets_by_date, date=date_slider);

interactive(children=(SelectionSlider(continuous_update=False, description='Report Date:', options=(numpy.date…

<function __main__.plot_bank_assets_by_date(date)>