# Create Legacy and New Datasets

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Variables to keep
#
# Three groups of series codes are defined here and then concatenated, in this
# order, into `allVar`. Later cells use `allVar` to decide which columns to read.

# Codes grouped under the balance-sheet heading (order preserved from the original list).
balancesheet = [
    "RCFD2170", "RCON2170", "RCFN2170", "RCFD3368", "RCON3368", "RCFN3368",
    "RCON2192", "RCON2948", "RCON2215", "RCON6810", "RCON0352", "RCON6648",
    "RCON6645", "RCON6646", "RCON2604", "RCONJ473", "RCONJ474", "RCONA223",
    "RCON0071", "RCON2122", "RCON0390", "RCON2146", "RCON3545", "RCON1754",
    "RCON1772", "RCON1350", "RCONB987", "RCONB989", "RCON8725", "RCON3450",
    "RCONA126", "RCONA589", "RIAD3210", "RCON3210", "RCON2385", "RCON3809",
    "RCON6631", "RCON6636", "RCON8693", "RCON8697", "RCON8701", "RCON8705",
    "RCON8709", "RCON8713", "RCON8733", "RCON8737", "RCON8741", "RCON8745",
    "RCON8766", "RCON8767", "RCONS582", "RCONS583", "RCONS584", "RCONS603",
    "RCONS604", "RCONS605", "RCON0081",
    # RCONA549 .. RCONA562
    "RCONA549", "RCONA550", "RCONA551", "RCONA552", "RCONA553", "RCONA554",
    "RCONA555", "RCONA556", "RCONA557", "RCONA558", "RCONA559", "RCONA560",
    "RCONA561", "RCONA562",
    # RCONA564 .. RCONA575
    "RCONA564", "RCONA565", "RCONA566", "RCONA567", "RCONA568", "RCONA569",
    "RCONA570", "RCONA571", "RCONA572", "RCONA573", "RCONA574", "RCONA575",
    # Remaining codes, four per line
    "RCONA579", "RCONA580", "RCONA581", "RCONA582",
    "RCONA584", "RCONA585", "RCONA586", "RCONA587",
    "RCONHK07", "RCONHK08", "RCONHK09", "RCONHK10",
    "RCONHK12", "RCONHK13", "RCONHK14", "RCONHK15",
    "RCONF055", "RCONF056", "RCONF057", "RCONF058",
    "RCONF060", "RCONF061", "RCONF062", "RCONF063",
]

# Codes grouped under the income-statement heading (order preserved from the original list).
incomestatement = [
    "RIAD4340", "RIADC914", "RIAD4341", "RIAD4074", "RIADC899", "RIADC900",
    "RIADB525", "RIAD4842", "RIAD4079", "RIAD4093", "RIADC902", "RIADC903",
    "RIADC904", "RIADC905", "RIADC907", "RIAD4097", "RIAD4239", "RIAD4230",
    "RIADJJ33", "RIADKW02", "RIADC901", "RIAD4235", "RIAD4107", "RIAD4073",
    "RIAD4115", "RIAD4020", "RIAD4010", "RIAD4011", "RIAD4012", "RIAD4013",
    "RIAD4060", "RIAD4069", "RIAD4075", "RIAD4077", "RIAD4091", "RIAD4092",
    "RIAD4101", "RIAD4126", "RIAD4170", "RIAD4185", "RIAD4200", "RIAD4218",
    "RIAD4435", "RIAD4436", "RIAD4507", "RIAD4508", "RIAD4513", "RIAD4518",
    "RIAD4619", "RIAD5466", "RIAD8622", "RIAD8757", "RIADA315", "RIADA517",
    "RIADA518", "RIADB488", "RIADB489", "RIADB497", "RIADGW44", "RIADHK03",
    "RIADHK04", "RIADHT73", "RIADHT74", "RIADHT85",
]

# Single code kept in its own group.
fdic = ["RSSD9050"]

# Full list of codes to keep: balance sheet, then income statement, then fdic.
allVar = [*balancesheet, *incomestatement, *fdic]

In [59]:
# Columns wanted from the legacy extracts: every code in allVar plus the two
# columns that a later cell uses as the row index.
legacyVar = [*allVar, "RSSD9999", "RSSD9001"]

# One CSV file name per year: call1976.csv through call2010.csv (the range end is exclusive).
legacyCallReports = ["call{}.csv".format(year) for year in range(1976, 2011)]

In [60]:
# Read each yearly legacy extract, keeping only the columns named in legacyVar and
# indexing rows by (RSSD9999, RSSD9001), then stack all years into a single frame.
# join="outer" keeps the union of columns across years; sort=False leaves the
# column order as encountered.
df = pd.concat(
    [
        pd.read_csv(
            "./ChicagoFedCallReportsLegacy/" + report_name,
            index_col=["RSSD9999", "RSSD9001"],
            usecols=lambda column: column in legacyVar,
        )
        for report_name in tqdm(legacyCallReports)
    ],
    axis=0,
    join="outer",
    sort=False,
)

# keep=False flags every occurrence of a repeated (RSSD9999, RSSD9001) key, so
# all rows sharing a duplicated key are removed, not just the later copies.
has_repeated_key = df.index.duplicated(keep=False)
df = df.loc[~has_repeated_key]

# NOTE(review): the file name is labelled "73-10"; confirm that label matches the
# years actually listed in legacyCallReports.
df.to_csv("./legacyCallReports(73-10).csv")

100%|██████████| 35/35 [09:25<00:00, 16.16s/it]


In [61]:
# One CSV file name per year: call2001.csv through call2023.csv (the range end is exclusive).
newCallReports = ["call{}.csv".format(year) for year in range(2001, 2024)]

# Columns wanted from the newer extracts: every code in allVar plus the two
# columns that a later cell uses as the row index.
newVar = [*allVar, "RCON9999", "IDRSSD"]

In [62]:
# Read each yearly extract from the newer source, keeping only the columns named
# in newVar and indexing rows by (RCON9999, IDRSSD), then stack all years.
# low_memory=False makes pandas parse each file in one pass rather than in
# chunks. join="outer" keeps the union of columns across years.
df = pd.concat(
    [
        pd.read_csv(
            "./ChicagoFedCallReports/" + report_name,
            index_col=["RCON9999", "IDRSSD"],
            usecols=lambda column: column in newVar,
            low_memory=False,
        )
        for report_name in tqdm(newCallReports)
    ],
    axis=0,
    join="outer",
    sort=False,
)

# Persist the combined frame; the two index levels are written as leading columns.
df.to_csv("./newCallReports(01-23).csv")

100%|██████████| 23/23 [01:23<00:00,  3.62s/it]


In [3]:
# Build a small data dictionary for the codes in allVar from the MDRM CSV and
# write it to ./finalData/finalVarDict.csv, one row per code.

# Timestamp layout used when parsing both date columns, e.g. "12/31/2010 12:00:00 AM".
MDRM_DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"


def _as_yyyymmdd(stamp):
    """Encode a timestamp as the number YYYYMMDD (year*10000 + month*100 + day)."""
    return stamp.year * 10**4 + stamp.month * 10**2 + stamp.day


varDict = pd.read_csv("./MDRM/MDRM_CSV.csv")

# Full code = mnemonic followed by the item code left-padded with zeros to 4 characters.
varDict["relCode"] = varDict["Mnemonic"] + varDict["Item Code"].str.zfill(4)

# Keep only rows for the requested codes and the columns of interest, then
# remove exact duplicate rows.
wanted_columns = ["relCode", "Item Name", "Description", "Start Date", "End Date", "Reporting Form"]
varDict = varDict.loc[varDict["relCode"].isin(allVar), wanted_columns].drop_duplicates()

# Start dates must all parse; a bad value raises here.
start_stamps = pd.to_datetime(varDict["Start Date"], format=MDRM_DATE_FORMAT)
varDict["Start Date"] = start_stamps.apply(_as_yyyymmdd)

# End dates that cannot be parsed are coerced to missing and then replaced with
# the sentinel 99999999.
end_stamps = pd.to_datetime(varDict["End Date"], format=MDRM_DATE_FORMAT, errors="coerce")
varDict["End Date"] = end_stamps.apply(_as_yyyymmdd).fillna(99999999)

# Collapse to one row per code: first name/description, earliest start, latest end.
# After this step relCode is the index of varDict, not a column.
varDict = varDict.groupby("relCode").agg(
    {"Item Name": "first", "Description": "first", "Start Date": "min", "End Date": "max"}
)
varDict.to_csv("./finalData/finalVarDict.csv", index=True)

In [4]:
# Sanity check: show every requested code in allVar that has no row in varDict.
# An empty Series means the data dictionary covers all requested codes.
# varDict was produced by groupby("relCode"), so the codes live in its index;
# there is no `relCode` column to access as an attribute.
requested_codes = pd.Series(allVar)
requested_codes[~requested_codes.isin(varDict.index)]

Series([], dtype: object)