# New York Fed Replication

In [35]:
# Setting up 

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

sample = "~//projects//equifaxmacro_proj//EquiFax3//2_10_Percent_Sample_Construction//output//sample//"
output_3 = "~//projects//equifaxmacro_proj//EquiFax3//3_Balance_In_And_Out_Construction//output//"

In [2]:
# we're only going to pull every third month, since it's quarterly 

def monyr(monnum):

    mon = (monnum + 5) % 12 + 1

    if mon < 10: 
        MONNUM = "0" + str(mon)
    else: 
        MONNUM = str(mon)

    YRNUM = int(((monnum - mon)/12) + 2006)

    DATE = str(YRNUM) + MONNUM
    
    return DATE;

quartercols = [monyr(3*quarternum) for quarternum in range(1, 66)]

## Ones that just look at balances and accounts 

In [3]:
alltl_str = output_3 + "1_balance_in_and_out.parquet"
alltl = pd.read_parquet(alltl_str, columns = quartercols).reset_index()

In [4]:
non_other_pc_types = [
            'FM', # first mortgage
            'HR', # home equity revolving 
            'AB2', # auto bank loan
            'AF2', # auto finance loan
            'BC', # bank card 
            'RT', # retail 
            'SL1', # student loan deferred
            'SL2', # student loan non-deferred  
            ]

In [5]:
# calculate totals for the classic types first
non_other = alltl[alltl.product_category.isin(non_other_pc_types)]
other = alltl[~alltl.product_category.isin(non_other_pc_types)]

In [6]:
# now, recode the equifax product categories for the non-others 

# mortgage 
non_other.loc[non_other['product_category'] == 'FM', 'product_category'] = 'first_mortgage'

# home equity revolving 
non_other.loc[non_other['product_category'] == 'HR', 'product_category'] = 'he_revolving'

# auto loan 
non_other.loc[non_other['product_category'].isin(['AB2', 'AF2']), 'product_category'] = 'auto'

# credit card
non_other.loc[non_other['product_category'].isin(['BC', 'RT']), 'product_category'] = 'credit_card'

# student loan
non_other.loc[non_other['product_category'].isin(['SL1', 'SL2']), 'product_category'] = 'student_loan'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [7]:
sum = non_other.drop(['consumer_id', 'trade_id'], axis = 1).head(100000).groupby('product_category').sum()

In [17]:
# now let's group the "other" group for each month

sum_other = other.iloc[:, 3:].head(1000).sum()
sum_other = pd.DataFrame(sum_other).T


In [23]:
sum_other.index = ['other']

In [28]:
a = pd.concat([sum, sum_other], axis = 0).T
a.columns = a.columns + "_bal"
a

Unnamed: 0,auto_bal,credit_card_bal,first_mortgage_bal,he_revolving_bal,student_loan_bal,other_bal
200509,19598671.0,16685436.0,210621220.0,16106602.0,7290885.0,685612.0
200512,19731469.0,15643891.0,214778780.0,17384171.0,7679110.0,704257.0
200603,20043822.0,15325567.0,222715266.0,17062215.0,8475854.0,706941.0
200606,20251237.0,15647998.0,230380957.0,17607817.0,8847016.0,718711.0
200609,20487943.0,15629123.0,236577998.0,17518739.0,9332841.0,739066.0
200612,20554424.0,16004968.0,236795132.0,17987982.0,9681522.0,969575.0
200703,20224278.0,15277200.0,242410928.0,17437228.0,10324845.0,863620.0
200706,19962624.0,15920295.0,251443726.0,19183336.0,10655587.0,849002.0
200709,20210152.0,16300289.0,254391176.0,20520435.0,11032424.0,818086.0
200712,20613620.0,16933425.0,254411069.0,21267362.0,11660218.0,789634.0


now we need to append these to each other and we're done 

# Number of accounts by loan type 

In [31]:
acc = non_other.drop(['consumer_id', 'trade_id'], axis = 1).head(10000).groupby('product_category').count().T

In [33]:
acc.columns = acc.columns + "_acc"
acc

product_category,auto_acc,credit_card_acc,first_mortgage_acc,he_revolving_acc,student_loan_acc
200509,214,1701,177,46,127
200512,207,1686,177,51,122
200603,207,1678,173,47,122
200606,217,1683,179,46,107
200609,220,1709,178,46,94
200612,220,1702,176,47,103
200703,221,1678,181,46,100
200706,222,1691,186,42,103
200709,223,1684,176,43,107
200712,216,1694,173,44,117


* Percent of balance 90+ days delinquent by loan type 
* geography!

# going back to the 10% sample files

In [36]:
# now let's just read in one month at a time

mon1_str = sample + "200507_10perc.parquet"
mon1 = pd.read_parquet(mon1_str)

In [37]:
mon1.head()

Unnamed: 0,consumer_id,archive_date,zip_code,inquiries_12_months,age_oldest_account,age_oldest_mortgage_account,age_newest_account,number_of_accounts,number_accounts_opened_within_12_months,number_accounts_always_satisfactory,number_accounts_major_derogatory,number_revolving_accts_greater_than_or_equal_to_50_percent_utilization,bankcard_accts_over_75_percent_utilization,number_accounts_past_due,total_past_due_amount,bankruptcy_flag,foreclosure_flag,number_3rd_party_collection_accts,total_amount_3rd_party_collections,number_open_bankcard_accounts,number_open_mortgage_accounts,vantage_score_3,state,trade_id,origination_date_open,origination_portfolio_type,origination_product_category,origination_vantage_score3,product_category,small_business_owner_flag,pim_score,consumer_age,mortgage_indicator,deceased_consumer,terms,status_category,balance,high_credit,monthly_payment,portfolio_type,transferred_sold_flag,date_reported,narrcode_1,narrcode_2,narrcode_3,narrcode_4,ecoa,rate_status,scheduled_payment_amount,date_of_last_activity,date_of_last_payment,actual_payment_amount,payment_frequency,account_type,activity_designator,origination_vantage_score4,origination_Bankruptcy_Navigator_Index,origination_Consumer_Income_Score,vantage_score_4,bankruptcy_Navigator_Index,consumer_Income_Score,revolver_Transactor_Behavior_Last_6_Months,revolver_Transactor_Behavior_Last_12_Months,revolver_Transactor_Behavior_Last_24_Months,percent_Actual_Payment_to_Scheduled_Payment_Auto_Last_6_Months,percent_Actual_Payment_to_Scheduled_Payment_Auto_Last_12_Months,percent_Actual_Payment_to_Scheduled_Payment_Auto_Last_24_Months,percent_Actual_Payment_to_Scheduled_Payment_Mortage_Last_6_Months,percent_Actual_Payment_to_Scheduled_Payment_Mortage_Last_12_Months,percent_Actual_Payment_to_Scheduled_Payment_Mortage_Last_24_Months,mortgage_inquiries_last_1_month,potential_Mortgage_inquiries_last_1_month,origination_Industry_code,industry_code,consumer_age_archive,joint_Holders_In_Set,weight,m_y,csv_name,id_num
5,7997902,2005-07-26,7735.0,1.0,154.0,22.0,2.0,22.0,3.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9999998.0,4.0,1.0,779,NJ,191208105,200505.0,R,RT,779.0,RT,,51,6,1,0,,1,0,6000,0,R,1,2005-06-01,233,0,0,0,I,1,,,,,,7,,775.0,360.0,37.0,775,360,37,9.0,9.0,9.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0,0,DC,DC,6,0,1,200507,200507_2_ADS.csv,67247961
6,6373039,2005-07-26,30043.0,8.0,139.0,82.0,9.0,25.0,2.0,5.0,3.0,8.0,2.0,13.0,10369.0,0.0,0.0,0.0,9999998.0,2.0,1.0,420,GA,191208327,200311.0,R,RT,420.0,RT,,35,3,1,0,,1,189,254,10,R,0,2005-07-01,233,0,0,0,I,1,10.0,2005-07-01,2005-06-01,50.0,M,7,,456.0,1.0,31.0,456,1,31,9.0,9.0,9.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0,0,CG,CG,3,0,1,200507,200507_2_ADS.csv,67247962
13,15170225,2005-07-26,32217.0,0.0,226.0,9998.0,15.0,12.0,0.0,11.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,651.0,1.0,98.0,762,FL,191211683,199108.0,R,RT,762.0,RT,,41,3,0,0,,1,3523,7500,360,R,0,2005-07-01,233,0,0,0,I,1,360.0,2005-07-01,2005-06-01,,,7,,710.0,388.0,59.0,710,388,59,9.0,9.0,9.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0,0,DC,DC,3,0,1,200507,200507_2_ADS.csv,67247969
20,14157373,2005-07-26,95062.0,99.0,166.0,90.0,6.0,19.0,1.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9999998.0,2.0,1.0,789,CA,192492211,199607.0,R,RT,789.0,RT,,50,3,1,0,,8,0,230,0,R,1,2005-07-01,244,65,0,0,I,9,,2001-12-01,2002-10-01,,,7,B,765.0,309.0,41.0,765,309,41,9.0,9.0,9.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0,0,DC,DC,3,0,1,200507,200507_2_ADS.csv,22923984
31,17458230,2005-07-26,29645.0,0.0,366.0,366.0,6.0,36.0,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9999998.0,9.0,1.0,783,SC,192496127,198805.0,R,BC,783.0,BC,,48,5,1,0,,1,0,5800,0,R,0,2005-07-01,233,0,0,0,J,1,,2000-12-01,2000-12-01,,,18,,791.0,450.0,47.0,791,450,47,9.0,9.0,9.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0,0,BB,BB,5,0,1,200507,200507_2_ADS.csv,22923995


## total number of new accounts and inquiries


In [38]:
# clean the exception values to zero 
mon1.loc[mon1['inquiries_12_months']>=92, 'inquiries_12_months'] = 0
mon1.loc[mon1['number_accounts_opened_within_12_months']>=92, 'number_accounts_opened_within_12_months'] = 0

In [41]:
acc_iq = mon1[['consumer_id', 'inquiries_12_months', 'number_accounts_opened_within_12_months']].drop_duplicates()

In [42]:
mon1.shape

(13903042, 80)

In [43]:
acc_iq.shape

(1975837, 3)

In [44]:
acc_iq_sum = acc_iq[['inquiries_12_months', 'number_accounts_opened_within_12_months']].sum()

In [45]:
acc_iq_vars = ['consumer_id', 'inquiries_12_months', 'number_accounts_opened_within_12_months']

acc_iq_sum

inquiries_12_months                        3530814.0
number_accounts_opened_within_12_months    2397473.0
dtype: float64

## Credit score at origination: Mortgages and Auto loans

In [52]:
orig_cred = mon1[['transferred_sold_flag', 'origination_vantage_score3', 'product_category']]

In [55]:
orig_cred.columns = ['new_origination_flag', 'origination_vantage_score3', 'product_category']

In [61]:
 # find people with new mortgages and calculate percentiles 
mort = orig_cred[(orig_cred['product_category'] == "FM") & (orig_cred['new_origination_flag'] == 1)]
mp = mort['origination_vantage_score3'].describe(percentiles = [0.1, 0.25, 0.5]).iloc[4:7]

# find people with new auto loans and calculate percentils 
auto = orig_cred[(orig_cred['product_category'].isin(['AB2', 'AF2'])) & (orig_cred['new_origination_flag'] == 1)]
ap = auto['origination_vantage_score3'].describe(percentiles = [0.1, 0.25, 0.5]).iloc[4:7]

# create the values for the rows and make labels
row = pd.DataFrame(pd.concat([mp, ap]))
row.index = ['mort_10%', 'mort_25%', 'mort_50%', 'auto_10%', 'auto_25%', 'auto_50%']


In [62]:
orig_cred_cols = ['transferred_sold_flag', 'origination_vantage_score3', 'product_category']
row

Unnamed: 0,origination_vantage_score3
mort_10%,584.0
mort_25%,653.0
mort_50%,728.0
auto_10%,524.0
auto_25%,590.0
auto_50%,670.0


## Derogatory debt

In [66]:
derog_vars = ['status_category', 'balance']

In [67]:
derog = mon1[derog_vars]

In [68]:
derog.balance.sum()

142732607043

In [73]:
derog_stat = derog.groupby('status_category').sum()/derog.balance.sum()
derog_stat.index = ['Miscellaneous',
                    'Current',
                    '30 DPD',
                    '60 DPD',
                    '90 DPD',
                    '120 DPD or Collections',
                    'Foreclosure Started',
                    'Closed-Positive',
                    'Closed-Severe Derogatory',
                    'Closed-Bankruptcy']

In [74]:
derog_stat

Unnamed: 0,balance
Miscellaneous,0.012915
Current,0.950366
30 DPD,0.013224
60 DPD,0.004043
90 DPD,0.001849
120 DPD or Collections,0.005138
Foreclosure Started,0.002186
Closed-Positive,0.0
Closed-Severe Derogatory,0.00812
Closed-Bankruptcy,0.002159


In [71]:
mon2_str = sample + "201010_10perc.parquet"
mon2 = pd.read_parquet(mon2_str, columns = derog_vars)

In [72]:
mon2.groupby('status_category').sum()/mon2.balance.sum()

Unnamed: 0_level_0,balance
status_category,Unnamed: 1_level_1
0,0.015047
1,0.883369
2,0.018594
3,0.009752
4,0.005928
5,0.031297
6,0.018895
7,0.0
8,0.012682
9,0.004435


In [75]:
derog_type_vars = ['product_category', 'status_category', 'balance']

In [76]:
derog_type = mon1[derog_type_vars]

In [78]:
derog_type['sd'] = (derog_type.status_category >= 4) * 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  derog_type['sd'] = (derog_type.status_category >= 4) * 1


In [89]:
# now, recode the equifax product categories for the non-others 
def recode_nyfrb(non_other): 
    # mortgage 
    non_other.loc[non_other['product_category'] == 'FM', 'product_category'] = 'first_mortgage'

    # home equity revolving 
    non_other.loc[non_other['product_category'] == 'HR', 'product_category'] = 'he_revolving'

    # auto loan 
    non_other.loc[non_other['product_category'].isin(['AB2', 'AF2']), 'product_category'] = 'auto'

    # credit card
    non_other.loc[non_other['product_category'].isin(['BC', 'RT']), 'product_category'] = 'credit_card'

    # student loan
    non_other.loc[non_other['product_category'].isin(['SL1', 'SL2']), 'product_category'] = 'student_loan'
    
    non_other = non_other[non_other.product_category.isin(['first_mortgage', 'he_revolving', 'auto', 'credit_card', 'student_loan'])]
    
    return non_other;

In [90]:
derog_type = recode_nyfrb(derog_type)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [91]:
derog_type.loc[derog_type.sd==1, ['product_category', 'balance']].groupby(['product_category']).sum()/derog_type[['product_category', 'balance']].groupby('product_category').sum()

Unnamed: 0_level_0,balance
product_category,Unnamed: 1_level_1
auto,0.022711
credit_card,0.087769
first_mortgage,0.009869
he_revolving,0.002131
student_loan,0.073219


## Third party collections

In [92]:
tpc_vars = ['consumer_id', 'number_3rd_party_collection_accts','total_amount_3rd_party_collections']

tpc = mon1[tpc_vars]

In [93]:
tpc.

Unnamed: 0,consumer_id,number_3rd_party_collection_accts,total_amount_3rd_party_collections
5,7997902,0.0,9999998.0
6,6373039,0.0,9999998.0
13,15170225,1.0,651.0
20,14157373,0.0,9999998.0
31,17458230,0.0,9999998.0


In [None]:
# drop duplicates, recode variables, let's do this another day. 