In [1]:
import pandas as pd
import datetime 

# Folder holding the input CSVs, relative to the notebook's working directory.
# Later cells build file paths by plain string concatenation (data_dir + name),
# so the trailing slash is required.
data_dir = '../data/'
# Timestamp captured when this cell is executed.
# NOTE(review): `now` is not referenced in any of the visible cells - confirm it is still needed.
now = datetime.datetime.now()

def __my_flatten_cols(self, how="_".join, reset_index=True):
    """Flatten MultiIndex column labels into single strings, in place.

    Parameters
    ----------
    how : callable or "last"
        Receives an iterable of the non-empty, stringified level values of one
        column and returns the flat label. The default joins them with "_".
        The string "last" keeps only the final non-empty level.
    reset_index : bool
        When True, return ``self.reset_index()`` (a new frame); otherwise
        return ``self``.

    Frames whose columns are not a MultiIndex keep their labels untouched.
    """
    if how == "last":
        combine = lambda parts: list(parts)[-1]
    else:
        combine = how

    if isinstance(self.columns, pd.MultiIndex):
        flat_labels = []
        for levels in self.columns.values:
            # Empty level values (e.g. '') are dropped before combining.
            non_empty = filter(None, map(str, levels))
            flat_labels.append(combine(non_empty))
        self.columns = flat_labels

    if reset_index:
        return self.reset_index()
    return self
# Expose the helper as a DataFrame method so it can be used in method chains.
pd.DataFrame.my_flatten_cols = __my_flatten_cols

In [2]:
# Mine-specific data.
# The "current" controller/operator columns get a curr_ prefix so they match the
# column names that curr_mine_data_merge selects from this frame.
mine_rename = {
    'controller_id': 'curr_ctrlr_id',
    'controller_nm': 'curr_ctrlr_nm',
    'oper_id': 'curr_oper_id',
    'oper_nm': 'curr_oper_nm',
}

mine_data = (
    pd.read_csv(data_dir + 'msha_mine_20190209-0.csv', escapechar='\\')
    .rename(index=str, columns=mine_rename)  # index=str also turns the row labels into strings
)

In [3]:
# Controller-Operation relationship data.
cntrlr_hist_data = pd.read_csv(data_dir + 'msha_controller_history_20190105-0.csv')
# Missing end dates are filled with a fixed date before parsing.
# NOTE(review): presumably a blank end date means the relationship is still active and
# '2019-01-01' stands in for "present" - confirm the cutoff against the data snapshot dates.
cntrlr_hist_data = cntrlr_hist_data.assign(
    ctrlr_end_dt=lambda hist: pd.to_datetime(
        hist['ctrlr_end_dt'].fillna('2019-01-01'), format='%Y-%m-%d'),
    ctrlr_start_dt=lambda hist: pd.to_datetime(
        hist['ctrlr_start_dt'], format='%Y-%m-%d'),
)

In [4]:
def curr_mine_data_merge(mine_level_agg, mine_df=None):
    """Left-join current mine attributes onto a frame keyed by ``mine_id``.

    Parameters
    ----------
    mine_level_agg : pandas.DataFrame
        Frame with a ``mine_id`` column. Every one of its rows is kept
        (left join); mines missing from the mine table get NaN attributes.
    mine_df : pandas.DataFrame, optional
        Mine table to take the attributes from. Defaults to the notebook-level
        ``mine_data`` frame, so existing one-argument calls behave as before.
        Passing it explicitly removes the dependency on hidden kernel state.

    Returns
    -------
    pandas.DataFrame
        ``mine_level_agg`` plus the selected mine columns. If a selected column
        (other than ``mine_id``) already exists in ``mine_level_agg``, the
        incoming copy is suffixed ``_a`` and the mine-table copy ``_m``.

    Raises
    ------
    KeyError
        If the mine table lacks any of the selected columns.
    """
    if mine_df is None:
        mine_df = mine_data

    # Only these attributes of the mine table are carried into the result.
    keep_cols = ['mine_id', 'curr_mine_nm', 'c_m_ind', 'mine_type_cd', 'curr_stat_cd',
                 'curr_stat_dt', 'curr_ctrlr_id', 'curr_ctrlr_nm', 'curr_oper_id',
                 'curr_oper_nm', 'state_abbr', 'fips_cnty_nm', 'curr_ownr_beg_dt']

    return pd.merge(mine_level_agg, mine_df[keep_cols],
                    how='left', on='mine_id', suffixes=('_a', '_m'))

In [8]:
def catod_mine_data_merge(delinquency_data, cntrlr_hist=None):
    """Match each delinquency to the controller period it falls inside.

    Controller-history rows are joined to delinquencies on
    ``oper_id == violator_id``; a pair is kept only when
    ``ctrlr_start_dt <= delinquent_date <= ctrlr_end_dt`` (both ends inclusive).
    (NOTE(review): "catod" presumably stands for "controller at time of
    delinquency" - confirm.)

    Parameters
    ----------
    delinquency_data : pandas.DataFrame
        Must provide ``violator_id``, ``ass_case_nbr``, ``curr_ctrlr_id``,
        ``mine_id``, ``delinquent_date`` and ``ending_balance``.
    cntrlr_hist : pandas.DataFrame, optional
        Controller history with ``oper_id``, ``ctrlr_id``, ``ctrlr_start_dt``,
        ``ctrlr_end_dt`` and ``oper_nm``. Defaults to the notebook-level
        ``cntrlr_hist_data`` frame, so existing one-argument calls behave as
        before. Passing it explicitly removes the dependency on hidden kernel
        state.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with the history columns renamed to a ``catod_``
        prefix. Delinquencies whose violator has no history row, or whose date
        lies outside every period for that operator, do not appear in the
        result. A delinquency covered by several distinct periods appears once
        per period.
    """
    if cntrlr_hist is None:
        cntrlr_hist = cntrlr_hist_data

    catod_del_merge = pd.merge(cntrlr_hist, delinquency_data, how='left',
                               left_on='oper_id', right_on='violator_id')

    # Date-window masks. History rows with no matching delinquency carry a missing
    # delinquent_date, which compares False and is therefore filtered out as well.
    gt_dates = catod_del_merge['delinquent_date'] >= catod_del_merge['ctrlr_start_dt']
    lt_dates = catod_del_merge['delinquent_date'] <= catod_del_merge['ctrlr_end_dt']
    catod_filtered = catod_del_merge[gt_dates & lt_dates]

    # Trim to the identifying columns before de-duplicating, so rows that differ
    # only in dropped columns collapse into one.
    catod_reduced = catod_filtered[['oper_id', 'ctrlr_id', 'ctrlr_start_dt', 'ctrlr_end_dt', 'oper_nm',
                                    'ass_case_nbr', 'curr_ctrlr_id', 'mine_id', 'delinquent_date',
                                    'ending_balance']].drop_duplicates()

    return catod_reduced.rename(index=str, columns={'oper_id': 'catod_oper_id',
                                                    'ctrlr_id': 'catod_ctrlr_id',
                                                    'ctrlr_start_dt': 'catod_ctrlr_start_dt',
                                                    'ctrlr_end_dt': 'catod_ctrlr_end_dt',
                                                    'oper_nm': 'catod_oper_nm'})

In [12]:
# Delinquency data: load, parse the delinquency date, derive calendar year and month.
delinquency_data = pd.read_csv(data_dir + 'debtbyage_20181205_REFINED.csv')
delinquency_data['Delinquent Date'] = pd.to_datetime(
    delinquency_data['Delinquent Date'], format='%m/%d/%Y')
delinquency_data['year'] = delinquency_data['Delinquent Date'].dt.year
delinquency_data['month'] = delinquency_data['Delinquent Date'].dt.month

# The source file is unreliable outside its delinquency fields, so keep only the
# delinquency-related columns here and rebuild the controller-at-time-of-delinquency
# and current-mine attributes with the joins at the bottom of this cell.
delinquency_data = (
    delinquency_data[['Assess Case Nbr', 'Mine ID', 'Violator ID', 'Violator Name',
                      'Violator Type', 'Controller ID', 'Controller Name', 'Age Category',
                      'Delinquent Date', 'Delinquent Type', 'Ending Balance', 'year', 'month']]
    .rename(index=str, columns={
        'Assess Case Nbr': 'ass_case_nbr',
        'Mine ID': 'mine_id',
        'Violator ID': 'violator_id',
        'Violator Name': 'violator_nm',
        'Violator Type': 'violator_type',
        'Controller ID': 'del_ctrlr_id',
        'Controller Name': 'del_ctrlr_nm',
        # NOTE(review): 'age_cateogry' is misspelled; left unchanged because code
        # outside this cell may reference the column by this exact name.
        'Age Category': 'age_cateogry',
        'Delinquent Date': 'delinquent_date',
        'Delinquent Type': 'delinquent_type',
        'Ending Balance': 'ending_balance',
    })
)

# Earliest delinquency per mine, exposed as a Series, a dict and a two-column frame.
earliest_series = delinquency_data.groupby('mine_id')['delinquent_date'].min()
earliest_dict = earliest_series.to_dict()
earliest_df = earliest_series.rename('earliest_date').reset_index()
# Round the earliest date up to the following year so that only years with a full
# twelve months of delinquency are counted.
earliest_df['earliest_year'] = earliest_df['earliest_date'].map(lambda ts: ts.year + 1)

# Per-mine summary: first/last delinquency date, total balance and record count.
mine_del_summary = (
    delinquency_data
    .groupby('mine_id')
    .agg({'delinquent_date': ['max', 'min'],
          'ending_balance': ['sum', 'count']})
    .my_flatten_cols()
    .rename(index=str, columns={'delinquent_date_max': 'latest_del_date',
                                'delinquent_date_min': 'earliest_del_date',
                                'ending_balance_sum': 'total_due',
                                'ending_balance_count': 'num_del_records'})
)

# 1) attach current mine attributes, 2) match each delinquency to the controller
# period containing its date, 3) re-attach current mine attributes to the reduced frame.
del_data_curr_join = curr_mine_data_merge(delinquency_data)
del_data_curr_catod_reduced = catod_mine_data_merge(del_data_curr_join)
del_data_curr_catod = curr_mine_data_merge(del_data_curr_catod_reduced)

In [24]:
print(del_data_curr_catod.columns)
# Two views of the same controller IDs:
#   responsible - mines whose current controller is one of the IDs
#                 ('curr_ctrlr_id_a' is the copy carried through the earlier merges);
#   originated  - delinquencies whose matched controller period belongs to one of the IDs.
# NOTE(review): presumably both IDs identify the Justice family controller - confirm.
justice_family_responsible = del_data_curr_catod.loc[
    del_data_curr_catod['curr_ctrlr_id_a'].isin(['C04355', '0091855'])
]
justice_family_originated = del_data_curr_catod.loc[
    del_data_curr_catod['catod_ctrlr_id'].isin(['C04355', '0091855'])
]
# Optional export of the "responsible" subset (disabled):
#justice_family_responsible.to_csv(data_dir + 'analysis/justice/20190325-justice-tester-curr-catod-v2.csv')
# Total ending balance for each view.
print(justice_family_responsible['ending_balance'].sum())
print(justice_family_originated['ending_balance'].sum())

Index(['catod_oper_id', 'catod_ctrlr_id', 'catod_ctrlr_start_dt',
       'catod_ctrlr_end_dt', 'catod_oper_nm', 'ass_case_nbr',
       'curr_ctrlr_id_a', 'mine_id', 'delinquent_date', 'ending_balance',
       'curr_mine_nm', 'c_m_ind', 'mine_type_cd', 'curr_stat_cd',
       'curr_stat_dt', 'curr_ctrlr_id_m', 'curr_ctrlr_nm', 'curr_oper_id',
       'curr_oper_nm', 'state_abbr', 'fips_cnty_nm', 'curr_ownr_beg_dt'],
      dtype='object')
4782488.26
5647431.859999999
