In [1]:
# Upgrade pandas in the kernel's environment; the conversion functions below call DataFrame.to_numpy.
%pip install --upgrade pandas

Requirement already up-to-date: pandas in /usr/local/lib/python3.5/dist-packages (0.24.2)
[33mYou are using pip version 19.0.3, however version 19.2.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import glob
import gc

In [3]:
def convertAcq(acq_filename, includeLoanID = False):
    """Load the acquisition parquet and return its categorical columns as a 2-D numpy array.

    The frame is ordered by ``loan_id`` (index reset) and every missing value
    is replaced by 0 before the columns are extracted.

    Parameters
    ----------
    acq_filename : str
        Path handed to ``pd.read_parquet``.
    includeLoanID : bool, default False
        When True, ``loan_id`` is prepended as the first column and the array
        keeps whatever dtype pandas infers. When False, only the feature
        columns are returned, cast to ``np.int32``.

    Returns
    -------
    numpy.ndarray
        One row per row of the parquet file, in ascending ``loan_id`` order.
    """
    feature_columns = [
        'acq_def_ind',
        'state_id',
        'purpose_id',
        'mi_type_id',
        'occupancy_status_id',
        'product_type_id',
        'property_type_id',
        'seller_id',
        'zip3_id',
    ]

    frame = pd.read_parquet(acq_filename)
    frame = frame.sort_values('loan_id')
    frame = frame.reset_index(drop=True)
    frame = frame.fillna(0)

    if not includeLoanID:
        return frame[feature_columns].to_numpy(dtype=np.int32)

    # No dtype is forced here, so loan_id is not squeezed into int32.
    return frame[['loan_id'] + feature_columns].to_numpy()

In [4]:
def convertSeq(files, includeLoanID = False):
    """Convert sequence parquet chunks to numpy arrays plus a per-loan row index.

    Every file name is expected to end in ``_<integer>`` (the chunk number).
    Chunks are processed in ascending chunk-number order, and the position in
    that order is used both as the index into the returned list and as
    ``chunk_id`` in the index frame. For chunk numbers ``0..n-1`` this equals
    the chunk number; gaps in the numbering no longer leave ``None`` holes or
    raise ``IndexError``.

    For each chunk, rows with ``dlq`` greater than ``6 + 12`` are dropped and
    the remainder is sorted by ``loan_id`` then ``yyyymm``. A chunk that ends
    up empty yields an array with zero rows and contributes no index entries.

    Parameters
    ----------
    files : list of str
        Paths of the sequence parquet chunks.
    includeLoanID : bool, default False
        When True, ``loan_id`` is the first column of every chunk array and
        the arrays are ``float64``; otherwise they are ``float32`` without it.

    Returns
    -------
    lid_to_seq_idx : pandas.DataFrame
        Indexed by ``loan_id`` (ascending) with columns ``chunk_id``,
        ``seq_idx_begin`` and ``seq_idx_end``. Rows
        ``seq_idx_begin:seq_idx_end`` (end exclusive) of
        ``seq_numpy[chunk_id]`` belong to that loan.
    seq_numpy : list of numpy.ndarray
        One 2-D array per chunk.

    Raises
    ------
    ValueError
        If ``files`` is empty or a file name does not end in an integer.
    """
    if len(files) == 0:
        raise ValueError('convertSeq needs at least one sequence parquet file')

    # (chunk number, path) pairs ordered numerically, so "_10" follows "_9".
    numbered_files = sorted(
        (int(fname.split('/')[-1].split('.')[0].split('_')[-1]), fname)
        for fname in files
    )

    # The original note on this threshold reads "one year".
    # NOTE(review): presumably 6 + a 12-month horizon -- confirm the intent.
    max_dlq = 6 + 12

    if includeLoanID:
        columns = ['loan_id', 'default_1y', 'yyyymm', 'dlq_adj', 'age', 'int_rate', 'current_upb_norm']
        out_dtype = np.float64
    else:
        columns = ['default_1y', 'yyyymm', 'dlq_adj', 'age', 'int_rate', 'current_upb_norm']
        out_dtype = np.float32

    # Per-chunk frames mapping loan id -> (chunk, first row, one-past-last row).
    lid_to_seq_idx = []
    seq_numpy = []

    for chunk_id, (_, fname) in enumerate(numbered_files):
        print('processing name: {}'.format(fname))
        seq = pd.read_parquet(fname)
        seq = seq[seq.dlq <= max_dlq]
        seq = seq.sort_values(['loan_id', 'yyyymm']).reset_index(drop=True)
        lid = seq.loan_id.to_numpy()

        if len(lid):
            # Positions where the (sorted) loan id changes start a new loan.
            breaks = np.flatnonzero(lid[:-1] != lid[1:]) + 1
            begins = np.concatenate((np.array([0]), breaks))
            ends = np.concatenate((breaks, np.array([len(lid)])))
        else:
            # Nothing survived the filter: no loans, so no boundaries to index.
            begins = ends = np.array([], dtype=np.int64)

        lid_to_seq_idx.append(pd.DataFrame({
            'loan_id': lid[begins],
            'chunk_id': np.full(len(begins), chunk_id, dtype=np.int64),
            'seq_idx_begin': begins,
            'seq_idx_end': ends,
        }))

        seq_numpy.append(seq[columns].to_numpy(dtype=out_dtype))

        # Release the chunk's frame before the next one is read.
        del seq
        gc.collect()

    print('concatenating lid_to_seq_idx')
    lid_to_seq_idx = pd.concat(lid_to_seq_idx).sort_values('loan_id').set_index('loan_id', drop=True)

    return lid_to_seq_idx, seq_numpy

In [5]:
def convertDataset(data_path):
    """Convert the parquet inputs found under ``data_path`` into ``.npy`` files in that directory.

    Reads ``<data_path>/fnm_input_acq_parquet`` through ``convertAcq`` and
    every path matching ``<data_path>/fnm_input_seq_parquet*`` through
    ``convertSeq``, then writes:

    * ``fnm_input_acq.npy``        -- the acquisition array,
    * ``fnm_input_idx_to_seq.npy`` -- the ``chunk_id`` / ``seq_idx_begin`` /
      ``seq_idx_end`` columns of the index frame returned by ``convertSeq``,
    * ``fnm_input_seq_<k>.npy``    -- one file per entry ``k`` of the chunk list.

    All arrays are saved with pickling disabled. Progress and the resolved
    paths are printed to stdout.

    NOTE(review): the acquisition rows and the index rows are each ordered by
    loan_id, but they only line up one-to-one if every acquisition loan still
    has sequence rows after the filtering done in ``convertSeq`` -- confirm
    that downstream code does not rely on this without checking.

    Parameters
    ----------
    data_path : str
        Directory holding the parquet inputs; also receives the outputs.
    """
    # Path suffixes are kept separate from data_path so that only the suffix
    # is ever passed through str.format.
    acq_parquet_suffix = '/fnm_input_acq_parquet'
    seq_parquet_suffix = '/fnm_input_seq_parquet*'
    acq_npy_suffix = '/fnm_input_acq.npy'
    seq_npy_suffix = '/fnm_input_seq_{}.npy'
    idx_npy_suffix = '/fnm_input_idx_to_seq.npy'

    acq_parquet_path = data_path + acq_parquet_suffix
    seq_parquet_pattern = data_path + seq_parquet_suffix

    print('Data path: {}'.format(data_path))
    print('Acquistion parquet: {}'.format(acq_parquet_path))
    print('Sequence parquet: {}'.format(seq_parquet_pattern))

    seq_files = sorted(glob.glob(seq_parquet_pattern, recursive=False))
    for chunk_path in seq_files:
        print('\tSequence chunk found: {}'.format(chunk_path))

    print('Acquisition numpy: {}'.format(data_path + acq_npy_suffix))
    print('Sequence numpy: {}'.format(data_path + seq_npy_suffix))
    print('Index to Sequence Index numpy: {}'.format(data_path + idx_npy_suffix))

    acq_numpy = convertAcq(acq_parquet_path)
    lid_to_seq_idx, seq_numpy = convertSeq(seq_files, includeLoanID = False)
    idx_to_seq = lid_to_seq_idx[['chunk_id', 'seq_idx_begin', 'seq_idx_end']].to_numpy()

    save_options = dict(allow_pickle=False, fix_imports=False)
    np.save(data_path + acq_npy_suffix, acq_numpy, **save_options)
    np.save(data_path + idx_npy_suffix, idx_to_seq, **save_options)
    for chunk_idx, chunk_array in enumerate(seq_numpy):
        np.save(data_path + seq_npy_suffix.format(chunk_idx), chunk_array, **save_options)

In [7]:
# Run the conversion for the `test` data directory; the .npy outputs are written next to the parquet inputs.
convertDataset(data_path = '/home/user/notebooks/data/test')

Data path: /home/user/notebooks/data/test
Acquistion parquet: /home/user/notebooks/data/test/fnm_input_acq_parquet
Sequence parquet: /home/user/notebooks/data/test/fnm_input_seq_parquet*
	Sequence chunk found: /home/user/notebooks/data/test/fnm_input_seq_parquet_0
Acquisition numpy: /home/user/notebooks/data/test/fnm_input_acq.npy
Sequence numpy: /home/user/notebooks/data/test/fnm_input_seq_{}.npy
Index to Sequence Index numpy: /home/user/notebooks/data/test/fnm_input_idx_to_seq.npy
processing name: /home/user/notebooks/data/test/fnm_input_seq_parquet_0
concatenating lid_to_seq_idx


In [8]:
# Run the conversion for the `train` data directory; the .npy outputs are written next to the parquet inputs.
convertDataset(data_path = '/home/user/notebooks/data/train')

Data path: /home/user/notebooks/data/train
Acquistion parquet: /home/user/notebooks/data/train/fnm_input_acq_parquet
Sequence parquet: /home/user/notebooks/data/train/fnm_input_seq_parquet*
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_0
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_1
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_2
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_3
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_4
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_5
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_6
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_7
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_8
	Sequence chunk found: /home/user/notebooks/data/train/fnm_input_seq_parquet_9
Acquisition numpy: /

In [9]:
# Run the conversion for the `valid` data directory; the .npy outputs are written next to the parquet inputs.
convertDataset(data_path = '/home/user/notebooks/data/valid')

Data path: /home/user/notebooks/data/valid
Acquistion parquet: /home/user/notebooks/data/valid/fnm_input_acq_parquet
Sequence parquet: /home/user/notebooks/data/valid/fnm_input_seq_parquet*
	Sequence chunk found: /home/user/notebooks/data/valid/fnm_input_seq_parquet_0
Acquisition numpy: /home/user/notebooks/data/valid/fnm_input_acq.npy
Sequence numpy: /home/user/notebooks/data/valid/fnm_input_seq_{}.npy
Index to Sequence Index numpy: /home/user/notebooks/data/valid/fnm_input_idx_to_seq.npy
processing name: /home/user/notebooks/data/valid/fnm_input_seq_parquet_0
concatenating lid_to_seq_idx
