# Preparing libranzas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, timedelta 
from dateutil import relativedelta
import s3fs
import pyarrow.parquet as pq
import os
import calendar
import gc


# Show every column and row when a frame is displayed.
# Fully-qualified option names are required: the short patterns 'max_columns' and
# 'max_rows' also match the styler.render.* options in newer pandas and are rejected
# there as ambiguous.
# NOTE(review): with max_rows unlimited, displaying a large frame without .head()
# will try to render every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

In [2]:
# Output prefix for the parquet objects produced by this notebook.
BASE_PATH_OUT_OBJECTS_LIBRANZA = 's3://adl-refined-dev-popular/parquet/TC_adquisicion/'

# Location of the original libranzas CSV extract: S3 prefix, file name, and the
# full path obtained by joining the two.
BASE_PATH_LIBRANZA = 's3://adl-refined-dev-popular/data_orig/libranzas/'
LIBRANZA_FILE_NAME = 'libranzas_cnl_M202003.csv'
libranza_file_path = '{}{}'.format(BASE_PATH_LIBRANZA, LIBRANZA_FILE_NAME)

In [3]:
# Zero-padded month labels '01'..'12'.
month_number = [str(month).zfill(2) for month in range(1, 13)]
# Prediction periods as 'YYYY-MM' strings, covering every month of 2018 and then 2019.
anio_mes_strs = ['{}-{}'.format(year, month) for year in (2018, 2019) for month in month_number]

In [11]:
import s3fs
import pyarrow.parquet as pq

# Read the libranzas parquet dataset from S3 and materialise it as a pandas frame.
fs = s3fs.S3FileSystem()

input_path = 's3://adl-refined-dev-popular/data_parquet/productos/libranzas/productos_libranzas_dwh_M202003'
# The path is passed directly; the previous `input = input_path` alias shadowed the
# builtin input() for the rest of the session.
dataset = pq.ParquetDataset(input_path, filesystem=fs)
lib = dataset.read().to_pandas()

# The dataset handle is not needed once the frame exists.
del dataset

In [23]:
# `lib` is the module-level libranzas frame that the following cells read and reassign.
# Alternative source (disabled): the CSV at `libranza_file_path`, read with sep=';',
# encoding='latin-1', na_values='' and dtype={'id_cliente': str}.

# Lower-case every column name so later cells can use one naming convention.
lib = lib.rename(columns=str.lower)
lib_original_cols = lib.columns
print('Libranza original shape: ', lib.shape)

# Drop rows without a client id. The .copy() makes the filtered frame independent so
# the column assignment below does not raise SettingWithCopyWarning.
missing_id = lib['id_cliente'].isnull()
if missing_id.any():
    lib = lib.loc[~missing_id].copy()

# Cast to an explicit 64-bit integer: the ids have 18 digits, and plain `int` maps to a
# platform-dependent width that can be 32 bits.
lib['id_cliente'] = lib['id_cliente'].astype('int64')

Libranza original shape:  (3015025, 66)


In [24]:
# Preview the first rows of the loan-level frame.
lib.head()

Unnamed: 0,sk_cliente,id_cliente,sk_rc_libranza,id_rc_libranza,sk_fe_radicacion,sk_fe_aprobacion,sk_fe_desembolso,sk_fe_finalizacion,sk_fe_estado_actual,sk_convenio_libranza,sk_persona,dk_persona,sk_oficina,sk_producto_servicio,sk_clasificacion_tradicional,cd_oficina,cd_originador,ds_originador,cd_segmento_lib,ds_segmento_lib,cd_subsegmento_lib,ds_subsegmento_lib,no_solicitud,fe_solicitud,vl_monto_solicitado,mp_reestructurado,no_obligacion,fe_desembolso,vl_monto_aprobado,vl_recapitalizacion,vl_monto_desembolsado,no_cuotas,fe_finalizacion,no_meses_per_gracia,vl_total_cuota,fe_exigibilidad_cuota_1,cd_estado_actual,ds_estado_actual,fe_estado_actual,vl_tasa,vl_tasa_efectiva_anual,cd_modalidad_pag_int,ds_modalidad_pag_int,cd_periodicidad_pag_int,ds_periodicidad_pag_int,cd_base_liquidacion,cd_tipo_libranza,ds_tipo_libranza,cd_tipo_credito,ds_tipo_credito,cd_tipo_colocacion,ds_tipo_colocacion,no_obligacion_novada,vl_monto_novado,mp_compra_ind_cartera,mp_afecta_desp_nomina,cd_linea_credito,fe_carga,fe_actua,no_particion,cd_sector,cd_subsector,cd_tipo_pagaduria,ds_sector,ds_subsector,ds_tipo_pagaduria
0,-1,-1,3141535,1000003*1056302386*LB*0000600400710427,20150318,-1,20150318,20230910,-1,-1,-1,1000003*1056302386,-1,27,16,729,4,Direccion Carteras Adquiridas,6.0,Compras de Cartera,2.0,Originar y el Cedro en Firme,0,2015-03-18,24974584.0,0,600400710427,2015-03-18,24974584.0,0.0,24974584.0,99,,0,450701.0,,,,,13.8,,V,Vencido,M,Mensual,360,2,Inorganica,1.0,Nueva,,,0,0.0,0.0,0.0,68,2019-04-12 05:59:34,,0,-1,-1,-1,-,-,-
1,-1,-1,3138492,1000003*5820463*LB*0000000000059678,20160816,-1,20160816,20230905,-1,-1,-1,1000003*5820463,-1,27,17,803,2,Origina Soluciones,6.0,Compras de Cartera,3.0,Originar y el Cedro en Flujo,0,2016-08-16,21891261.0,0,59678,2016-08-16,21891261.0,0.0,21891261.0,82,,0,590743.0,,,,,25.2,,V,Vencido,M,Mensual,360,2,Inorganica,1.0,Nueva,,,0,0.0,0.0,0.0,69,2019-04-12 05:59:34,,0,-1,-1,-1,-,-,-
2,-1,-1,3144537,1000003*2356431*LB*0000000001520731,20151210,-1,20151210,20201130,-1,-1,-1,1000003*2356431,-1,27,17,807,3,El Cedro,6.0,Compras de Cartera,3.0,Originar y el Cedro en Flujo,0,2015-10-12,5742496.0,0,1520731,2015-10-12,5742496.0,0.0,5742496.0,34,,0,181749.0,,,,,23.94,,V,Vencido,M,Mensual,360,2,Inorganica,1.0,Nueva,,,0,0.0,0.0,0.0,69,2019-04-12 05:59:34,,0,-1,-1,-1,-,-,-
3,-1,-1,3144505,1000003*17048613*LB*0000000016200288,20160929,-1,20160929,20190830,-1,-1,-1,1000003*17048613,-1,27,17,807,3,El Cedro,6.0,Compras de Cartera,3.0,Originar y el Cedro en Flujo,0,2016-09-29,3135285.0,0,16200288,2016-09-29,3135285.0,0.0,3135285.0,19,,0,136347.0,,,,,25.6809,,V,Vencido,M,Mensual,360,2,Inorganica,1.0,Nueva,,,0,0.0,0.0,0.0,69,2019-04-12 05:59:34,,0,-1,-1,-1,-,-,-
4,-1,-1,3145139,1000003*16633503*LB*0000000801007975,20160527,-1,20160527,20250805,-1,-1,-1,1000003*16633503,-1,27,17,803,2,Origina Soluciones,6.0,Compras de Cartera,3.0,Originar y el Cedro en Flujo,0,2016-05-27,24336067.0,0,801007975,2016-05-27,24336067.0,0.0,24336067.0,105,,0,568578.0,,,,,22.8,,V,Vencido,M,Mensual,360,2,Inorganica,1.0,Nueva,,,0,0.0,0.0,0.0,69,2019-04-12 05:59:34,,0,-1,-1,-1,-,-,-


In [14]:
# Number of loans in each current state (ds_estado_actual).
lib['ds_estado_actual'].value_counts()

Cancelada     2590935
En Cartera     363596
Castigado       53882
                 6612
Name: ds_estado_actual, dtype: int64

In [15]:
# Columns discarded in the first cleaning stage. The variable name labels them as
# constant-valued; that is not checked in this cell.
lib_constant_cols = [
    'sk_producto_servicio',
    'cd_modalidad_pag_int',
    'ds_modalidad_pag_int',
    'cd_periodicidad_pag_int',
    'ds_periodicidad_pag_int',
    'cd_base_liquidacion',
    'no_obligacion_novada',
]

lib = lib.drop(columns=lib_constant_cols)

In [16]:
# Keep only rows whose libranza type is 'Organica' and whose credit type is present.
is_organica = lib['ds_tipo_libranza'] == 'Organica'
has_tipo_credito = lib['ds_tipo_credito'].notnull()
lib = lib.loc[is_organica & has_tipo_credito]

In [25]:
# Parse the request, disbursement and end dates from 'YYYY-MM-DD' text.
# errors='coerce' turns anything that does not parse into NaT instead of raising.
lib_date_cols = ['fe_solicitud', 'fe_desembolso', 'fe_finalizacion']
lib = lib.assign(**{
    date_col: pd.to_datetime(lib[date_col], format='%Y-%m-%d', errors='coerce')
    for date_col in lib_date_cols
})

In [11]:
## Time filter: bounds of the request-date (fe_solicitud) window that is kept
begining_training_period = '2006-01-01' # TODO: verify if this period is enough
# NOTE(review): the upper bound is 30 Dec, so requests dated 2019-12-31 fall outside
# the window — confirm this is intended.
ending_training_period = '2019-12-30'

In [12]:
# Keep loans requested inside the window, both bounds inclusive. Rows whose
# fe_solicitud is NaT never satisfy the comparison, so they are dropped here too.
period_filter = lib['fe_solicitud'].between(begining_training_period, ending_training_period)
lib = lib.loc[period_filter]

In [13]:
# Columns carried forward into feature building; everything else is discarded.
lib_relevant_cols = [
    # identifiers
    'id_cliente', 'sk_rc_libranza',
    # dates and amounts
    'fe_solicitud', 'vl_monto_solicitado', 'fe_desembolso', 'fe_finalizacion',
    'vl_monto_aprobado', 'no_obligacion', 'vl_monto_desembolsado',
    # loan terms and state
    'no_cuotas', 'vl_total_cuota', 'ds_estado_actual',
    'vl_tasa', 'ds_tipo_credito', 'ds_tipo_libranza', 'vl_monto_novado',
    # payer descriptors
    'ds_sector', 'ds_subsector', 'ds_tipo_pagaduria',
]

lib = lib.loc[:, lib_relevant_cols]

In [14]:
# Where the request date is missing, fall back to the disbursement date.
# NOTE(review): the period filter applied earlier compares fe_solicitud against date
# bounds, which already removes rows with a missing request date, so this fallback
# may never match — confirm the intended order of the two steps.
lib['fe_solicitud'] = lib['fe_solicitud'].fillna(lib['fe_desembolso'])

In [15]:
# Derived loan-level columns used by the per-client aggregation.

# Reference "now" for the recency column. datetime.today() replaces pd.datetime.today():
# the pd.datetime alias was deprecated and later removed from pandas.
# NOTE(review): años_ult_lib is measured from the run date, not from each prediction
# period's cutoff, so it changes between runs and is the same for every period —
# confirm that this is the intended definition.
hoy = datetime.today()

one_day = np.timedelta64(1, 'D')

# Years elapsed since disbursement, using a 365-day year.
lib['años_ult_lib'] = (hoy - lib['fe_desembolso']) / (one_day * 365)

# Days between request and disbursement (previously computed twice in a row).
lib['dias_procesamiento'] = (lib['fe_desembolso'] - lib['fe_solicitud']) / one_day

# this is a necessary fix because there are many dates with fecha de solicitud in 1900:
# processing times above 360 days are replaced by the median, which is computed
# before the replacement and therefore still includes those values.
implausible_processing = lib['dias_procesamiento'] > 360
lib.loc[implausible_processing, 'dias_procesamiento'] = lib['dias_procesamiento'].median()

# Share of the approved amount that was actually disbursed.
lib['prop_desemb_vs_aprobado'] = lib['vl_monto_desembolsado'] / lib['vl_monto_aprobado']

  


In [16]:
# Preview the frame including the derived columns
# (años_ult_lib, dias_procesamiento, prop_desemb_vs_aprobado).
lib.head()

Unnamed: 0,id_cliente,sk_rc_libranza,fe_solicitud,vl_monto_solicitado,fe_desembolso,fe_finalizacion,vl_monto_aprobado,no_obligacion,vl_monto_desembolsado,no_cuotas,vl_total_cuota,ds_estado_actual,vl_tasa,ds_tipo_credito,ds_tipo_libranza,vl_monto_novado,ds_sector,ds_subsector,ds_tipo_pagaduria,años_ult_lib,dias_procesamiento,prop_desemb_vs_aprobado
2944,102652303931898101,2045943,2015-02-11,52600000.0,2015-02-17,2022-11-05,52600000.0,58703090011202,3146843.0,92,896875.0,Cancelada,12.0,Novacion,Organica,49053545.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,5.349489,6.0,0.059826
2945,102652303931898101,1030405,2007-10-02,9700000.0,2007-10-11,2012-12-05,9700000.0,58703090000278,2765114.0,60,240028.0,Cancelada,16.8,Novacion,Organica,6344836.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,12.708393,9.0,0.285063
2946,102652303931898101,1237429,2010-10-25,25000000.0,2010-10-29,2016-12-05,23720000.0,58703010174230,7500676.0,72,507763.0,Cancelada,15.48,Novacion,Organica,14686428.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,9.656339,4.0,0.316217
2948,102652303931898101,2155144,2012-09-07,26600000.0,2012-09-11,2018-10-05,26600000.0,56503010073050,2322917.0,72,528372.0,Cancelada,12.6,Novacion,Organica,22786203.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,7.785106,4.0,0.087328
2949,102652303931898101,1375772,2011-03-09,26400000.0,2011-03-10,2017-04-05,26400000.0,58703010183266,1666006.0,72,565133.0,Cancelada,15.48,Novacion,Organica,23156246.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,9.294695,1.0,0.063106


In [17]:
def prev_months_last_date(anio_mes_str):
    '''Return the last day of the month preceding a prediction period.

    Parameters
    ----------
    anio_mes_str : str
        Prediction period formatted as 'YYYY-MM', e.g. '2018-01'.

    Returns
    -------
    datetime.datetime
        Midnight of the last calendar day of the previous month
        ('2018-01' -> 2017-12-31, '2020-03' -> 2020-02-29).

    Raises
    ------
    ValueError
        If the string is not two integers separated by a single '-', or the month is
        outside 1..12. Month 13 used to be accepted silently and mapped to
        31 December of the same year.
    '''
    anio, mes = anio_mes_str.split('-')
    # Building the first day of the period validates year and month in one step.
    first_day_of_period = datetime(int(anio), int(mes), 1)
    # One day before the first of a month is the last day of the previous month;
    # the year rollover for January is handled by the date arithmetic.
    return first_day_of_period - timedelta(days=1)

In [18]:
def get_grouped_df(anio_mes_str):
    '''Group, by client, the loans disbursed up to the period's cutoff date.

    The cutoff is the last day of the month before `anio_mes_str` ('YYYY-MM').
    Rows of the module-level `lib` frame with `fe_desembolso` on or before the cutoff
    are sorted by client and disbursement date, both descending, so the first row of
    each group is that client's most recent disbursement. Only the groupby object is
    returned; the filtered frame itself is not.
    '''
    cutoff_date = prev_months_last_date(anio_mes_str)
    # Only history available before the predicting period is considered.
    disbursed_by_cutoff = lib['fe_desembolso'] <= cutoff_date
    history = lib.loc[disbursed_by_cutoff].sort_values(
        ['id_cliente', 'fe_desembolso'], ascending=False
    )
    return history.groupby('id_cliente')

In [19]:
def create_lib_df(gp):
    '''Aggregate a per-client groupby of loans into one feature row per client.

    Parameters
    ----------
    gp : pandas DataFrameGroupBy
        Loans grouped by the ``id_cliente`` column (as built by ``get_grouped_df``).
        The ``*_ultlibranza`` columns are read from the first row of every group, so
        the rows of each group must already be ordered most-recent-first.

    Returns
    -------
    pandas.DataFrame
        One row per client: ``id_cliente`` followed by the loan count, the numeric
        aggregates, ``n_novaciones`` and the three ``*_ultlibranza`` descriptors.
        ``n_novaciones`` is 0 for clients without any 'Novacion' credit, including the
        case where no loan in the whole input is a 'Novacion' (which used to raise
        ``KeyError``).
    '''
    # number of records for each cliente; this also fixes the output index (one entry
    # per group key) without building the gp.groups dictionary
    lib_out = gp.size().to_frame('num_lib_solicitadas')

    # (output column, source column, aggregation), in output column order
    numeric_features = [
        # these have a very close relation
        ('prom_monto_solicitado', 'vl_monto_solicitado', 'mean'),
        ('prom_monto_aprobado', 'vl_monto_aprobado', 'mean'),
        ('prom_monto_desembolsado', 'vl_monto_desembolsado', 'mean'),
        ('prom_monto_novado', 'vl_monto_novado', 'mean'),
        # minimum years since last product
        ('min_anos_ult_lib', 'años_ult_lib', 'min'),
        # other averages
        ('prom_prop_desembolso', 'prop_desemb_vs_aprobado', 'mean'),
        ('prom_n_cuotas', 'no_cuotas', 'mean'),
        ('prom_valor_cuota', 'vl_total_cuota', 'mean'),
        ('prom_tasa', 'vl_tasa', 'mean'),
        # we have to choose one of these two...
        ('prom_dias_procesamiento', 'dias_procesamiento', 'mean'),
        ('max_dias_procesamiento', 'dias_procesamiento', 'max'),
    ]
    for out_col, src_col, how in numeric_features:
        lib_out[out_col] = gp[src_col].agg(how)

    # counting novacion events within the historical period
    credit_type_counts = gp['ds_tipo_credito'].value_counts().unstack(fill_value=0)
    if 'Novacion' in credit_type_counts.columns:
        lib_out['n_novaciones'] = (
            credit_type_counts['Novacion']
            .reindex(lib_out.index, fill_value=0)
            .astype(int)
        )
    else:
        # no 'Novacion' anywhere in this slice of history
        lib_out['n_novaciones'] = 0

    # categorical columns: descriptors of the first row of each group. head(1) keeps
    # that row as-is (null values included) and replaces three groupby-apply passes
    # that called a Python lambda once per client.
    last_loan = gp.head(1).set_index('id_cliente')
    lib_out['sector_ultlibranza'] = last_loan['ds_sector']
    lib_out['subsector_ultlibranza'] = last_loan['ds_subsector']
    lib_out['tipo_pagaduria_ultlibranza'] = last_loan['ds_tipo_pagaduria']

    return lib_out.rename_axis('id_cliente').reset_index()

In [20]:
def get_lib_df(anio_mes_str):
    '''Build the per-client feature frame for one prediction period.

    Chains `get_grouped_df` (history up to the period's cutoff, grouped by client)
    into `create_lib_df` (per-client aggregation) and returns the resulting frame.
    '''
    return create_lib_df(get_grouped_df(anio_mes_str))

In [21]:
# Build the feature frame of every prediction period and stack them into `lib_df`.
# Frames are collected in a list and concatenated once at the end; concatenating inside
# the loop re-copied the whole accumulated frame on every iteration.
period_frames = []
for x in anio_mes_strs:
    period_df = get_lib_df(x)
    # tag each row with the period it was computed for
    period_df['periodo'] = x
    period_frames.append(period_df)

    # progress trace: one line per finished period
    print(x)

lib_df = pd.concat(period_frames, ignore_index=True)

2018-01
2018-02
2018-03
2018-04
2018-05
2018-06
2018-07
2018-08
2018-09
2018-10
2018-12
2019-01
2019-02
2019-03
2019-04
2019-05
2019-06
2019-07
2019-08
2019-09
2019-11
2019-12


In [22]:

# Sanity check of the 'Novacion' count expression used in create_lib_df, evaluated here
# on the whole `lib` frame (no cutoff-date filter applied).
gp_p = lib.groupby('id_cliente')
gp_p['ds_tipo_credito'].value_counts().unstack()['Novacion'].fillna(0).astype(int).head(10)

id_cliente
101052294884460201    0
101052314032556001    0
101055260548785601    3
101055260650910902    3
101055260683554401    1
101055260720859701    1
101055266255810701    0
101055266367157902    2
101055266603150201    2
101055266686923002    3
Name: Novacion, dtype: int64

In [23]:
# Per-client counts of each credit type in long form, i.e. before unstacking.
gp_p['ds_tipo_credito'].value_counts().head(10)

id_cliente          ds_tipo_credito
101052294884460201  Nueva              2
101052314032556001  Nueva              1
101055260548785601  Novacion           3
                    Nueva              1
101055260650910902  Novacion           3
                    Nueva              1
101055260683554401  Novacion           1
                    Nueva              1
101055260720859701  Novacion           1
                    Nueva              1
Name: ds_tipo_credito, dtype: int64

In [24]:
# Inspect individual loans, sorted descending by client id and then by request date.
lib.sort_values(['id_cliente','fe_solicitud'], ascending=False).head(20)

Unnamed: 0,id_cliente,sk_rc_libranza,fe_solicitud,vl_monto_solicitado,fe_desembolso,fe_finalizacion,vl_monto_aprobado,no_obligacion,vl_monto_desembolsado,no_cuotas,vl_total_cuota,ds_estado_actual,vl_tasa,ds_tipo_credito,ds_tipo_libranza,vl_monto_novado,ds_sector,ds_subsector,ds_tipo_pagaduria,años_ult_lib,dias_procesamiento,prop_desemb_vs_aprobado
2837214,999955693775109001,841646,2008-04-16,16000000.0,2008-04-17,2013-05-05,16000000.0,18003010294628,15215387.0,60,476588.0,Cancelada,25.74,Nueva,Organica,0.0,PENSIONADOS,Caja De retiro de Las FFMM,NACIONAL CENTRALIZADA,12.190585,1.0,0.950962
2577994,999955266582332102,1423192,2011-02-03,34300000.0,2011-02-08,2018-01-08,34270000.0,1203010063300,10748344.0,84,780730.0,Cancelada,20.95,Novacion,Organica,21462745.0,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,9.376887,5.0,0.313637
2577993,999955266582332102,1645223,2009-07-14,27000000.0,2009-07-24,2014-07-05,27000000.0,1203010051965,21754521.0,60,785221.0,Cancelada,24.54,Novacion,Organica,3872778.0,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,10.922092,10.0,0.805723
2577991,999955266582332102,842517,2007-04-19,5570000.0,2007-04-30,2012-05-05,5570000.0,4203010024491,4517765.0,60,150061.0,Cancelada,20.8,Novacion,Organica,766300.0,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,13.157708,11.0,0.811089
2358040,999955266450872402,1761673,2014-06-27,29100000.0,2014-06-27,2021-07-05,29100000.0,35303070001146,28937310.0,84,533773.0,Cancelada,12.0,Nueva,Organica,0.0,ENTES TERRITORIALES,Fondos Educativos,NACIONAL CENTRALIZADA,5.993325,0.0,0.994409
2358041,999955266450872402,1405539,2010-11-10,12000000.0,2010-11-12,2016-12-05,12000000.0,36303010059187,5050873.0,72,272880.0,Cancelada,17.88,Novacion,Organica,6184277.0,ENTES TERRITORIALES,Fondos Educativos,NACIONAL CENTRALIZADA,9.617982,2.0,0.420906
2358039,999955266450872402,912371,2008-01-29,10000000.0,2008-01-30,2013-03-05,10000000.0,36303010037166,9389358.0,60,289773.0,Cancelada,24.36,Nueva,Organica,0.0,ENTES TERRITORIALES,Fondos Educativos,NACIONAL CENTRALIZADA,12.404284,1.0,0.938936
2662722,999955266207457702,2059230,2014-02-17,31500000.0,2014-02-28,2021-11-05,31500000.0,8803400000086,4701647.0,92,595403.0,Castigado,12.0,Novacion,Organica,26588541.0,PENSIONADOS,Otros Pensionados,NACIONAL CENTRALIZADA,6.319352,11.0,0.149259
2662721,999955266207457702,2200877,2014-01-20,26800000.0,2014-01-31,2021-10-05,26800000.0,8803350000886,21616129.0,92,499250.0,Cancelada,15.48,Nueva,Organica,0.0,PENSIONADOS,Otros Pensionados,NACIONAL CENTRALIZADA,6.396065,11.0,0.806572
2643727,999955260800181302,1748866,2013-04-12,6400000.0,2013-04-15,2017-11-05,6400000.0,28003480000066,6045369.0,54,161329.0,Cancelada,14.28,Nueva,Organica,0.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,7.193325,3.0,0.944589


In [25]:
# Size and first rows of the stacked per-period feature table.
print(lib_df.shape)
lib_df.head()

(17696740, 18)


Unnamed: 0,id_cliente,num_lib_solicitadas,prom_monto_solicitado,prom_monto_aprobado,prom_monto_desembolsado,prom_monto_novado,min_anos_ult_lib,prom_prop_desembolso,prom_n_cuotas,prom_valor_cuota,prom_tasa,prom_dias_procesamiento,max_dias_procesamiento,n_novaciones,sector_ultlibranza,subsector_ultlibranza,tipo_pagaduria_ultlibranza,periodo
0,101052294884460201,2,7050000.0,6385000.0,5982935.0,0.0,11.354969,0.93404,72.0,164258.5,23.61,3.5,6.0,0,PENSIONADOS,Fopep,NACIONAL CENTRALIZADA,2018-01
1,101055260548785601,4,16175000.0,16175000.0,5261896.75,10197711.5,5.234421,0.402476,63.0,365051.75,13.59,3.75,7.0,3,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01
2,101055260650910902,4,12125000.0,12125000.0,7560881.0,4087683.5,4.653599,0.602487,60.0,279657.25,15.0,3.25,7.0,3,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01
3,101055260683554401,2,18000000.0,18000000.0,14080671.5,3618687.5,2.533051,0.831607,60.0,436291.0,15.48,6.5,9.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01
4,101055260720859701,2,8250000.0,8250000.0,5983175.5,1797331.0,10.423462,0.745965,66.0,202629.5,19.2148,3.0,5.0,1,ENTES TERRITORIALES,Secretaría de Educacion,REGIONAL,2018-01


In [25]:
# Number of client rows per prediction period, in chronological order.
lib_df['periodo'].value_counts().sort_index()

2018-01    698767
2018-02    701667
2018-03    705438
2018-04    708765
2018-05    712736
2018-06    716736
2018-07    719945
2018-08    723117
2018-09    726875
2018-10    730421
2018-11    734160
2018-12    737894
2019-01    740698
2019-02    743308
2019-03    746464
2019-04    749799
2019-05    753314
2019-06    756430
2019-07    758698
2019-08    761997
2019-09    765360
2019-10    765364
2019-11    768485
2019-12    770302
Name: periodo, dtype: int64

In [30]:
# Integer YYYYMM key derived from the 'YYYY-MM' period label (e.g. '2018-01' -> 201801).
periodo_as_date = pd.to_datetime(lib_df['periodo'], format='%Y-%m')
lib_df['fecha_lib'] = periodo_as_date.dt.strftime('%Y%m').astype('int')

In [32]:
# Persist the per-period feature table. The destination is built from the
# BASE_PATH_OUT_OBJECTS_LIBRANZA constant of the configuration cell instead of repeating
# the S3 prefix; the resulting path is unchanged.
lib_df.to_parquet(BASE_PATH_OUT_OBJECTS_LIBRANZA + 'base_libranzas_201801_201912', engine='pyarrow', index=False)

## Modelo

In [3]:
# Reload the stored feature table from the same S3 path that to_parquet writes to.
lib_df = pd.read_parquet("s3://adl-refined-dev-popular/parquet/TC_adquisicion/base_libranzas_201801_201912",engine='pyarrow')

In [31]:
# Spot-check: feature rows of a single client across the prediction periods.
lib_df[lib_df['id_cliente']==139955260696468901]

Unnamed: 0,id_cliente,num_lib_solicitadas,prom_monto_solicitado,prom_monto_aprobado,prom_monto_desembolsado,prom_monto_novado,min_anos_ult_lib,prom_prop_desembolso,prom_n_cuotas,prom_valor_cuota,prom_tasa,prom_dias_procesamiento,max_dias_procesamiento,n_novaciones,sector_ultlibranza,subsector_ultlibranza,tipo_pagaduria_ultlibranza,periodo,fecha_lib
30777,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01,201801
729659,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-02,201802
1431496,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-03,201803
2137079,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-04,201804
2846053,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-05,201805
3558959,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-06,201806
4275834,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-07,201807
4995912,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-08,201808
5719182,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-09,201809
6446240,139955260696468901,2,15750000.0,15750000.0,9233777.0,5455780.0,7.124832,0.611499,72.0,326726.0,14.34,1.0,1.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-10,201810


In [26]:
# Loan-level rows of that same client, for comparison with its aggregated features.
lib[lib['id_cliente']==139955260696468901]

Unnamed: 0,id_cliente,sk_rc_libranza,fe_solicitud,vl_monto_solicitado,fe_desembolso,fe_finalizacion,vl_monto_aprobado,no_obligacion,vl_monto_desembolsado,no_cuotas,vl_total_cuota,ds_estado_actual,vl_tasa,ds_tipo_credito,ds_tipo_libranza,vl_monto_novado,ds_sector,ds_subsector,ds_tipo_pagaduria,años_ult_lib,dias_procesamiento,prop_desemb_vs_aprobado
88344,139955260696468901,1917182,2011-02-17,14500000.0,2011-02-18,2017-04-05,14500000.0,5703010033679,13475792.0,72,310395.0,Cancelada,15.48,Nueva,Organica,0.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,9.349489,1.0,0.929365
88345,139955260696468901,1818942,2013-05-09,17000000.0,2013-05-10,2019-06-05,17000000.0,1303480000097,4991762.0,72,343057.0,Cancelada,13.2,Novacion,Organica,10911560.0,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,7.124832,1.0,0.293633


In [2]:
# Reload the stored feature table and count the non-null values of every column.
lib_df = pd.read_parquet("s3://adl-refined-dev-popular/parquet/TC_adquisicion/base_libranzas_201801_201912",engine='pyarrow')
lib_df.count()

id_cliente                    17696740
num_lib_solicitadas           17696740
prom_monto_solicitado         17696740
prom_monto_aprobado           17696740
prom_monto_desembolsado       17696740
prom_monto_novado             17696740
prom_prop_desembolso          17696740
prom_n_cuotas                 17696740
prom_valor_cuota              17696740
prom_tasa                     17696740
prom_dias_procesamiento       17696740
max_dias_procesamiento        17696740
n_novaciones                  17696740
sector_ultlibranza            17696740
subsector_ultlibranza         17696740
tipo_pagaduria_ultlibranza    17696740
periodo                       17696740
dtype: int64

In [3]:
# Number of feature rows per sector of the client's latest libranza.
lib_df['sector_ultlibranza'].value_counts()

PENSIONADOS              9913414
ORDEN NACIONAL           4041447
ENTES TERRITORIALES      3141400
PRIVADO                   320500
ENTES DESENTRALIZADOS     279979
Name: sector_ultlibranza, dtype: int64

In [6]:
# Building the libranza dataset: add the integer YYYYMM key derived from the 'YYYY-MM'
# period label (e.g. '2018-01' -> 201801) to the reloaded table, then preview it.
lib_df['fecha_lib'] = pd.to_datetime(lib_df['periodo'], format='%Y-%m').dt.strftime('%Y%m').astype('int')
lib_df.head(20)

Unnamed: 0,id_cliente,num_lib_solicitadas,prom_monto_solicitado,prom_monto_aprobado,prom_monto_desembolsado,prom_monto_novado,prom_prop_desembolso,prom_n_cuotas,prom_valor_cuota,prom_tasa,prom_dias_procesamiento,max_dias_procesamiento,n_novaciones,sector_ultlibranza,subsector_ultlibranza,tipo_pagaduria_ultlibranza,periodo,fecha_lib
0,101052294884460201,2,7050000.0,6385000.0,5982935.0,0.0,0.93404,72.0,164258.5,23.61,3.5,6.0,0,PENSIONADOS,Fopep,NACIONAL CENTRALIZADA,2018-01,201801
1,101055260548785601,4,16175000.0,16175000.0,5261897.0,10197710.0,0.402476,63.0,365051.75,13.59,3.75,7.0,3,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01,201801
2,101055260650910902,4,12125000.0,12125000.0,7560881.0,4087684.0,0.602487,60.0,279657.25,15.0,3.25,7.0,3,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01,201801
3,101055260683554401,2,18000000.0,18000000.0,14080670.0,3618688.0,0.831607,60.0,436291.0,15.48,6.5,9.0,1,ORDEN NACIONAL,Policia Nacional,NACIONAL CENTRALIZADA,2018-01,201801
4,101055260720859701,2,8250000.0,8250000.0,5983176.0,1797331.0,0.745965,66.0,202629.5,19.2148,3.0,5.0,1,ENTES TERRITORIALES,Secretaría de Educacion,REGIONAL,2018-01,201801
5,101055266255810701,1,10000000.0,10000000.0,9530928.0,0.0,0.953093,60.0,269410.0,20.8,13.0,13.0,0,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,2018-01,201801
6,101055266367157902,3,27133330.0,27133330.0,15335720.0,11251010.0,0.618465,98.666667,563069.666667,17.12,9.0,14.0,2,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,2018-01,201801
7,101055266603150201,2,3250000.0,3250000.0,2337335.0,836307.5,0.785358,36.0,118828.0,18.48,10.5,14.0,1,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,2018-01,201801
8,101055266686923002,5,17860000.0,17860000.0,10065410.0,6742780.0,0.671437,62.4,458791.8,20.576,2.0,5.0,3,PENSIONADOS,Fiduprevisora,NACIONAL CENTRALIZADA,2018-01,201801
9,101055629594214801,2,8538500.0,8538500.0,6333382.0,2091991.0,0.783307,54.0,232828.5,17.58,7.5,12.0,1,PENSIONADOS,I.S.S. Pensionados,NACIONAL CENTRALIZADA,2018-01,201801
