In [1]:
import boto3
import pandas as pd
import numpy as np
import pickle
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import pandas_profiling
import os
from datetime import datetime, timedelta
import dateutil.relativedelta
import gc
import s3fs
import pyarrow.parquet as pq

pd.set_option('display.max_columns',None)

In [2]:
def spark_read_parquet(s3_url: str, **args):
    fs = s3fs.S3FileSystem()
    # Leyendo base
    dataset = pq.ParquetDataset(s3_url, filesystem=fs)
    table = dataset.read()
    dataframe = table.to_pandas()

    del dataset, table

    return dataframe

In [3]:
def _key_files__list(client, bucket, key):
    """return the key's size if it exist, else None"""
    response = client.list_objects_v2(Bucket=bucket, Prefix=key)
    return response.get("Contents", [])

def get_s3_files(s3_url):
    session = boto3.session.Session()
    s3 = session.client("s3")
    bucket = s3_url.split("/")[2]
    key = s3_url.split("/")[3:]
    key = "/".join(key)
    objects = _key_files__list(s3, bucket, key)
    files = ["s3://" + bucket + "/" + file["Key"] for file in objects]
    return files

def get_s3_parquets(s3_url):
    session = boto3.session.Session()
    s3 = session.client("s3")
    bucket = s3_url.split("/")[2]
    key = s3_url.split("/")[3:]
    key = "/".join(key)
    objects = _key_files__list(s3, bucket, key)
    files = ["s3://" + bucket + "/" + file["Key"][:-9] for file in objects if file["Key"].split("/")[-1]=="_SUCCESS"]
    return files

## Explorando archivos

In [None]:
bucket = 'data-bpop-dev-sandbox'
path = 's3://data-bpop-dev-sandbox/estandarizado/productos/'

In [None]:
files = [
    'pasivo-estados-cdts-fc',
    'pasivo-estados-ctas-fc',
    'pasivo-account-month-fc',
    'pasivo-balance-month'
]

### ACCOUNT

In [None]:
path_files = path + files[2] + '/'
archivos_acc = get_s3_parquets(path_files)
archivos_acc

['s3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202007',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202008',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202009',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202010',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202011',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202012',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202101',
 's3://data-bpop-dev-sandbox/estandarizado/productos/pasivo-account-month-fc/productos_pasivo-account-month-fc_ods_M202102',


In [None]:
for file in archivos_acc:
    print("Periodo:", file[-6:])
    prueba = spark_read_parquet(file)
    print(prueba.shape)
    del prueba
    gc.collect()

Periodo: 202007


In [None]:
len(archivos_acc)

In [None]:
prueba_marzo_acc = spark_read_parquet(archivos_acc[10])
print(prueba_marzo_acc.shape)
prueba_marzo_acc.head()

### CDT´s

In [None]:
path_files = path + files[0] + '/'
archivos_cdts = get_s3_parquets(path_files)
archivos_cdts

In [None]:
len(archivos_cdts)

#### Volumetrías

In [None]:
for file in archivos_cdts:
    print("Periodo:", file[-6:])
    prueba = spark_read_parquet(file)
    print(prueba.shape)
    del prueba
    gc.collect()
    

### Periodo: 202103

In [None]:
prueba_marzo_cdts = spark_read_parquet(archivos_cdts[11])
print(prueba_marzo_cdts.shape)
prueba_marzo_cdts.head()

### CTA´s

In [None]:
path_files = path + files[1] + '/'
archivos_ctas = get_s3_parquets(path_files)
archivos_ctas

In [None]:
for file in archivos_ctas:
    print("Periodo:", file[-6:])
    prueba = spark_read_parquet(file)
    print(prueba.shape)
    del prueba
    gc.collect()
    

In [None]:
archivos_ctas

In [None]:
len(archivos_ctas)

### Periodo: 202105

In [None]:
prueba_marzo_cta = spark_read_parquet(archivos_ctas[32])
print(prueba_marzo_cta.shape)
prueba_marzo_cta.head()

## Balances

In [None]:
path_files = path + files[3] + '/'
archivos_bal = get_s3_parquets(path_files)
archivos_bal

In [None]:
len(archivos_bal)

In [None]:
prueba_marzo_bal = spark_read_parquet(archivos_bal[36])
print(prueba_marzo_bal.shape)
prueba_marzo_bal.head()

## Uniendo Bases

In [None]:
# prueba_marzo_bal["acc"] = prueba_marzo_bal["acc"].astype("int")
# prueba_marzo_cdts["account_no"] = prueba_marzo_cdts["account_no"].astype("int")
# prueba_marzo_cta["account_no"] = prueba_marzo_cta["account_no"].astype("int")
# prueba_marzo_acc["account_no"] = prueba_marzo_acc["account_no"].astype("int")

In [None]:
marzo_cta = pd.merge(prueba_marzo_acc, prueba_marzo_cta, on="account_no", how="left")
marzo_cdts_cta = pd.merge(marzo_cta, prueba_marzo_cdts, on="account_no", how="left", suffixes=('_cta', '_cdt'))
marzo_total = pd.merge(marzo_cdts_cta, prueba_marzo_bal, left_on="account_no", right_on="acc", how="left")

In [None]:
print(prueba_marzo_acc.shape)
print(marzo_cta.shape)
print(marzo_cdts_cta.shape)
marzo_total.head()

In [None]:
marzo_total.count()

In [None]:
path_files = path + archivos_ctas[0] + '/'
archivos_ctas = get_s3_parquets(path_files, )
archivos_ctas

## Explorando Bases Buró!!

In [4]:
bucket = 'data-bpop-dev-sandbox'
path = 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/'

In [5]:
path_files = path
files_bur = get_s3_parquets(path_files)
files_bur

['s3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201811',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201905',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201908',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201911',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202003',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202006',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-c

In [6]:
len(files_bur)

11

In [7]:
bur1 = spark_read_parquet(files_bur[7])
print(bur1.shape)
bur1

(532595, 134)


Unnamed: 0,tipo_id,tipo_id_homologado,no_identificacion,ds_nombre_cliente,rango_aproximado_edad,genero,ciudad_de_expedicion,acierta_a_financiero,quanto,quanto_mod,porcen_de_cuotas_vs_ingreso,respuesta,numero_obligaciones_activas,numero_creditos_cb,valor_inicial_cb,valor_saldo_cb,valor_cuotas_cb,valor_mora_cb,numero_creditos_cv,valor_inicial_cv,valor_saldo_cv,valor_cuotas_cv,valor_mora_cv,numero_creditos_cf,valor_inicial_cf,valor_saldo_cf,valor_cuotas_cf,valor_mora_cf,numero_tdc,valor_cupos,valor_utilizado,porcentaje_utilizacion,valor_cuotas,valor_mora,rango_0,rango_1,rango_2,rango_3,rango_4,rango_5,rango_6,fecha_mas_antigua_apertura,numero_creditos_sr,valor_inicial_sr,valor_saldo_sr,valor_cuotas_sr,valor_mora_sr,numero_celulares_telcos,valor_cuotas_celulares_telcos,valor_mora_telcos,numero_creditos_cooperativas,valor_inicial_cooperativas,valor_saldo_cooperativas,valor_cuotas_cooperativas,valor_mora_cooperativas,numero_creditos_codeudores,valor_saldo_codeudores,valor_cuotas_codeudores,valor_mora_codeudores,obliga_al_dia_cartera_actual,obliga_mora_30_cartera_actual,obliga_mora_60_cartera_actual,obliga_mora_90_cartera_actual,obliga_mora_120_cartera_actual,cartera_castig_cartera_actual,dudoso_recaudo_cartera_actual,ctas_en_cobrador_cartera_act,ult_año_moras_30_cartera_hist,ult_año_moras_60_cartera_hist,ult_año_moras_90_cartera_hist,ult_año_moras_120_cartera_hist,cancel_mal_manejo_cartera_hist,cartera_recupe_cartera_hist,tdc_altura_maxima_de_mora,cartera_banca_alt_max_de_mora,cartera_coope_alt_max_de_mora,cartera_hipote_alt_max_de_mora,peor_califi_trim_1_endeud,peor_calif_trim_2_endeud,ctas_de_ahorro_act_ctas_banca,ctas_ctes_act_cta_banca,ctas_embargadas_ctas_bancarias,cancel_mal_manejo_ctas_banca,ctas_saldadas_ctas_banca,total_consultas_ult_6_meses,estado_consulta,endeudamiento,num_tdc_vigentes_sin_popular,cupo_sin_popular,max_cupo_tdc_sin_popular,promedio_cupo_tdc_sin_popular,fec_mas_anti_aper_tdc_sin_popu,valor_utilizado_sin_popular,utilizacion_sin_popular,valor_cuotas_sin_popular,valor_en_mora_sin_popular,numoblvigensectorbancasin_popu,cuposectorbancario_sin_popular,maxcuposectorbancasin_popular,promcuposectorbancasin_popular,fecantiapersectorbancasinpopu,valor_utilisectorbancasin_popu,util_sector_banca_sin_popular,val_cuo_sector_banca_sin_popu,val_morasectorbancasin_popular,numoblvigensector_hip_sin_popu,cupo_sector_hip_sin_popular,max_cupo_sector_hip_sin_popu,prom_cupo_sector_hip_sin_popu,fec_masantapersectorhipsin_pop,val_util_sector_hip_sin_pop,util_sector_hip_sin_popular,val_cuotas_sector_hip_sin_popu,val_mora_sector_hip_sin_popu,cuotas_calculadas_tdc,cuotas_calculadas_créditos,cuotas_calculadas_hipotecarias,plazo_credito,plazo_hipotecario,tasa_hipotecario,tasa_creditos,plazo_tdc,cupo_sugerido_1,cupo_sugerido_2,cupo_sugerido_3,cupo_segun_mercado,cupo_ajustadi,cupo_exp_fin,cupo_final,experiencia_financiera,fecha_envio,fecha_data,tipo_cliente,id_cliente
0,4,1000005,9191,,,,EXTRANJERO,822.0,,1623000.0,,,1,,,,,,,,,,,,,,,,1.0,2500000.0,0.0,0.000000,0.0,0.0,,,,,,,,2002-08-01,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,0,0,0,0,1,,1.0,2500000.0,2500000.0,2500000.0,2002-08-01,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,102154706710734702
1,2,1000007,800003367,,,,,,,,,,64,24.0,6.334539e+09,4.527163e+09,216592000.0,0.0,,,,,,15.0,6.121110e+08,4.026440e+08,355205000.0,0.0,3.0,25000000.0,5295000.0,0.211800,826000.0,0.0,,,,,,,,2016-08-01,8.0,336000.0,57864000.0,0.0,0.0,,,,,,,,,,,,,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,A,B,11.0,3.0,0,0,0,4424,1,,3.0,25000000.0,10000000.0,8333333.0,2016-08-01,5295000.0,,826000.0,0.0,21.0,5.788539e+09,1.372003e+09,2.756447e+08,2020-09-01,4.103088e+09,,198766000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,411252294121262802
2,2,1000007,800003971,,,,,,,,,,4,,,,,,,,,,,2.0,2.109000e+06,1.000000e+03,1000.0,0.0,,,,,,,,,,,,,,,,,,,,1.0,0.0,170000.0,,,,,,,,,,2,0,0,0,0,0,1,0,0,0,0,0,0,0,,,,,E,E,1.0,,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,586952294027743202
3,2,1000007,800004065,,,,,,,,,,34,4.0,7.170700e+07,5.580500e+07,843000.0,501000.0,,,,,,4.0,0.000000e+00,0.000000e+00,0.0,5086000.0,7.0,54900000.0,45306000.0,0.825246,11082000.0,4452000.0,,,,,,,,1995-12-01,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,,,,,,1.0,530000.0,119000.0,0.0,22,2,0,0,1,0,0,0,7,4,2,1,0,0,4,1,,,B,B,7.0,5.0,0,0,0,1,5,,7.0,54900000.0,24000000.0,7842857.0,1995-12-01,45306000.0,,11082000.0,4452000.0,3.0,2.004000e+07,2.000000e+07,6.680000e+06,2005-05-01,4.138000e+06,,241000.0,501000.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,779452294114315901
4,2,1000007,800011002,,,,,,,,,,62,14.0,2.846955e+10,2.244108e+10,347277000.0,0.0,,,,,,16.0,3.622941e+09,1.545348e+09,74770000.0,0.0,9.0,58000000.0,722000.0,0.012448,722000.0,0.0,,,,,,,,2009-11-01,7.0,106123000.0,88588000.0,95543000.0,0.0,2.0,0.0,0.0,,,,,,4.0,11558000.0,2351000.0,0.0,62,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,B,A,10.0,,0,0,0,10,5,,9.0,58000000.0,14000000.0,6444444.0,2009-11-01,722000.0,,722000.0,0.0,9.0,2.221500e+10,5.500000e+09,2.468333e+09,2009-10-01,1.791889e+10,,330148000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,248852294027931002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532590,1,1000003,9994732,,42.0,M,VITERBO,650.0,,4310000.0,,,14,3.0,4.317200e+07,5.818000e+06,766000.0,0.0,,,,,,2.0,6.400000e+07,3.023800e+07,1296000.0,0.0,2.0,8650000.0,2810000.0,0.324855,581000.0,0.0,,,,,,,,2018-11-01,2.0,94000.0,1539000.0,94000.0,0.0,3.0,0.0,268000.0,,,,,,,,,,13,0,0,0,1,0,0,0,0,1,1,3,0,0,0,0,,,A,A,,2.0,0,0,0,3,1,,2.0,8650000.0,6650000.0,4325000.0,2018-11-01,2810000.0,,581000.0,0.0,2.0,2.172000e+06,1.500000e+06,1.086000e+06,2018-04-01,0.000000e+00,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,971552295443395102
532591,1,1000003,9994769,,41.0,M,VITERBO,696.0,,2615000.0,,,7,1.0,3.192200e+07,2.569400e+07,561000.0,0.0,,,,,,,,,,,2.0,4279000.0,4239000.0,0.990652,7000.0,0.0,,,,,,,,2008-09-01,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,,,,,,,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,A,A,,2.0,0,0,0,8,1,,2.0,4279000.0,2279000.0,2139500.0,2008-09-01,4239000.0,,7000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,194152295443278502
532592,1,1000003,9995028,,80.0,M,BELALCAZAR,927.0,,3072000.0,,,9,,,,,,,,,,,,,,,,5.0,27711000.0,1579000.0,0.056981,557000.0,0.0,,,,,,,,2019-09-01,1.0,63000.0,63000.0,63000.0,0.0,2.0,0.0,0.0,,,,,,,,,,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,A,A,,1.0,0,0,0,0,1,,3.0,20211000.0,10000000.0,6737000.0,2019-09-01,1422000.0,,443000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,152256666402648002
532593,1,1000003,9995077,,72.0,M,BELALCAZAR,633.0,,1825000.0,,,2,1.0,2.035000e+07,1.993200e+07,383000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,1,,0,,,A,A,,1.0,0,0,0,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-01-05,2020-12-31,,725955242634580402


In [8]:
files_bur[10]

's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202109'

In [9]:
len(files_bur)

11

In [10]:
path_bur = files_bur[10]
print(path_bur)
bur2 = spark_read_parquet(path_bur)
print(bur2.shape)
bur2

s3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202109
(514756, 134)


Unnamed: 0,tipo_id,tipo_id_homologado,no_identificacion,ds_nombre_cliente,rango_aproximado_edad,genero,ciudad_de_expedicion,acierta_a_financiero,quanto,quanto_mod,porcen_de_cuotas_vs_ingreso,respuesta,numero_obligaciones_activas,numero_creditos_cb,valor_inicial_cb,valor_saldo_cb,valor_cuotas_cb,valor_mora_cb,numero_creditos_cv,valor_inicial_cv,valor_saldo_cv,valor_cuotas_cv,valor_mora_cv,numero_creditos_cf,valor_inicial_cf,valor_saldo_cf,valor_cuotas_cf,valor_mora_cf,numero_tdc,valor_cupos,valor_utilizado,porcentaje_utilizacion,valor_cuotas,valor_mora,rango_0,rango_1,rango_2,rango_3,rango_4,rango_5,rango_6,fecha_mas_antigua_apertura,numero_creditos_sr,valor_inicial_sr,valor_saldo_sr,valor_cuotas_sr,valor_mora_sr,numero_celulares_telcos,valor_cuotas_celulares_telcos,valor_mora_telcos,numero_creditos_cooperativas,valor_inicial_cooperativas,valor_saldo_cooperativas,valor_cuotas_cooperativas,valor_mora_cooperativas,numero_creditos_codeudores,valor_saldo_codeudores,valor_cuotas_codeudores,valor_mora_codeudores,obliga_al_dia_cartera_actual,obliga_mora_30_cartera_actual,obliga_mora_60_cartera_actual,obliga_mora_90_cartera_actual,obliga_mora_120_cartera_actual,cartera_castig_cartera_actual,dudoso_recaudo_cartera_actual,ctas_en_cobrador_cartera_act,ult_ano_moras_30_cartera_hist,ult_ano_moras_60_cartera_hist,ult_ano_moras_90_cartera_hist,ult_ano_moras_120_cartera_hist,cancel_mal_manejo_cartera_hist,cartera_recupe_cartera_hist,tdc_altura_maxima_de_mora,cartera_banca_alt_max_de_mora,cartera_coope_alt_max_de_mora,cartera_hipote_alt_max_de_mora,peor_califi_trim_1_endeud,peor_calif_trim_2_endeud,ctas_de_ahorro_act_ctas_banca,ctas_ctes_act_cta_banca,ctas_embargadas_ctas_bancarias,cancel_mal_manejo_ctas_banca,ctas_saldadas_ctas_banca,total_consultas_ult_6_meses,estado_consulta,endeudamiento,num_tdc_vigentes_sin_popular,cupo_sin_popular,max_cupo_tdc_sin_popular,promedio_cupo_tdc_sin_popular,fec_mas_anti_aper_tdc_sin_popu,valor_utilizado_sin_popular,utilizacion_sin_popular,valor_cuotas_sin_popular,valor_en_mora_sin_popular,numoblvigensectorbancasin_popu,cuposectorbancario_sin_popular,maxcuposectorbancasin_popular,promcuposectorbancasin_popular,fecantiapersectorbancasinpopu,valor_utilisectorbancasin_popu,util_sector_banca_sin_popular,val_cuo_sector_banca_sin_popu,val_morasectorbancasin_popular,numoblvigensector_hip_sin_popu,cupo_sector_hip_sin_popular,max_cupo_sector_hip_sin_popu,prom_cupo_sector_hip_sin_popu,fec_masantapersectorhipsin_pop,val_util_sector_hip_sin_pop,util_sector_hip_sin_popular,val_cuotas_sector_hip_sin_popu,val_mora_sector_hip_sin_popu,cuotas_calculadas_tdc,cuotas_calculadas_creditos,cuotas_calculadas_hipotecarias,plazo_credito,plazo_hipotecario,tasa_hipotecario,tasa_creditos,plazo_tdc,cupo_sugerido_1,cupo_sugerido_2,cupo_sugerido_3,cupo_segun_mercado,cupo_ajustadi,cupo_exp_fin,cupo_final,experiencia_financiera,fecha_envio,fecha_data,tipo_cliente,id_cliente
0,4,1000005,100090,,,,CROATA,832.0,,2489000.0,,,2,1.0,13500000.0,1261000.0,291000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,A,A,,1.0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,203652306036497202
1,4,1000005,102921,,,,EXTRANJERO,723.0,,7495000.0,,,5,2.0,59797000.0,57087000.0,1213000.0,0.0,,,,,,,,,,,1.0,600000.0,0.0,0.000000,0.0,0.0,,,,,,,,2014-12-01,,,,,,,,,,,,,,,,,,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,C,A,1.0,1.0,0,0,0,1,1,,1.0,600000.0,600000.0,600000.0,2014-12-01,0.0,,0.0,0.0,1.0,21100000.0,21100000.0,21100000.0,2019-08-01,20502000.0,,453000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,633252307924070801
2,4,1000005,103795,,,,FILIPINA,882.0,,11974000.0,,,10,,,,,,,,,,,2.0,21000.0,0.0,0.0,0.0,5.0,116333000.0,7363000.0,0.063292,2152000.0,0.0,,,,,,,,2004-04-01,,,,,,,,,,,,,,,,,,10,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,A,A,2.0,1.0,0,0,0,0,1,,4.0,108333000.0,43538000.0,27083250.0,2004-04-01,7012000.0,,2152000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,599059902783623702
3,4,1000005,104063,,,,CHILENA,910.0,,13164000.0,,,14,4.0,110880000.0,50778000.0,1736000.0,0.0,,,,,,1.0,0.0,0.0,0.0,0.0,5.0,133550000.0,5756000.0,0.043100,1082000.0,0.0,,,,,,,,1988-12-01,1.0,165000.0,0.0,364000.0,0.0,1.0,103000.0,0.0,,,,,,,,,,14,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,,,A,A,1.0,1.0,0,0,0,1,1,,5.0,133550000.0,58100000.0,26710000.0,1988-12-01,5756000.0,,1082000.0,0.0,3.0,78380000.0,37000000.0,26126667.0,2018-05-01,18499000.0,,990000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,316552308778418001
4,4,1000005,105183,,,,ALEMANA,896.0,,2571000.0,,,2,,,,,,,,,,,,,,,,1.0,16600000.0,28000.0,0.001687,28000.0,0.0,,,,,,,,1991-11-01,1.0,148000.0,224000.0,224000.0,0.0,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,A,A,,,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,122252294149023702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514751,1,1000003,9991929,,56.0,M,VITERBO,924.0,,3134000.0,,,4,2.0,25560000.0,8359000.0,674000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1500000.0,0.0,0.0,0.0,,,,,,,,,,,,,4,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,A,A,,1.0,0,0,0,5,1,,,,,,,,,,,1.0,15160000.0,15160000.0,15160000.0,2018-06-01,3566000.0,,426000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,238562188568556703
514752,1,1000003,9993571,,50.0,M,VITERBO,733.0,,1744000.0,,,15,1.0,10700000.0,10700000.0,241000.0,0.0,,,,,,2.0,47250000.0,43426000.0,1000000.0,0.0,2.0,3000000.0,571000.0,0.190333,116000.0,0.0,,,,,,,,2011-05-01,1.0,0.0,0.0,142000.0,0.0,5.0,0.0,0.0,,,,,,,,,,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,A,A,,4.0,0,0,0,6,1,,2.0,3000000.0,3000000.0,1500000.0,2011-05-01,571000.0,,116000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,214852309854855102
514753,1,1000003,9993676,,49.0,M,VITERBO,838.0,,8349000.0,,,11,4.0,178793000.0,168704000.0,2603000.0,0.0,,,,,,,,,,,2.0,8300000.0,6809000.0,0.820361,145000.0,0.0,,,,,,,,2008-04-01,1.0,174000.0,196000.0,196000.0,0.0,1.0,0.0,0.0,,,,,,,,,,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,A,A,,3.0,0,0,0,3,1,,2.0,8300000.0,5000000.0,4150000.0,2008-04-01,6809000.0,,145000.0,0.0,3.0,20859000.0,14922000.0,6953000.0,2019-08-01,17066000.0,,495000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,891352307088439601
514754,1,1000003,9993954,,48.0,M,VITERBO,790.0,,2171000.0,,,10,1.0,16000000.0,13851000.0,373000.0,0.0,,,,,,2.0,30641000.0,16499000.0,1180000.0,0.0,2.0,1750000.0,0.0,0.000000,0.0,0.0,,,,,,,,2006-04-01,3.0,0.0,680000.0,162000.0,0.0,1.0,0.0,0.0,,,,,,,,,,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,A,A,,1.0,0,0,0,0,1,,1.0,1150000.0,1150000.0,1150000.0,2006-04-01,0.0,,0.0,0.0,1.0,16000000.0,16000000.0,16000000.0,2020-03-01,13851000.0,,373000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,2021-10-07,2021-09-30,,842755719929573502


In [11]:
bur1.dtypes

tipo_id                    object
tipo_id_homologado         object
no_identificacion          object
ds_nombre_cliente          object
rango_aproximado_edad     float64
                           ...   
experiencia_financiera    float64
fecha_envio                object
fecha_data                 object
tipo_cliente               object
id_cliente                 object
Length: 134, dtype: object

In [12]:
bur2.dtypes

tipo_id                    object
tipo_id_homologado         object
no_identificacion          object
ds_nombre_cliente          object
rango_aproximado_edad     float64
                           ...   
experiencia_financiera    float64
fecha_envio                object
fecha_data                 object
tipo_cliente               object
id_cliente                 object
Length: 134, dtype: object

In [13]:
bur1.dtypes.index[~(bur1.dtypes.index == bur2.dtypes.index)]

Index(['ult_año_moras_30_cartera_hist', 'ult_año_moras_60_cartera_hist',
       'ult_año_moras_90_cartera_hist', 'ult_año_moras_120_cartera_hist',
       'cuotas_calculadas_créditos'],
      dtype='object')

In [14]:
bur2.dtypes.index[~(bur1.dtypes.index == bur2.dtypes.index)]

Index(['ult_ano_moras_30_cartera_hist', 'ult_ano_moras_60_cartera_hist',
       'ult_ano_moras_90_cartera_hist', 'ult_ano_moras_120_cartera_hist',
       'cuotas_calculadas_creditos'],
      dtype='object')

In [15]:
bur1 = bur1.rename(columns={'ult_año_moras_30_cartera_hist': 'ult_ano_moras_30_cartera_hist',
                              'ult_año_moras_60_cartera_hist': 'ult_ano_moras_60_cartera_hist',
                                'ult_año_moras_90_cartera_hist':'ult_ano_moras_90_cartera_hist',
                                'ult_año_moras_120_cartera_hist':'ult_ano_moras_120_cartera_hist',
                             'cuotas_calculadas_créditos':'cuotas_calculadas_creditos'})

In [16]:
bur1.dtypes[~(bur1.dtypes == bur2.dtypes)]

Series([], dtype: object)

In [17]:
bur2.dtypes[~(bur1.dtypes == bur2.dtypes)]

Series([], dtype: object)

In [18]:
bur1['porcentaje_utilizacion']

0         0.000000
1         0.211800
2              NaN
3         0.825246
4         0.012448
            ...   
532590    0.324855
532591    0.990652
532592    0.056981
532593         NaN
532594         NaN
Name: porcentaje_utilizacion, Length: 532595, dtype: float64

In [19]:
bur2['porcentaje_utilizacion']

0              NaN
1         0.000000
2         0.063292
3         0.043100
4         0.001687
            ...   
514751         NaN
514752    0.190333
514753    0.820361
514754    0.000000
514755    0.000000
Name: porcentaje_utilizacion, Length: 514756, dtype: float64

In [41]:
if bur2['porcentaje_utilizacion'].dtypes == 'object':
        #bur2['porcentaje_utilizacion2'] = [float(str(x).replace('"','').replace(',','.')) for x in bur2['porcentaje_utilizacion']]
        bur2['porcentaje_utilizacion2'] = bur2['porcentaje_utilizacion'].astype('float64')
    
if (bur2['porcentaje_utilizacion'] > 0).sum()/bur2.shape[0] < 0.9:
    bur2['porcentaje_utilizacion2'] = np.where(bur2['valor_cupos']>0, round(bur2['valor_utilizado']/bur2['valor_cupos'],9),bur2['porcentaje_utilizacion'])

In [24]:
bur2['porcentaje_utilizacion'].count()/bur1.shape[0]

0.7312817431631915

In [37]:
bur2['porcentaje_utilizacion'][2]

0.063292445

In [38]:
bur2['porcentaje_utilizacion2'][2]

0.063292445

In [42]:
bur2.loc[~(bur2['porcentaje_utilizacion'] == bur2['porcentaje_utilizacion2']),'porcentaje_utilizacion']

0        NaN
7        NaN
8        NaN
9        NaN
10       NaN
          ..
514734   NaN
514738   NaN
514739   NaN
514746   NaN
514751   NaN
Name: porcentaje_utilizacion, Length: 139829, dtype: float64

In [43]:
bur2.loc[~(bur2['porcentaje_utilizacion'] == bur2['porcentaje_utilizacion2']),'porcentaje_utilizacion2']

0        NaN
7        NaN
8        NaN
9        NaN
10       NaN
          ..
514734   NaN
514738   NaN
514739   NaN
514746   NaN
514751   NaN
Name: porcentaje_utilizacion2, Length: 139829, dtype: float64

In [28]:
~(bur2['porcentaje_utilizacion'] == bur2['porcentaje_utilizacion2'])

0          True
1         False
2          True
3          True
4          True
          ...  
514751     True
514752     True
514753     True
514754    False
514755    False
Length: 514756, dtype: bool

In [None]:
bur1['porcentaje_utilizacion'].dtypes

In [None]:
bur2['porcentaje_utilizacion'].dtypes

In [None]:
bur1['valor_cuotas_codeudores'].mean()

In [None]:
bur2['valor_cuotas_codeudores'].mean()

## Comparaciones

In [None]:
bur1.count()/bur1.shape[0]

In [None]:
bur2.count()/bur2.shape[0]

In [None]:
cols_dif = list((bur1.count()/bur1.shape[0])[abs(bur1.count()/bur1.shape[0] - bur2.count()/bur2.shape[0])>0.1].index)
cols_dif

In [None]:
(bur1.count()/bur1.shape[0])[abs(bur1.count()/bur1.shape[0] - bur2.count()/bur2.shape[0])>0.1]*100

In [None]:
(bur2.count()/bur2.shape[0])[abs(bur1.count()/bur1.shape[0] - bur2.count()/bur2.shape[0])>0.1]*100

In [None]:
bur1[cols_dif]

In [None]:
bur2[cols_dif]

## Archivo Raw

In [None]:
path = 's3://data-bpop-dev-sandbox/landing/riesgos-creditos/buro-comportamiento-financiero/'
bucket = 'data-bpop-dev-sandbox'

path_files = path
archivos_raw = get_s3_parquets(path_files)
archivos_raw

In [None]:
bur1 = spark_read_parquet(files_bur[7])
print(bur1.shape)
bur1

In [None]:
bur2 = spark_read_parquet(files_bur[8])
print(bur2.shape)
bur2

In [None]:
bur1.dtypes

In [None]:
bur2.dtypes