In [1]:
# numero_obligaciones_activasdif
# porcentaje_utilizacion
# quanto_mod
# valor_cuotas_codeudores_smlv
# valor_utilizado_smlv

In [2]:
"""
MODULE: processing_buro
This script extracts Buró information from the Buró file
Steps:
1. Get buro files
2. Get the needed columns and rows from Buro file including the needed historical period
3. Create additional columns
"""

import numpy as np
import pandas as pd
import s3fs
import pyarrow.parquet as pq

def process_buro(input_path1, input_path2):
    """Run the three Buró processing stages in order and return the final frame.

    :param input_path1: location handed to ``get_buro_file`` as its first (oldest) file
    :param input_path2: location handed to ``get_buro_file`` as its second (most recent) file
    :return: DataFrame produced by ``create_buro_df``

    A progress label is printed after each stage finishes.
    """
    # Stage 1: read both Buró files and stack them into one frame.
    frame = get_buro_file(input_path1, input_path2)
    print('get_buro_file')

    # Stage 2 (period derivation + de-duplication) and stage 3 (derived
    # variables) each take the previous frame and return the next one.
    remaining_stages = (
        (get_buro_info, 'get_buro_info'),
        (create_buro_df, 'create_buro_df'),
    )
    for stage, label in remaining_stages:
        frame = stage(frame)
        print(label)

    print('buro_df created successfully')
    return frame


def _load_buro_snapshot(input_path, marca_buro):
    """Read one Buró parquet file and normalise its identifier columns.

    :param input_path: location of the parquet dataset, passed to ``spark_read_parquet``
    :param marca_buro: integer tag stored in the ``marca_buro`` column of every row
    :return: DataFrame with lower-cased column names, ``cont_id`` renamed to
        ``id_cliente``, rows without ``no_identificacion`` removed and
        ``no_identificacion`` cast to int64
    """
    snapshot = spark_read_parquet(input_path)
    snapshot.columns = snapshot.columns.str.lower()
    snapshot = snapshot.rename(columns={'cont_id': 'id_cliente'})
    snapshot['marca_buro'] = marca_buro

    missing_id = snapshot['no_identificacion'].isnull()
    if missing_id.any():
        # Explicit copy: the columns assigned below must not target a
        # filtered view (avoids pandas' SettingWithCopyWarning).
        snapshot = snapshot.loc[~missing_id].copy()

    # Cast through str first, exactly as before, so identifiers stored as
    # text or other objects are parsed by their textual representation.
    snapshot['no_identificacion'] = (
        snapshot['no_identificacion'].astype('str').astype(np.int64)
    )
    return snapshot


def get_buro_file(input_path1, input_path2):
    """Load the last two Buró files, stack them and keep only the needed columns.

    :param input_path1: location of the oldest buro file (rows tagged ``marca_buro`` = 1)
    :param input_path2: location of the most recent buro file (rows tagged ``marca_buro`` = 2)
    :return: DataFrame with the columns listed in ``keep_columns`` below and
        every remaining missing value replaced by 0
    """
    buro = pd.concat(
        [_load_buro_snapshot(input_path1, 1), _load_buro_snapshot(input_path2, 2)],
        ignore_index=True,
    )
    # Both inputs already have a null-free int64 `no_identificacion`, so no
    # second filter/cast is needed after concatenating.

    # Missing client ids are replaced by the sentinel -999 before the int cast.
    buro['id_cliente'] = buro['id_cliente'].fillna('-999').astype(np.int64)

    keep_columns = [
        'tipo_id', 'tipo_id_homologado', 'no_identificacion', 'fecha_envio',
        'id_cliente', 'marca_buro',
        'numero_obligaciones_activas',
        'porcentaje_utilizacion',
        'quanto_mod',
        'valor_cupos',
        'valor_utilizado',
        'valor_cuotas_codeudores',
    ]
    buro = buro[keep_columns]

    # NOTE(review): this fills EVERY selected column, including `fecha_envio`,
    # so the not-null filter on `fecha_envio` in get_buro_info no longer
    # removes anything - confirm that is intended.
    buro = buro.fillna(0)

    return buro


def get_buro_info(buro):
    """Derive the reporting period of each row and keep one row per client and period.

    :param buro: DataFrame with at least ``no_identificacion`` and ``fecha_envio``
    :return: new DataFrame (the input is not modified) without rows whose
        ``fecha_envio`` is null, with ``fecha_envio`` as text, an integer
        ``fecha_buro`` column built from the first six characters of
        ``fecha_envio`` once dashes are removed (YYYYMM for 'YYYY-MM-DD'
        dates), sorted by ``no_identificacion`` and ``fecha_buro`` and with a
        single row per (``no_identificacion``, ``fecha_buro``) pair
    """
    # Explicit copy so the column assignments below never write into (or warn
    # about) a filtered view of the caller's frame.
    buro = buro.loc[buro['fecha_envio'].notnull()].copy()

    # casting dates: strip the dashes once, then take the leading six characters
    buro['fecha_envio'] = buro['fecha_envio'].astype('str')
    compact_date = buro['fecha_envio'].str.replace('-', '', regex=False)
    buro['fecha_buro'] = compact_date.str[0:6].astype('int')

    # deleting duplicates
    buro = buro.sort_values(['no_identificacion', 'fecha_buro'])
    buro = buro.drop_duplicates(subset=['no_identificacion', 'fecha_buro'], keep='first')

    return buro


def create_buro_df(buro):
    """Build the final client-level Buró variables for the most recent file.

    The input is expected to hold the rows of both Buró files, sorted by
    ``no_identificacion`` and ``fecha_buro`` (as ``get_buro_info`` returns
    them), because the "previous value" of a client is taken from the row
    immediately above.

    :param buro: DataFrame with ``tipo_id``, ``tipo_id_homologado``,
        ``no_identificacion``, ``id_cliente``, ``fecha_buro``, ``marca_buro``,
        ``numero_obligaciones_activas``, ``porcentaje_utilizacion``,
        ``quanto_mod``, ``valor_cupos``, ``valor_utilizado`` and
        ``valor_cuotas_codeudores``
    :return: new DataFrame (the input is not modified) restricted to rows with
        ``marca_buro`` == 2 and to the output columns listed at the end
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    buro = buro.copy()

    # Yearly constant used to express amounts in "smlv" units (presumably the
    # legal minimum monthly wage of that year). Years outside the table give
    # NaN, and so do the *_smlv columns derived from them.
    smlv_by_year = {2018: 781242, 2019: 828116, 2020: 877803, 2021: 908526}
    buro['ano_po'] = buro['fecha_buro'].astype(str).str[0:4].astype(int)
    buro['smlv'] = buro['ano_po'].map(smlv_by_year).astype('float64')

    # Reported utilisation as float. Text values may be quoted and use a comma
    # as decimal separator; numeric columns are only cast. Previously a numeric
    # column raised KeyError and the parsed text values were discarded.
    reported = buro['porcentaje_utilizacion']
    if pd.api.types.is_numeric_dtype(reported):
        reported = reported.astype('float64')
    else:
        reported = pd.Series(
            [float(str(x).replace('"', '').replace(',', '.')) for x in reported],
            index=buro.index,
            dtype='float64',
        )
    buro['porcentaje_utilizacion'] = reported

    # When fewer than 70% of the rows report a positive utilisation, recompute
    # it from the amounts wherever a credit limit exists and keep the reported
    # value elsewhere.
    if (reported > 0).mean() < 0.7:
        buro['porcentaje_utilizacion'] = np.where(
            buro['valor_cupos'] > 0,
            (buro['valor_utilizado'] / buro['valor_cupos']).round(9),
            reported,
        )

    buro['valor_cuotas_codeudores_smlv'] = buro['valor_cuotas_codeudores'] / buro['smlv']
    buro['valor_utilizado_smlv'] = buro['valor_utilizado'] / buro['smlv']

    var = 'numero_obligaciones_activas'

    # A row has a usable previous value only when the row above belongs to the
    # same client.
    same_client = buro['no_identificacion'] == buro['no_identificacion'].shift(1)
    buro['llave_lag'] = np.where(same_client, 1, 0)

    current = buro[var]
    previous = current.shift(1).where(same_client)
    current_empty = (current.isna() | (current == 0)).to_numpy()
    previous_empty = (previous.isna() | (previous == 0)).to_numpy()
    current_positive = (current > 0).to_numpy()
    previous_positive = (previous > 0).to_numpy()

    # Change versus the previous observation (first matching rule wins):
    #   nothing before, nothing now ->  0
    #   something before, nothing now -> -1
    #   nothing before, something now ->  1
    #   both positive                 -> relative change
    #   anything else (e.g. negatives) -> 99
    buro[var + 'dif'] = np.select(
        [
            current_empty & previous_empty,
            current_empty & previous_positive,
            previous_empty & current_positive,
            previous_positive & current_positive,
        ],
        [0, -1, 1, ((current - previous) / previous).to_numpy()],
        default=99,
    )

    output_columns = [
        'tipo_id', 'tipo_id_homologado', 'no_identificacion', 'id_cliente', 'fecha_buro',
        'numero_obligaciones_activasdif',
        'porcentaje_utilizacion',
        'quanto_mod',
        'valor_cuotas_codeudores_smlv',
        'valor_utilizado_smlv',
    ]

    # Last filter: only rows of the most recent file are returned.
    return buro.loc[buro['marca_buro'] == 2, output_columns]

def spark_read_parquet(s3_url: str, **args):
    """Load the parquet dataset stored at ``s3_url`` into a pandas DataFrame.

    Despite the name, the dataset is read with pyarrow through an s3fs
    filesystem. Extra keyword arguments are accepted but not used.

    :param s3_url: location of the parquet dataset
    :return: DataFrame with the full content of the dataset
    """
    s3_filesystem = s3fs.S3FileSystem()
    # Read the whole dataset as an Arrow table and convert it in one chain,
    # so no intermediate object outlives the call.
    return pq.ParquetDataset(s3_url, filesystem=s3_filesystem).read().to_pandas()


In [3]:
import boto3
import boto3.session

def _key_existing_size__list(client, bucket, key):
    """Look up the size of the object stored exactly at ``key``.

    :param client: object exposing ``list_objects_v2(Bucket=..., Prefix=...)``
    :param bucket: bucket name
    :param key: full object key to look for
    :return: the ``Size`` of the first listed object whose ``Key`` equals
        ``key``, or None when the listing contains no such object
    """
    listing = client.list_objects_v2(Bucket=bucket, Prefix=key)
    # The prefix search can also return longer keys, so require an exact match.
    return next(
        (entry["Size"] for entry in listing.get("Contents", []) if entry["Key"] == key),
        None,
    )
        
def _key_files__list(client, bucket, key):
    """Return every object listed under the prefix ``key``.

    ``list_objects_v2`` returns results in pages, so a single call can miss
    objects once a prefix holds many files. This keeps requesting pages while
    the response is flagged ``IsTruncated``.

    :param client: object exposing ``list_objects_v2`` (e.g. a boto3 S3 client)
    :param bucket: bucket name
    :param key: key prefix to list
    :return: list with the ``Contents`` entries of all pages (empty list when
        nothing matches)
    """
    request = {"Bucket": bucket, "Prefix": key}
    objects = []
    while True:
        response = client.list_objects_v2(**request)
        objects.extend(response.get("Contents", []))
        if not response.get("IsTruncated"):
            return objects
        # Ask for the page that follows the one just received.
        request["ContinuationToken"] = response["NextContinuationToken"]

def parquet_exists(s3_url):
    """Tell whether the dataset at ``s3_url`` contains a ``_SUCCESS`` object.

    :param s3_url: dataset location in the form ``s3://bucket/some/prefix``
        (a trailing slash is tolerated)
    :return: True when ``<prefix>/_SUCCESS`` exists in the bucket (a message is
        printed in that case), False otherwise
    """
    session = boto3.session.Session()
    s3 = session.client("s3")

    # "s3://bucket/some/prefix" -> bucket="bucket", prefix="some/prefix"
    bucket, _, prefix = s3_url.split("/", 2)[2].partition("/")

    # Drop trailing slashes: "prefix//_SUCCESS" would never match a real key.
    marker_key = prefix.rstrip("/") + "/_SUCCESS"

    if _key_existing_size__list(s3, bucket, marker_key) is None:
        return False
    print("File exists:", s3_url)
    return True

def get_s3_parquets(s3_url):
    """List the dataset folders under ``s3_url`` that contain a ``_SUCCESS`` object.

    :param s3_url: root location in the form ``s3://bucket/some/prefix``
    :return: list of ``s3://bucket/<folder>`` strings, one per ``_SUCCESS``
        object found, in the order the objects were listed
    """
    s3_client = boto3.session.Session().client("s3")

    url_parts = s3_url.split("/")
    bucket = url_parts[2]
    prefix = "/".join(url_parts[3:])

    datasets = []
    for entry in _key_files__list(s3_client, bucket, prefix):
        object_key = entry["Key"]
        if object_key.split("/")[-1] != "_SUCCESS":
            continue
        # Cut the trailing "/_SUCCESS" (9 characters) to get the folder key.
        datasets.append("s3://" + bucket + "/" + object_key[:-9])
    return datasets

In [4]:
# Root S3 prefix under which the Buró datasets are searched.
s3_url = 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero'
# Every folder below the prefix that holds a `_SUCCESS` object.
files = get_s3_parquets(s3_url)
# Display the paths that were found.
files

['s3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201811',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201905',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201908',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M201911',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202003',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-creditos/buro-comportamiento-financiero/riesgos-creditos_buro-comportamiento-financiero_experian_M202006',
 's3://data-bpop-dev-sandbox/estandarizado/riesgos-c

In [5]:
# NOTE: the numbering is inverted here: `input_path1` is the LAST entry of
# `files` and `input_path2` the one before it (assumes `files` is ordered from
# oldest to newest - TODO confirm).
input_path1 = files[-1]
input_path2 = files[-2]


# get_buro_file documents its first argument as the oldest file, hence the
# swapped order: second-to-last path first, last path second.
buro_mdt = process_buro(input_path2, input_path1)
# Manual step-by-step alternative, kept for debugging:
#buro_mdt = get_buro_file(input_path2, input_path1)
#buro_mdt = get_buro_info(buro_mdt)

get_buro_file
get_buro_info
create_buro_df
buro_df created successfully


In [6]:
# Preview the first rows of the result.
buro_mdt.head()

Unnamed: 0,tipo_id,tipo_id_homologado,no_identificacion,id_cliente,fecha_buro,numero_obligaciones_activasdif,porcentaje_utilizacion,quanto_mod,valor_cuotas_codeudores_smlv,valor_utilizado_smlv
701386,1,1000003,206,421952294130855802,202110,-0.2,0.0,3806000.0,0.0,0.0
605345,1,1000003,1112,543052294130923801,202110,0.0,0.0,1735000.0,0.0,0.0
699705,1,1000003,1974,243652294131526401,202110,-0.076923,0.002421,9530000.0,0.0,0.186016
728330,1,1000003,2245,303052294131744902,202110,0.0,0.0,2471000.0,0.0,0.0
719650,1,1000003,2460,248852294131623002,202110,-0.2,0.0,5714000.0,0.0,0.0


In [7]:
# Preview the last rows of the result.
buro_mdt.tail()

Unnamed: 0,tipo_id,tipo_id_homologado,no_identificacion,id_cliente,fecha_buro,numero_obligaciones_activasdif,porcentaje_utilizacion,quanto_mod,valor_cuotas_codeudores_smlv,valor_utilizado_smlv
603519,1,1000003,1235254689,543460626072472806,202110,0.0,0.984524,1489000.0,0.0,0.910266
626970,1,1000003,1235257401,273259232272927502,202110,0.333333,0.655758,1671000.0,0.0,1.19094
603520,1,1000003,1235538413,689353298356223902,202110,0.2,0.0,2216000.0,0.0,0.0
626841,1,1000003,1236238430,265662878988485706,202110,1.0,1.11,1762000.0,0.0,1.343935
686099,1,1000003,2000001262,428462819218377311,202110,0.0,0.896125,1480000.0,0.0,2.468834


In [8]:
# Check the column types before writing the table out.
buro_mdt.dtypes

tipo_id                            object
tipo_id_homologado                 object
no_identificacion                   int64
id_cliente                          int64
fecha_buro                          int64
numero_obligaciones_activasdif    float64
porcentaje_utilizacion            float64
quanto_mod                        float64
valor_cuotas_codeudores_smlv      float64
valor_utilizado_smlv              float64
dtype: object

## Guardando Base

In [9]:
# Period label appended to the output name (hard-coded).
# NOTE(review): the rows previewed above carry fecha_buro 202110 - confirm
# that '202112' is the intended label for this run.
periodo = '202112'
# Output location without the period suffix.
path_out = 's3://adl-refined-dev-popular/parquet/TC_adquisicion/base_buro_M'
file_name_out = path_out + periodo
# Write the result as parquet with pyarrow, without the DataFrame index.
buro_mdt.to_parquet(file_name_out,engine='pyarrow', index=False)

In [10]:
# Show the full output path that was written.
file_name_out

's3://adl-refined-dev-popular/parquet/TC_adquisicion/base_buro_M202112'

In [11]:
# NOTE(review): undefined name, raises NameError when executed - presumably a
# deliberate stop so a full run halts before the cells below; confirm.
aaaa

NameError: name 'aaaa' is not defined

## Probar

In [None]:
"""
MODULE: processing_buro
This script extracts Buró information from the Buró file
Steps:
1. Get buro files
2. Get the needed columns and rows from Buro file including the needed historical period
3. Create additional columns
"""

import numpy as np
import pandas as pd
import s3fs
import pyarrow.parquet as pq

def process_buro(input_path1, input_path2):
    """Run the three Buró processing stages in order and return the final frame.

    :param input_path1: location handed to ``get_buro_file`` as its first (oldest) file
    :param input_path2: location handed to ``get_buro_file`` as its second (most recent) file
    :return: DataFrame produced by ``create_buro_df``

    A progress label is printed after each stage finishes.
    """
    # Stage 1: read both Buró files and stack them into one frame.
    frame = get_buro_file(input_path1, input_path2)
    print('get_buro_file')

    # Stage 2 (period derivation + de-duplication) and stage 3 (derived
    # variables) each take the previous frame and return the next one.
    remaining_stages = (
        (get_buro_info, 'get_buro_info'),
        (create_buro_df, 'create_buro_df'),
    )
    for stage, label in remaining_stages:
        frame = stage(frame)
        print(label)

    print('buro_df created successfully')
    return frame


def _load_buro_snapshot(input_path, marca_buro):
    """Read one Buró parquet file and normalise its identifier columns.

    :param input_path: location of the parquet dataset, passed to ``spark_read_parquet``
    :param marca_buro: integer tag stored in the ``marca_buro`` column of every row
    :return: DataFrame with lower-cased column names, ``cont_id`` renamed to
        ``id_cliente``, rows without ``no_identificacion`` removed and
        ``no_identificacion`` cast to int64
    """
    snapshot = spark_read_parquet(input_path)
    snapshot.columns = snapshot.columns.str.lower()
    snapshot = snapshot.rename(columns={'cont_id': 'id_cliente'})
    snapshot['marca_buro'] = marca_buro

    missing_id = snapshot['no_identificacion'].isnull()
    if missing_id.any():
        # Explicit copy: the columns assigned below must not target a
        # filtered view (avoids pandas' SettingWithCopyWarning).
        snapshot = snapshot.loc[~missing_id].copy()

    # Cast through str first, exactly as before, so identifiers stored as
    # text or other objects are parsed by their textual representation.
    snapshot['no_identificacion'] = (
        snapshot['no_identificacion'].astype('str').astype(np.int64)
    )
    return snapshot


def get_buro_file(input_path1, input_path2):
    """Load the last two Buró files, stack them and keep only the needed columns.

    :param input_path1: location of the oldest buro file (rows tagged ``marca_buro`` = 1)
    :param input_path2: location of the most recent buro file (rows tagged ``marca_buro`` = 2)
    :return: DataFrame with the columns listed in ``keep_columns`` below and
        every remaining missing value replaced by 0
    """
    buro = pd.concat(
        [_load_buro_snapshot(input_path1, 1), _load_buro_snapshot(input_path2, 2)],
        ignore_index=True,
    )
    # Both inputs already have a null-free int64 `no_identificacion`, so no
    # second filter/cast is needed after concatenating.

    # Missing client ids are replaced by the sentinel -999 before the int cast.
    buro['id_cliente'] = buro['id_cliente'].fillna('-999').astype(np.int64)

    keep_columns = [
        'tipo_id', 'tipo_id_homologado', 'no_identificacion', 'fecha_envio',
        'id_cliente', 'marca_buro',
        'numero_obligaciones_activas',
        'porcentaje_utilizacion',
        'quanto_mod',
        'valor_cupos',
        'valor_utilizado',
        'valor_cuotas_codeudores',
    ]
    buro = buro[keep_columns]

    # NOTE(review): this fills EVERY selected column, including `fecha_envio`,
    # so the not-null filter on `fecha_envio` in get_buro_info no longer
    # removes anything - confirm that is intended.
    buro = buro.fillna(0)

    return buro


def get_buro_info(buro):
    """Derive the reporting period of each row and keep one row per client and period.

    :param buro: DataFrame with at least ``no_identificacion`` and ``fecha_envio``
    :return: new DataFrame (the input is not modified) without rows whose
        ``fecha_envio`` is null, with ``fecha_envio`` as text, an integer
        ``fecha_buro`` column built from the first six characters of
        ``fecha_envio`` once dashes are removed (YYYYMM for 'YYYY-MM-DD'
        dates), sorted by ``no_identificacion`` and ``fecha_buro`` and with a
        single row per (``no_identificacion``, ``fecha_buro``) pair
    """
    # Explicit copy so the column assignments below never write into (or warn
    # about) a filtered view of the caller's frame.
    buro = buro.loc[buro['fecha_envio'].notnull()].copy()

    # casting dates: strip the dashes once, then take the leading six characters
    buro['fecha_envio'] = buro['fecha_envio'].astype('str')
    compact_date = buro['fecha_envio'].str.replace('-', '', regex=False)
    buro['fecha_buro'] = compact_date.str[0:6].astype('int')

    # deleting duplicates
    buro = buro.sort_values(['no_identificacion', 'fecha_buro'])
    buro = buro.drop_duplicates(subset=['no_identificacion', 'fecha_buro'], keep='first')

    return buro


def create_buro_df(buro):
    """Build the final client-level Buró variables for the most recent file.

    Experimental variant: the reported utilisation is kept only when more than
    90% of the rows report a positive value; otherwise it is recomputed from
    the amounts (rounded to 4 decimals) and set to 0 where no credit limit
    exists.

    The input is expected to hold the rows of both Buró files, sorted by
    ``no_identificacion`` and ``fecha_buro`` (as ``get_buro_info`` returns
    them), because the "previous value" of a client is taken from the row
    immediately above.

    :param buro: DataFrame with ``tipo_id``, ``tipo_id_homologado``,
        ``no_identificacion``, ``id_cliente``, ``fecha_buro``, ``marca_buro``,
        ``numero_obligaciones_activas``, ``porcentaje_utilizacion``,
        ``quanto_mod``, ``valor_cupos``, ``valor_utilizado`` and
        ``valor_cuotas_codeudores``
    :return: new DataFrame (the input is not modified) restricted to rows with
        ``marca_buro`` == 2 and to the output columns listed at the end
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    buro = buro.copy()

    # Yearly constant used to express amounts in "smlv" units (presumably the
    # legal minimum monthly wage of that year). Years outside the table give
    # NaN, and so do the *_smlv columns derived from them.
    smlv_by_year = {2018: 781242, 2019: 828116, 2020: 877803, 2021: 908526}
    buro['ano_po'] = buro['fecha_buro'].astype(str).str[0:4].astype(int)
    buro['smlv'] = buro['ano_po'].map(smlv_by_year).astype('float64')

    # Parse the reported utilisation BEFORE comparing it with 0: text values
    # (possibly quoted, comma as decimal separator) cannot be compared with a
    # number and used to raise TypeError here.
    reported = buro['porcentaje_utilizacion']
    if pd.api.types.is_numeric_dtype(reported):
        reported = reported.astype('float64')
    else:
        reported = pd.Series(
            [float(str(x).replace('"', '').replace(',', '.')) for x in reported],
            index=buro.index,
            dtype='float64',
        )

    if (reported > 0).mean() > 0.9:
        buro['porcentaje_utilizacion'] = reported
    else:
        buro['porcentaje_utilizacion'] = np.where(
            buro['valor_cupos'] > 0,
            (buro['valor_utilizado'] / buro['valor_cupos']).round(4),
            0,
        )

    buro['valor_cuotas_codeudores_smlv'] = buro['valor_cuotas_codeudores'] / buro['smlv']
    buro['valor_utilizado_smlv'] = buro['valor_utilizado'] / buro['smlv']

    var = 'numero_obligaciones_activas'

    # A row has a usable previous value only when the row above belongs to the
    # same client.
    same_client = buro['no_identificacion'] == buro['no_identificacion'].shift(1)
    buro['llave_lag'] = np.where(same_client, 1, 0)

    current = buro[var]
    previous = current.shift(1).where(same_client)
    current_empty = (current.isna() | (current == 0)).to_numpy()
    previous_empty = (previous.isna() | (previous == 0)).to_numpy()
    current_positive = (current > 0).to_numpy()
    previous_positive = (previous > 0).to_numpy()

    # Change versus the previous observation (first matching rule wins):
    #   nothing before, nothing now ->  0
    #   something before, nothing now -> -1
    #   nothing before, something now ->  1
    #   both positive                 -> relative change
    #   anything else (e.g. negatives) -> 99
    buro[var + 'dif'] = np.select(
        [
            current_empty & previous_empty,
            current_empty & previous_positive,
            previous_empty & current_positive,
            previous_positive & current_positive,
        ],
        [0, -1, 1, ((current - previous) / previous).to_numpy()],
        default=99,
    )

    output_columns = [
        'tipo_id', 'tipo_id_homologado', 'no_identificacion', 'id_cliente', 'fecha_buro',
        'numero_obligaciones_activasdif',
        'porcentaje_utilizacion',
        'quanto_mod',
        'valor_cuotas_codeudores_smlv',
        'valor_utilizado_smlv',
    ]

    # Last filter: only rows of the most recent file are returned.
    return buro.loc[buro['marca_buro'] == 2, output_columns]

def spark_read_parquet(s3_url: str, **args):
    """Load the parquet dataset stored at ``s3_url`` into a pandas DataFrame.

    Despite the name, the dataset is read with pyarrow through an s3fs
    filesystem. Extra keyword arguments are accepted but not used.

    :param s3_url: location of the parquet dataset
    :return: DataFrame with the full content of the dataset
    """
    s3_filesystem = s3fs.S3FileSystem()
    # Read the whole dataset as an Arrow table and convert it in one chain,
    # so no intermediate object outlives the call.
    return pq.ParquetDataset(s3_url, filesystem=s3_filesystem).read().to_pandas()