# Preparing libranzas

In [1]:
import numpy as np
import pandas as pd
import datetime
from dateutil import relativedelta
import s3fs
import pyarrow.parquet as pq
import os
import calendar
import gc


pd.set_option('max_columns',None)
pd.set_option('max_rows',None)

%matplotlib inline

In [2]:
# 'num_lib_solicitadas',# libranzas
# 'prom_monto_novado', # libranzas
# 'prom_n_cuotas', # libranzas

In [3]:
import useful_functions as uf

# TODO: validate if the upcoming files will have the same amount of historical information
mes_ejec = '02' # Mes de la campaña
mes_corrida = 'febrero' #Mes de la campaña
year_lib = '2021'
str_date = pd.to_datetime(year_lib+'-'+mes_ejec, format = '%Y-%m')
mes_lib = (str_date - np.timedelta64(20, 'D')).month if (str_date - np.timedelta64(20, 'D')).month > 9 else '0' + str((str_date - np.timedelta64(20, 'D')).month)
mes_prod =  (str_date - np.timedelta64(35, 'D')).month if (str_date - np.timedelta64(35, 'D')).month > 9 else '0' + str((str_date - np.timedelta64(35, 'D')).month)

year = str(int(year_lib)-1) if mes_ejec == '01' else year_lib
year_prod = str(int(year_lib)-1) if (mes_ejec == '02') | (mes_ejec == '01') else year

HISTORICAL_YEARS = 13

In [4]:
fs = s3fs.S3FileSystem()

In [5]:
# -*- coding: utf-8 -*-

"""
module: processing_libranzas
This script extracts libranzas information from the libranzas file
Steps:
1. Get libranzas file
2. Get the needed columns and rows from libranzas file including the needed historical period
3. Create additional columns
4. Aggregate and produce client-level output data frame
5. Perform simple imputation to output data frame
"""



def process_libranzas(input_path: str) -> pd.DataFrame:
    """Using the input_path this function puts everything together"""
    # 1. Get libranzas file
    libranzas = get_libranzas_file(input_path)
    print('1')
    # 2. Get the needed columns and rows from libranzas file including the needed historical period
    libranzas = get_libranzas_info(libranzas)
    print('2')
    # 3. Create additional columns
    libranzas = add_columns_to_lib(libranzas)
    print('3')
    # 4. Aggregate and produce client-level output data frame
    libranzas = create_lib_df(libranzas)
    print('4')
    # 5. Perform simple imputation to output data frame
    libranzas_final = uf.simple_imputation(libranzas)
    libranzas_final['periodo'] = int(input_path[-6:])
    libranzas_final.reset_index(inplace=True)
    
    return libranzas_final


def get_libranzas_file(input_path):
    """ Gets the input_path to the libranzas file, drops some not useful columns
     and outputs a DataFrame
    :param input_path to libranzas file location
    :return: DataFrame
    """
    input = input_path
    dataset = pq.ParquetDataset(input, filesystem=fs)
    table = dataset.read()
    lib = table.to_pandas()
    lib.rename(columns=lambda x: x.lower(), inplace=True)
    if lib['id_cliente'].isnull().sum() > 0:
        lib = lib.loc[lib['id_cliente'].notnull()]

    lib['id_cliente'] = lib['id_cliente'].astype(np.int64)
    lib_constant_cols = ['sk_producto_servicio', 'cd_modalidad_pag_int',
                         'ds_modalidad_pag_int', 'cd_periodicidad_pag_int',
                         'ds_periodicidad_pag_int', 'cd_base_liquidacion',
                         'no_obligacion_novada']

    lib.drop(columns=lib_constant_cols, inplace=True)
    
    del table, dataset

    return lib


def get_libranzas_info(lib):
    """Gets libranzas relevant rows and columns
    :param lib to libranzas file location
    :return: DataFrame
    """
    # we are working only with "libranzas organicas"
    lib = lib.loc[lib['ds_tipo_libranza'] == 'Organica']

    # eliminating these records, they may have quality issues (less than 1%)
    lib = lib.loc[lib['ds_tipo_credito'].notnull()]

    # necessary date processing
    lib_date_cols = ['fe_solicitud', 'fe_desembolso']
    for col in lib_date_cols:
        lib[col] = pd.to_datetime(lib[col], dayfirst=True, errors='coerce')

    # getting records for the appropriate period
    end_historical_period = pd.Timestamp(uf.get_prev_months_last_date())
    number_of_years = datetime.timedelta(days=int(365.25*HISTORICAL_YEARS))
    beginning_historical_period = end_historical_period - number_of_years
    period_filter = (lib['fe_solicitud'] >= beginning_historical_period) & (lib['fe_solicitud'] <= end_historical_period)
    lib = lib.loc[period_filter]

    # imputing fe_solicitud missing dates (less than 0.5%)
    lib.loc[lib['fe_solicitud'].isnull(), 'fe_solicitud'] = lib.loc[lib['fe_solicitud'].isnull(), 'fe_desembolso']

    # getting only the relevant columns
    lib_relevant_cols = ['id_cliente', 'sk_rc_libranza', 'fe_solicitud',
                         'vl_monto_solicitado', 'fe_desembolso',
                         'vl_monto_aprobado', 'no_obligacion', 'vl_monto_desembolsado',
                         'no_cuotas', 'vl_total_cuota', 'ds_estado_actual',
                         'vl_tasa', 'ds_tipo_credito', 'ds_tipo_libranza', 'vl_monto_novado']

    lib = lib[lib_relevant_cols]
    lib = lib.loc[~lib['ds_estado_actual'].isin(['Cancelada','Castigado']),:]

    return lib


def add_columns_to_lib(lib):
    """Takes a libranzas DataFrame and adds some useful columns
    :param lib DataFrame containing libranzas relevant rows and columns
    :return: DataFrame
    """
    lib['dias_procesamiento'] = (lib['fe_desembolso'] - lib['fe_solicitud'])/np.timedelta64(1, 'D')
    # this is a necessary fix because there are many dates with fecha de solicitud in 1900
    lib.loc[lib['dias_procesamiento'] > 360, 'dias_procesamiento'] = lib['dias_procesamiento'].median()
    lib.loc[(lib['vl_monto_aprobado'] == 0)|(lib['vl_monto_aprobado'].isnull()),'vl_monto_aprobado'] = lib['vl_monto_desembolsado']

    return lib


def create_lib_df(lib):
    """Creates client-level variables by aggregating the columns
    :param lib containing libranzas relevant rows and columns
    :return: DataFrame
    """
    # producing the first data frame
    gp = lib.groupby('id_cliente')
    lib_out = pd.DataFrame(index=lib['id_cliente'].unique())

    # number of records for each cliente
    lib_out['num_lib_solicitadas'] = gp.size()

    # these have a very close relation
    lib_out['prom_monto_novado'] = gp['vl_monto_novado'].mean()

    # other averages
    lib_out['prom_n_cuotas'] = gp['no_cuotas'].mean()

    lib_out.index.rename('id_cliente', inplace=True)

    return lib_out


In [6]:
lib_input_path = os.path.join('s3://data-bpop-dev-sandbox/estandarizado/productos/libranzas/productos_libranzas_dwh_M'+year+str(mes_lib))

libranzas_df = process_libranzas(lib_input_path)

In [7]:
libranzas_df.describe()

Unnamed: 0,id_cliente,num_lib_solicitadas,prom_monto_novado,prom_n_cuotas,periodo
count,318010.0,318010.0,318010.0,318010.0,318010.0
mean,5.50683e+17,1.05969,13702240.0,96.117329,202101.0
std,2.594542e+17,0.252922,22466370.0,26.532691,0.0
min,1.010523e+17,1.0,0.0,12.0,202101.0
25%,3.272523e+17,1.0,0.0,84.0,202101.0
50%,5.497549e+17,1.0,2205893.0,99.0,202101.0
75%,7.752523e+17,1.0,19987700.0,120.0,202101.0
max,9.99958e+17,4.0,448531700.0,131.0,202101.0


In [8]:
libranzas_df.head()

Unnamed: 0,id_cliente,num_lib_solicitadas,prom_monto_novado,prom_n_cuotas,periodo
0,102652303931898101,1,51640752.0,84.0,202101
1,102652295880533801,1,16811819.0,99.0,202101
2,102652296797857801,1,14257698.0,108.0,202101
3,102652296820136301,1,0.0,120.0,202101
4,102652298458155701,1,0.0,120.0,202101


In [9]:
libranzas_df.count()

id_cliente             318010
num_lib_solicitadas    318010
prom_monto_novado      318010
prom_n_cuotas          318010
periodo                318010
dtype: int64

## Saving dataframe

In [10]:
path_out = "s3://adl-refined-dev-popular/parquet/TC_adquisicion/base_libranzas_M"
file_name_out = path_out+year+str(mes_lib)
libranzas_df.to_parquet(file_name_out,engine='pyarrow', index=False)

In [11]:
file_name_out

's3://adl-refined-dev-popular/parquet/TC_adquisicion/base_libranzas_M202101'