#### Habilitando aplicação Spark

In [1]:
# Evaluate the kernel-provided `spark` name so the session object is displayed as the cell output.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1573210380015_0001,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f9a30162c10>

#### Lendo tabela de Metadados

In [4]:
# Load the column-description CSV: the first row is the header, column types
# are inferred, and fields are comma-separated.
metadados = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ',')
    .load('s3://turing-bkt-treinamentos-etl/CreditoImobiliario/HomeCredit_columns_description.csv')
)
# Row count of the metadata table, shown as the cell output.
metadados.count()

219

In [5]:
# Expose the metadata DataFrame to spark.sql() under the name "metadados".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
metadados.createOrReplaceTempView("metadados")

In [4]:
# List the name and description of every column that the metadata table
# documents for credit_card_balance.csv; print up to 50 rows without
# truncating long descriptions.
spark.sql("""
              select 
                  Row,
                  Description                  
              from 
                  metadados
              where 
                  Table in ('credit_card_balance.csv')


""").show(n=50, truncate=False)

+--------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|Row                       |Description                                                                                                                                      |
+--------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|SK_ID_PREV                |ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)|
|SK_ID_CURR                |ID of loan in our sample                                                                                                                         |
|MONTHS_BALANCE            |Month of balance relative to application date (-1 means the freshest balance date)               

#### Lendo tabela credit_card_balance.csv

In [6]:
# Load the credit-card balance CSV: the first row is the header, column types
# are inferred, and fields are comma-separated.
credit_card_balance = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ',')
    .load('s3://turing-bkt-treinamentos-etl/CreditoImobiliario/credit_card_balance.csv')
)

# Make the DataFrame queryable from spark.sql(). createOrReplaceTempView is the
# supported replacement for registerTempTable (deprecated since Spark 2.0).
credit_card_balance.createOrReplaceTempView("credit_card_balance")
# Row count, shown as the cell output.
credit_card_balance.count()

3840312

In [6]:
# Print the inferred column names and types of the credit-card balance table.
credit_card_balance.printSchema()

root
 |-- SK_ID_PREV: integer (nullable = true)
 |-- SK_ID_CURR: integer (nullable = true)
 |-- MONTHS_BALANCE: integer (nullable = true)
 |-- AMT_BALANCE: double (nullable = true)
 |-- AMT_CREDIT_LIMIT_ACTUAL: integer (nullable = true)
 |-- AMT_DRAWINGS_ATM_CURRENT: double (nullable = true)
 |-- AMT_DRAWINGS_CURRENT: double (nullable = true)
 |-- AMT_DRAWINGS_OTHER_CURRENT: double (nullable = true)
 |-- AMT_DRAWINGS_POS_CURRENT: double (nullable = true)
 |-- AMT_INST_MIN_REGULARITY: double (nullable = true)
 |-- AMT_PAYMENT_CURRENT: double (nullable = true)
 |-- AMT_PAYMENT_TOTAL_CURRENT: double (nullable = true)
 |-- AMT_RECEIVABLE_PRINCIPAL: double (nullable = true)
 |-- AMT_RECIVABLE: double (nullable = true)
 |-- AMT_TOTAL_RECEIVABLE: double (nullable = true)
 |-- CNT_DRAWINGS_ATM_CURRENT: double (nullable = true)
 |-- CNT_DRAWINGS_CURRENT: integer (nullable = true)
 |-- CNT_DRAWINGS_OTHER_CURRENT: double (nullable = true)
 |-- CNT_DRAWINGS_POS_CURRENT: double (nullable = true)
 |

#### Explorando conceitos de negocio

In [13]:
# Show the smallest and largest MONTHS_BALANCE over the whole table, i.e. how
# far back the monthly history goes.
spark.sql("""
             select
                 min(MONTHS_BALANCE) as min,
                 max(MONTHS_BALANCE) as max

             from 
                 credit_card_balance



""").show()

+---+---+
|min|max|
+---+---+
|-96| -1|
+---+---+

In [10]:
# Count rows per NAME_CONTRACT_STATUS ("group by 1" groups by the first
# selected column); print up to 50 rows without truncating values.
spark.sql("""
             select
                 NAME_CONTRACT_STATUS,
                 count(*) as Qt
             from 
                 credit_card_balance
             group by
                 1


""").show(n=50, truncate=False)

+--------------------+-------+
|NAME_CONTRACT_STATUS|Qt     |
+--------------------+-------+
|Demand              |1365   |
|Approved            |5      |
|Completed           |128918 |
|Active              |3698436|
|Signed              |11058  |
|Sent proposal       |513    |
|Refused             |17     |
+--------------------+-------+

#### Criando variáveis de primeira camada com conceito de crédito

In [7]:
# Build one feature row per SK_ID_CURR (exposed as PK_JOIN_CB) from the monthly
# credit-card balance history.
#
# Windowed features (suffix U<NN>M):
#   * QT_CRED_<status>_U<NN>M    - number of monthly rows with that contract status
#   * VL_MED_CRED_Active_U<NN>M  - average AMT_CREDIT_LIMIT_ACTUAL over 'Active' rows
#
# MONTHS_BALANCE is relative to the application date and runs from -96 to -1,
# where -1 is the most recent month. A window covering the most recent NN
# months is therefore MONTHS_BALANCE >= -NN. The previous "<= -NN" condition
# selected only rows OLDER than NN months, so the windows shrank as NN grew and
# the U96M columns saw only the single oldest month.
# NOTE(review): this reads the U<NN>M suffix as "last NN months" - confirm
# with the feature owner.
#
# The CASE expressions have no ELSE branch, so a customer with no matching row
# in a window gets NULL (not 0) for that column.
#
# Global features: min / max / avg over the customer's whole history for the
# payment, receivable and drawing-count columns.
credit_card_balance_01 = spark.sql(
    """
     select
         SK_ID_CURR as PK_JOIN_CB,

         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -3 then 1 end) as QT_CRED_Active_U03M,
         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -6 then 1 end) as QT_CRED_Active_U06M,
         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -12 then 1 end) as QT_CRED_Active_U12M,
         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -24 then 1 end) as QT_CRED_Active_U24M,
         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -48 then 1 end) as QT_CRED_Active_U48M,
         sum(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -96 then 1 end) as QT_CRED_Active_U96M,

         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -3 then 1 end) as QT_CRED_Completed_U03M,
         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -6 then 1 end) as QT_CRED_Completed_U06M,
         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -12 then 1 end) as QT_CRED_Completed_U12M,
         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -24 then 1 end) as QT_CRED_Completed_U24M,
         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -48 then 1 end) as QT_CRED_Completed_U48M,
         sum(case when NAME_CONTRACT_STATUS in ('Completed')
             and MONTHS_BALANCE >= -96 then 1 end) as QT_CRED_Completed_U96M,

         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -3 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U03M,
         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -6 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U06M,
         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -12 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U12M,
         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -24 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U24M,
         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -48 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U48M,
         avg(case when NAME_CONTRACT_STATUS in ('Active')
             and MONTHS_BALANCE >= -96 then AMT_CREDIT_LIMIT_ACTUAL end) as VL_MED_CRED_Active_U96M,

         min(AMT_PAYMENT_CURRENT) as MIN_AMT_PAYMENT_CURRENT,
         max(AMT_PAYMENT_CURRENT) as MAX_AMT_PAYMENT_CURRENT,
         avg(AMT_PAYMENT_CURRENT) as AVG_AMT_PAYMENT_CURRENT,

         min(AMT_PAYMENT_TOTAL_CURRENT) as MIN_AMT_PAYMENT_TOTAL_CURRENT,
         max(AMT_PAYMENT_TOTAL_CURRENT) as MAX_AMT_PAYMENT_TOTAL_CURRENT,
         avg(AMT_PAYMENT_TOTAL_CURRENT) as AVG_AMT_PAYMENT_TOTAL_CURRENT,

         min(AMT_RECEIVABLE_PRINCIPAL) as MIN_AMT_RECEIVABLE_PRINCIPAL,
         max(AMT_RECEIVABLE_PRINCIPAL) as MAX_AMT_RECEIVABLE_PRINCIPAL,
         avg(AMT_RECEIVABLE_PRINCIPAL) as AVG_AMT_RECEIVABLE_PRINCIPAL,

         min(AMT_RECIVABLE) as MIN_AMT_RECIVABLE,
         max(AMT_RECIVABLE) as MAX_AMT_RECIVABLE,
         avg(AMT_RECIVABLE) as AVG_AMT_RECIVABLE,

         min(CNT_DRAWINGS_CURRENT) as MIN_CNT_DRAWINGS_CURRENT,
         max(CNT_DRAWINGS_CURRENT) as MAX_CNT_DRAWINGS_CURRENT,
         avg(CNT_DRAWINGS_CURRENT) as AVG_CNT_DRAWINGS_CURRENT

     from
         credit_card_balance
     group by
         SK_ID_CURR
""")

# Make the feature table queryable from spark.sql(). createOrReplaceTempView is
# the supported replacement for registerTempTable (deprecated since Spark 2.0).
credit_card_balance_01.createOrReplaceTempView("credit_card_balance_01")
# Number of feature rows (one per distinct SK_ID_CURR), shown as the cell output.
credit_card_balance_01.count()

103558

In [16]:
# Print the column names and types of the aggregated credit-card feature table.
credit_card_balance_01.printSchema()

root
 |-- PK_JOIN_CB: integer (nullable = true)
 |-- QT_CRED_Active_U03M: long (nullable = true)
 |-- QT_CRED_Active_U06M: long (nullable = true)
 |-- QT_CRED_Active_U12M: long (nullable = true)
 |-- QT_CRED_Active_U24M: long (nullable = true)
 |-- QT_CRED_Active_U48M: long (nullable = true)
 |-- QT_CRED_Active_U96M: long (nullable = true)
 |-- QT_CRED_Completed_U03M: long (nullable = true)
 |-- QT_CRED_Completed_U06M: long (nullable = true)
 |-- QT_CRED_Completed_U12M: long (nullable = true)
 |-- QT_CRED_Completed_U24M: long (nullable = true)
 |-- QT_CRED_Completed_U48M: long (nullable = true)
 |-- QT_CRED_Completed_U96M: long (nullable = true)
 |-- VL_MED_CRED_Active_U03M: double (nullable = true)
 |-- VL_MED_CRED_Active_U06M: double (nullable = true)
 |-- VL_MED_CRED_Active_U12M: double (nullable = true)
 |-- VL_MED_CRED_Active_U24M: double (nullable = true)
 |-- VL_MED_CRED_Active_U48M: double (nullable = true)
 |-- VL_MED_CRED_Active_U96M: double (nullable = true)
 |-- MIN_AMT_PAY

#### Lendo APP_TRAIN + BUREAU

In [14]:
# Load the previously saved ABT (v03). No format is passed to load(), so Spark's
# default data source is used (presumably parquet - confirm cluster config).
ABT_01 = spark.read.load("s3://treinamento-big-data-aws/Turma_20190916/ABT_v03_BrunoJ")

# Make the table queryable from spark.sql(). createOrReplaceTempView is the
# supported replacement for registerTempTable (deprecated since Spark 2.0).
ABT_01.createOrReplaceTempView("ABT_01")
# Row count, shown as the cell output.
ABT_01.count()

307511

In [17]:
# Enrich the ABT with the credit-card features. The LEFT JOIN keeps every ABT_01
# row; rows with no match in credit_card_balance_01 get NULL in the b.* columns.
# b.* also carries the join key PK_JOIN_CB, which duplicates SK_ID_CURR on
# matched rows.
ETL_FINAL_v03 = spark.sql("""

select 
        a.*,
        b.*
    from 
        ABT_01 as a
    left join
        credit_card_balance_01 as b
    on 
        a.SK_ID_CURR = b.PK_JOIN_CB
""")

In [19]:
# Print the column names and types of the joined table.
ETL_FINAL_v03.printSchema()

root
 |-- SK_ID_CURR: integer (nullable = true)
 |-- TARGET: integer (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- AMT_GOODS_PRICE: double (nullable = true)
 |-- NAME_TYPE_SUITE: string (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- REGION_POPULATION_RELATIVE: double (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- DAYS_REGISTRATION: double (nullable = true)
 |-- DAYS_ID_PUBLISH: integer (nullable = true)
 |-- OWN_CAR_AG

In [20]:
# Persist the joined table as parquet under the ABT_v04 prefix, replacing any
# data that is already there.
nm_path_s3 = 's3://treinamento-big-data-aws/Turma_20190916/ABT_v04_BrunoJ/'
ETL_FINAL_v03.write.mode('overwrite').parquet(nm_path_s3)

In [15]:
# Print the column names and types of the loaded ABT.
ABT_01.printSchema()

root
 |-- SK_ID_CURR: integer (nullable = true)
 |-- TARGET: integer (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- AMT_GOODS_PRICE: double (nullable = true)
 |-- NAME_TYPE_SUITE: string (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- REGION_POPULATION_RELATIVE: double (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- DAYS_REGISTRATION: double (nullable = true)
 |-- DAYS_ID_PUBLISH: integer (nullable = true)
 |-- OWN_CAR_AG

#### Estratégia 01 - trazer todas as variáveis numéricas da tabela de público

In [38]:
# Split the column names of application_train into three lists according to
# the prefix of each column's dtype string ('int', 'double', 'string').
# NOTE(review): application_train is not created in the visible part of this
# notebook - it must already exist in the kernel for this cell to run.
columnList_int, columnList_double, columnList_string = (
    [col_name for col_name, col_type in application_train.dtypes
     if col_type.startswith(type_prefix)]
    for type_prefix in ('int', 'double', 'string')
)

In [36]:
# Display the names of the integer-typed columns.
columnList_int

['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

In [37]:
# Display the names of the double-typed columns.
columnList_double

['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',

In [39]:
# Display the names of the string-typed columns.
columnList_string

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

In [40]:
# Keep only the non-string columns: drop every column listed in columnList_string.
# columns_to_drop is another name for the same list object, not a copy.
columns_to_drop = columnList_string
application_train_01 = application_train.drop(*columns_to_drop)

In [41]:
# Print the schema that remains after the string columns were dropped.
application_train_01.printSchema()

root
 |-- SK_ID_CURR: integer (nullable = true)
 |-- TARGET: integer (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- AMT_GOODS_PRICE: double (nullable = true)
 |-- REGION_POPULATION_RELATIVE: double (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- DAYS_REGISTRATION: double (nullable = true)
 |-- DAYS_ID_PUBLISH: integer (nullable = true)
 |-- OWN_CAR_AGE: double (nullable = true)
 |-- FLAG_MOBIL: integer (nullable = true)
 |-- FLAG_EMP_PHONE: integer (nullable = true)
 |-- FLAG_WORK_PHONE: integer (nullable = true)
 |-- FLAG_CONT_MOBILE: integer (nullable = true)
 |-- FLAG_PHONE: integer (nullable = true)
 |-- FLAG_EMAIL: integer (nullable = true)
 |-- CNT_FAM_MEMBERS: double (nullable = true)
 |-- REGION_RATING_CLIENT: integer (nullable = true)
 |-- REGION_RATING_CLIENT_W_

In [42]:
# Expose the numeric-only table to spark.sql() as "application_train_01".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
application_train_01.createOrReplaceTempView("application_train_01")

#### Gerando tabela de modelagem final

In [48]:
# Join the numeric-only application table with the bureau features. The LEFT
# JOIN keeps every application_train_01 row; rows with no match in
# bureau_etl_01 get NULL in the b.* columns.
# NOTE(review): the bureau_etl_01 view is not registered in the visible part of
# this notebook - it must already exist in the Spark session.
ETL_FINAL_v01 = spark.sql("""

select 
        a.*,
        b.*
    from 
        application_train_01 as a
    left join
        bureau_etl_01 as b
    on 
        a.SK_ID_CURR = b.PK_JOIN
""")

In [49]:
# Row count of the joined table, shown as the cell output.
ETL_FINAL_v01.count()

307511

#### Salvando ABT para etapa de modelagem

In [50]:
# Persist the strategy-01 table as parquet under the ABT_v01 prefix, replacing
# any data that is already there.
nm_path_s3 = 's3://treinamento-big-data-aws/Turma_20190916/ABT_v01_BrunoJ/'
ETL_FINAL_v01.write.mode('overwrite').parquet(nm_path_s3)

#### Estratégia 02 - trazer todas as variáveis do público + bureau

In [13]:
# Join the full application table (all columns, including strings) with the
# bureau features. The LEFT JOIN keeps every application_train row; rows with
# no match in bureau_etl_01 get NULL in the b.* columns.
# NOTE(review): neither the application_train nor the bureau_etl_01 view is
# registered in the visible part of this notebook - both must already exist in
# the Spark session.
ETL_FINAL_v02 = spark.sql("""

select 
        a.*,
        b.*
    from 
        application_train as a
    left join
        bureau_etl_01 as b
    on 
        a.SK_ID_CURR = b.PK_JOIN
""")

In [14]:
# Persist the strategy-02 table as parquet under the ABT_v02 prefix, replacing
# any data that is already there.
nm_path_s3 = 's3://treinamento-big-data-aws/Turma_20190916/ABT_v02_BrunoJ/'
ETL_FINAL_v02.write.mode('overwrite').parquet(nm_path_s3)

In [6]:
# Count application rows per NAME_CONTRACT_TYPE ("group by 1" groups by the
# first selected column).
spark.sql("""
            select
                NAME_CONTRACT_TYPE,
                count(*) as QT
            from 
                application_train
            group by
    1

""").show()

+------------------+------+
|NAME_CONTRACT_TYPE|    QT|
+------------------+------+
|   Revolving loans| 29279|
|        Cash loans|278232|
+------------------+------+

In [7]:
# Count application rows per CODE_GENDER ("group by 1" groups by the first
# selected column).
spark.sql("""
            select
                CODE_GENDER,
                count(*) as QT
            from 
                application_train
            group by
    1

""").show()

+-----------+------+
|CODE_GENDER|    QT|
+-----------+------+
|          F|202448|
|          M|105059|
|        XNA|     4|
+-----------+------+

In [8]:
# Count application rows per FLAG_OWN_CAR ("group by 1" groups by the first
# selected column).
spark.sql("""
            select
                FLAG_OWN_CAR,
                count(*) as QT
            from 
                application_train
            group by
    1

""").show()

+------------+------+
|FLAG_OWN_CAR|    QT|
+------------+------+
|           Y|104587|
|           N|202924|
+------------+------+

In [None]:
# Select the categorical (string-typed) columns of application_train.
# The previous draft of this query could not run: commas were missing after
# CODE_GENDER, FLAG_OWN_CAR and FLAG_OWN_REALTY, the remaining column names
# were single-quoted (which Spark SQL reads as string literals, not columns),
# and there was no FROM clause.
# NOTE(review): application_train is assumed to be the source table, as in the
# exploratory queries over these same columns - confirm.
application_train_02 = spark.sql("""
    select
        NAME_CONTRACT_TYPE,
        CODE_GENDER,
        FLAG_OWN_CAR,
        FLAG_OWN_REALTY,
        NAME_TYPE_SUITE,
        NAME_INCOME_TYPE,
        NAME_EDUCATION_TYPE,
        NAME_FAMILY_STATUS,
        NAME_HOUSING_TYPE,
        OCCUPATION_TYPE,
        WEEKDAY_APPR_PROCESS_START,
        ORGANIZATION_TYPE,
        FONDKAPREMONT_MODE,
        HOUSETYPE_MODE,
        WALLSMATERIAL_MODE,
        EMERGENCYSTATE_MODE
    from
        application_train
""")