# PREPAID CHURN - Data preparation

## Initialization of the Spark session

In [176]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [177]:
from common.src.main.python.utils.hdfs_generic import *
import os

MAX_N_EXECUTORS=15
MIN_N_EXECUTORS=1
N_CORES_EXECUTOR=4
EXECUTOR_IDLE_MAX_TIME=120
EXECUTOR_MEMORY='32g'
DRIVER_MEMORY='16g'
N_CORES_DRIVER=1
MEMORY_OVERHEAD=N_CORES_EXECUTOR*2048
#QUEUE="root.datascience.normal"
QUEUE="root.BDPtenants.es.medium"

BDA_CORE_VERSION="1.0.0"

SPARK_COMMON_OPTS=os.environ.get('SPARK_COMMON_OPTS', '')
SPARK_COMMON_OPTS+=" --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY)
SPARK_COMMON_OPTS+=" --conf spark.shuffle.manager=tungsten-sort"
SPARK_COMMON_OPTS+="  --queue %s" % QUEUE
APP_NAME='PrepaidChurnDataPreparation'

# Dynamic allocation configuration
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.shuffle.service.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME)
SPARK_COMMON_OPTS+=" --conf spark.ui.port=58201"
SPARK_COMMON_OPTS+=" --conf spark.port.maxRetries=200"
SPARK_COMMON_OPTS+=" --executor-cores=%s" % (N_CORES_EXECUTOR)
SPARK_COMMON_OPTS+=" --conf spark.app.name=%s" % (APP_NAME)

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase
SPARK_COMMON_OPTS+=" --files \
{}/scripts/properties/red_agent/nodes.properties,\
{}/scripts/properties/red_agent/nodes-de.properties,\
{}/scripts/properties/red_agent/nodes-es.properties,\
{}/scripts/properties/red_agent/nodes-ie.properties,\
{}/scripts/properties/red_agent/nodes-it.properties,\
{}/scripts/properties/red_agent/nodes-pt.properties,\
{}/scripts/properties/red_agent/nodes-uk.properties".format(*[BDA_ENV]*7)

os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

#print os.environ.get('SPARK_COMMON_OPTS', '')
#print os.environ.get('PYSPARK_SUBMIT_ARGS', '')

sc, sparkSession, sqlContext = run_sc()
print sc.defaultParallelism

2


In [178]:
# This literal_eval is needed since 
# we have to read from a textfile
# which is formatted as python objects.
# It is totally safe.
from ast import literal_eval

# Standard Library stuff:
from functools import partial
from datetime import date, timedelta, datetime

# Numpy stuff
from numpy import (nan as np_nan, round as np_round, int64 as np_int64)
import numpy as np


# Spark stuff
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.sql.functions import (udf, col, decode, when, lit, lower, upper, concat,
                                   translate, count, sum as sql_sum, max as sql_max, min as sql_min,
                                   round, 
                                   mean, stddev, datediff,
                                   length,
                                   countDistinct,
                                   hour, date_format, collect_set, collect_list,
                                   year, month, dayofmonth,
                                   rank, expr, lag, coalesce, row_number,
                                   isnull, isnan,
                                   unix_timestamp,
                                   regexp_replace
                                  )
from pyspark.sql.types import DoubleType, StringType, IntegerType, ArrayType, FloatType

from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.window import Window

import json
from collections import OrderedDict

from subprocess import Popen, PIPE
import datetime, calendar
from pyspark.sql import functions as F

import pandas as pd
pd.set_option('display.max_columns', 500)


In [179]:
spark = (SparkSession.builder
         .appName("Prepaid Churn Model - Data Preparation")
         .master("yarn")
         .config("spark.submit.deployMode", "client")
         .config("spark.ui.showConsoleProgress", "true")
         .enableHiveSupport()
         .getOrCreate()
         )

In [180]:
import datetime as dt
start_time = dt.datetime.now()

## Definición de fechas para la preparación de los datos.

In [181]:
from dateutil.relativedelta import relativedelta
from pyspark.sql.functions import substring
from pyspark.sql.functions import (unix_timestamp, udf,col,max as sql_max, avg, stddev as sql_stddev, when, count, isnull, concat, lpad, trim, lit, sum as sql_sum, length, upper)

right_now = dt.datetime.now()
today = right_now.day

#Mirar en hue para ver de qué meses tenemos datos

MONTH_ANALYSIS = "20191130"

MONTH_BEFORE_M_1 = "20190930"
MONTH_BEFORE_M_2 = "20190831"
MONTH_BEFORE_M_3 = "20190731" 
MONTH_BEFORE_M_4 = "20190630"
MONTH_BEFORE_M_5 = "20190531"

## Comenzamos obteniendo aquellos clientes cuyas líneas llevan al menos 4 meses activas.

- Month M-1:

In [182]:
hdfs_partition_path = 'year=' + str(int(MONTH_BEFORE_M_1[:4])) + '/month=' + str(int(MONTH_BEFORE_M_1[4:6])) + '/day=14' # Cogemos la 'foto' de mitad de mes

hdfs_write_path_common='/data/udf/vf_es/amdocs_ids/'

path_customer = hdfs_write_path_common +'customer/'+hdfs_partition_path
path_service = hdfs_write_path_common +'service/'+hdfs_partition_path

customerDF_load = (spark.read.load(path_customer))
serviceDF_load = (spark.read.load(path_service))

month_M_1 = (customerDF_load
              .join(serviceDF_load, 'NUM_CLIENTE', 'inner')
             )

In [183]:
month_M_1 = month_M_1.filter(col('RGU') == 'prepaid')

- Month M-2:

In [184]:
month_to_predict_M_2 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPFINAL_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_2[0:6]+'/year='+MONTH_BEFORE_M_2[0:4]+'/month='+str(int(MONTH_BEFORE_M_2[4:6])))
                   ) #buscar en hue para ver de qué meses tenemos datos

In [185]:
month_to_predict_clean_M_2= month_to_predict_M_2.drop(*[ '_c37','_c38','_c39','_c40','_c41','_c42', '_c43', '_c44', '_c45', '_c46', '_c47', '_c48', '_c49', '_c50',
                                                '_c51', '_c52', '_c53', '_c54', '_c55', '_c56', '_c57', '_c58', 'day'])

In [186]:
month_previo = spark.read.table('raw_es.vf_pre_ac_final')

In [187]:
month_previo_clean = month_previo.drop(*['sfid_canje',
 'partitioned_month',
 'year',
 'month',
 'day'])

In [188]:
i = 0
while i < len(month_to_predict_clean_M_2.columns):
    month_to_predict_clean_M_2 = month_to_predict_clean_M_2.withColumnRenamed(month_to_predict_clean_M_2.columns[i], month_previo_clean.columns[i])
    i += 1

In [189]:
month_M_2 = (month_to_predict_clean_M_2
           .where(col("estado_servicio") == 'AC')
           .select(["Fecha_ejecucion", 'num_documento_cliente', "msisdn", "num_prepago", "estado_servicio",
                      "num_pospago","tipo_documento_comprador", "codigo_plan_precios",
                      "x_fecha_nacimiento", "fx_1llamada", 'min_llam_ultmes', 'num_sms_ultmes', 'ult3meses_total', 'media_ult3meses', 'diasdesdeultrecarga',
                      'numrecargasult3meses_total', 'cobertura_4g', 'lortad', 'deuda', 'flag_huella_ono',
                      'flag_4g_aperturas', 'flag_4g_nodos', 'flag_huella_vf', 'flag_huella_neba', 'flag_huella_euskaltel',
                      'flag_beneficio_activo'])
          )

- Month M-3:

In [190]:
month_to_predict_M_3 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPFINAL_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_3[0:6]+'/year='+MONTH_BEFORE_M_3[0:4]+'/month='+str(int(MONTH_BEFORE_M_3[4:6])))
                   )

In [191]:
month_to_predict_clean_M_3 = month_to_predict_M_3.drop(*[ '_c37','_c38','_c39','_c40','_c41','_c42', '_c43', '_c44', '_c45', '_c46', '_c47', '_c48', '_c49', '_c50',
                                                '_c51', '_c52', '_c53', '_c54', '_c55', '_c56', '_c57', '_c58', 'day'])

In [192]:
i = 0
while i < len(month_to_predict_clean_M_3.columns):
    month_to_predict_clean_M_3 = month_to_predict_clean_M_3.withColumnRenamed(month_to_predict_clean_M_3.columns[i], month_previo_clean.columns[i])
    i += 1

In [193]:
month_M_3 = (month_to_predict_clean_M_3
           .where(col("estado_servicio") == 'AC')
           .select(["msisdn"])
          )

- Month M-4:

In [194]:
month_to_predict_M_4 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPFINAL_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_4[0:6]+'/year='+MONTH_BEFORE_M_4[0:4]+'/month='+str(int(MONTH_BEFORE_M_4[4:6])))
                   )

In [195]:
month_to_predict_clean_M_4 = month_to_predict_M_4.drop(*[ '_c37','_c38','_c39','_c40','_c41','_c42', '_c43', '_c44', '_c45', '_c46', '_c47', '_c48', '_c49', '_c50',
                                                '_c51', '_c52', '_c53', '_c54', '_c55', '_c56', '_c57', '_c58', 'day'])

In [196]:
i = 0
while i < len(month_to_predict_clean_M_4.columns):
    month_to_predict_clean_M_4 = month_to_predict_clean_M_4.withColumnRenamed(month_to_predict_clean_M_4.columns[i], month_previo_clean.columns[i])
    i += 1

In [197]:
month_M_4 = (month_to_predict_clean_M_4
           .where(col("estado_servicio") == 'AC')
           .select(["msisdn"])
          )

- Month M-5:

In [198]:
month_to_predict_M_5 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPFINAL_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_5[0:6]+'/year='+MONTH_BEFORE_M_5[0:4]+'/month='+str(int(MONTH_BEFORE_M_5[4:6])))
                   )

In [199]:
month_to_predict_clean_M_5 = month_to_predict_M_5.drop(*[ '_c37','_c38','_c39','_c40','_c41','_c42', '_c43', '_c44', '_c45', '_c46', '_c47', '_c48', '_c49', '_c50',
                                                '_c51', '_c52', '_c53', '_c54', '_c55', '_c56', '_c57', '_c58', 'day'])

In [200]:
i = 0
while i < len(month_to_predict_clean_M_5.columns):
    month_to_predict_clean_M_5 = month_to_predict_clean_M_5.withColumnRenamed(month_to_predict_clean_M_5.columns[i], month_previo_clean.columns[i])
    i += 1

In [201]:
month_M_5 = (month_to_predict_clean_M_5
           .where(col("estado_servicio") == 'AC')
           .select(["msisdn"])
          )

### Join de tablas:

In [202]:
df_joined_pre1 = month_M_1.join(month_M_2, on = 'msisdn', how = 'inner')

In [203]:
df_joined_pre2 = df_joined_pre1.join(month_M_3, on = 'msisdn', how = 'inner')

In [204]:
df_prepago_pre3 = df_joined_pre2.join(month_M_4, on = 'msisdn', how = 'inner')

In [205]:
df_prepago_PREVIOS = df_prepago_pre3.join(month_M_5, on = 'msisdn', how = 'inner')

In [206]:
df_prepago_PREVIOS.count()

1521622

## Limpieza de nacionalidades.

In [207]:
most_frequent_countries = [
                           u"marruecos",
                           u"argelia",
                           u"rumania",
                           u"colombia",
                           u"italia",
                           u"ecuador",
                           u"alemania",
                           u"bulgaria",
                           u"francia",
                           u"brasil",
                           u"argentina",
                           u"bolivia",
                           u"portugal",
                           u"paraguay",
                           u"china",
                           u"venezuela",
                           u"honduras",
                           u"india",
                           u"corea del sur"
                          ]

espana = ["españa", "espana"]
pakistan = ["pakistán", "pakistan"]
mexico = ['mejico', 'méjico', 'mexico', 'méxico']
peru = ['peru', 'perú']
japon = ['japón', 'japon']
ucrania = ['ukrania', 'ucrania']
dominicanos = ['república dominicana', 'rep. dominicana', 'republica dominicana', 'r. dominicana']
afganistan = ['afganistan', 'afganistán']
rusia = ['rusia', 'rusia blanca']
reino_unido = ['reino unido', 'gran bretaña', 'united kingdom', 'uk', 'u.k.']
estados_unidos = ['estados unidos', 'estados unidos de america', 'estados unidos de américa',
                  'united states', 'united states of america', 'usa', 'u.s.a.']

df_prepago_PREVIOS = df_prepago_PREVIOS.withColumn("nacionalidad",
                                       when(lower(col("nacionalidad"))
                                            .isin(most_frequent_countries+espana+reino_unido+estados_unidos+peru+pakistan+mexico+japon+ucrania+dominicanos+afganistan+rusia),
                                            lower(col("nacionalidad")))
                                       .otherwise(lit("Other")))

df_prepago_PREVIOS = df_prepago_PREVIOS.withColumn('nacionalidad_final', 
                                                           when(lower(col('nacionalidad')).isin(reino_unido), 'REINO UNIDO')
                                               .when(lower(col('nacionalidad')).isin(espana), 'ESPAÑA')
                                               .when(lower(col('nacionalidad')).isin(estados_unidos), 'ESTADOS UNIDOS')
                                               .when(lower(col('nacionalidad')).isin(mexico), 'MEXICO')
                                               .when(lower(col('nacionalidad')).isin(rusia), 'RUSIA')
                                               .when(lower(col('nacionalidad')).isin(japon), 'JAPON')
                                               .when(lower(col('nacionalidad')).isin(ucrania), 'UCRANIA')
                                               .when(lower(col('nacionalidad')).isin(dominicanos), 'R. DOMINICANA')
                                               .when(lower(col('nacionalidad')).isin(afganistan), 'AFGANISTAN')
                                               .when(lower(col('nacionalidad')).isin(pakistan), 'PAKISTAN')
                                               .when(lower(col('nacionalidad')).isin(peru), 'PERU').otherwise("OTHER")
                                              )

In [208]:
df_prepago_PREVIOS = df_prepago_PREVIOS.withColumn('tipo_documento_comprador',
                                       when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('N%I%F%'), 'N.I.F.')
                                       .when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('D%N%I%'), 'N.I.F.')
                                       .when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('C%I%F%'), 'C.I.F.')
                                       .when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('N%I%E%'), 'N.I.E.')
                                       .when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('TARJ%RESI%'), 'N.I.E.')
                                       .when(upper(df_prepago_PREVIOS['tipo_documento_comprador']).like('PAS%'), 'Pasaporte')
                                       .otherwise(''))


## Definición de target con tabla de portabilidades.

---

- Ejecutar sólo cuando preparemos el IDS de predicción:

In [209]:
df_prepago_labeled = df_prepago_PREVIOS.withColumn('Churned', lit(None).cast(IntegerType()))

---

- Ejecutar sólo cuando preparemos el IDS de entrenamiento:

In [34]:
portados = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREDNOACTIV_DATALAB_M/1.1/csv/partitioned_month='+ 
                          str(MONTH_ANALYSIS[0:6])+'/year='+MONTH_ANALYSIS[0:4]+'/month='+str(int(MONTH_ANALYSIS[4:6]))
                         )
           )

In [35]:
portados = (portados
            .withColumnRenamed('_c0', 'msisdn')
            .withColumnRenamed('_c1', 'motivo_desactivacion')
            .withColumnRenamed('_c2', 'extractdate')
           )

In [36]:
df_definitivo = df_prepago_PREVIOS.join(portados.select('msisdn', 'motivo_desactivacion'), how = 'leftouter', on = 'msisdn')

In [37]:
df_prepago_labeled = df_definitivo.withColumn('Churned', when(col('motivo_desactivacion') == 'PORTADO', 1).otherwise(0)).drop(*['motivo_desactivacion'])

In [38]:
df_prepago_labeled.groupBy('Churned').count().show()

+-------+-------+
|Churned|  count|
+-------+-------+
|      1|  10766|
|      0|1629660|
+-------+-------+



---

## Añadimos información de diferentes fuentes para ir construyendo nuestro IDS.

- ## Cálculos agregados:

### Lectura de tabla de transferencias de saldo.

In [210]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, struct, concat_ws
from pyspark.sql.types import TimestampType

conv_to_timestamp_udf = udf(lambda x: from_unixtime(unix_timestamp(str(x[0])+"/"+str(x[1])+"/"+str(x[2]), 'dd/MM/yyy')), TimestampType())

In [211]:
df_balance_receptor_M_1 = (spark.read.table("raw_es.prepaid_transfbalance")
                        .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                        .filter(col("day") <= 15)
                        .groupBy(["msisdn_receptor", "month", "year"])
                        .agg(sql_sum("importe_traspasado").alias("importe_traspasado_receptor_M-1"),
                                count("importe_traspasado").alias("num_rec_M-1"))
                        .withColumnRenamed('msisdn_receptor', 'msisdn')
                        .select(col("msisdn"),
                                col("importe_traspasado_receptor_M-1"),
                               col("num_rec_M-1")))
    
df_balance_receptor_M_2 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                                      .groupBy(["msisdn_receptor", "month", "year"])
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_receptor_M-2"),
                                           count("importe_traspasado").alias("num_rec_M-2"))
                                      .withColumnRenamed('msisdn_receptor', 'msisdn')
                                      .select(col("msisdn"),
                                              col("importe_traspasado_receptor_M-2"),
                                             col("num_rec_M-2")))
    
df_balance_receptor_M_3 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                                      .groupBy(["msisdn_receptor", "month", "year"])
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_receptor_M-3"),
                                           count("importe_traspasado").alias("num_rec_M-3"))
                                      .withColumnRenamed('msisdn_receptor', 'msisdn')
                                      .select(col("msisdn"),
                                              col("importe_traspasado_receptor_M-3"),
                                             col("num_rec_M-3")))
    
df_balance_receptor_M_4 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                                      .groupBy(["msisdn_receptor", "month", "year"])
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_receptor_M-4"),
                                           count("importe_traspasado").alias("num_rec_M-4"))
                                      .withColumnRenamed('msisdn_receptor', 'msisdn')
                                      .select(col("msisdn"),
                                              col("importe_traspasado_receptor_M-4"),
                                             col("num_rec_M-4")))
                                       

In [212]:
df_balance_receptor_pre1 = (df_balance_receptor_M_1
                       .join(df_balance_receptor_M_2, on = 'msisdn' ,how = 'left'))

df_balance_receptor_pre2 = (df_balance_receptor_pre1
                            .join(df_balance_receptor_M_3, on = 'msisdn' ,how = 'left'))

df_balance_receptor = (df_balance_receptor_pre2
                       .join(df_balance_receptor_M_4, on = 'msisdn' ,how = 'left'))


In [213]:
df_balance_emisor_M_1 = (spark.read.table("raw_es.prepaid_transfbalance")
                        .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                        .filter(col("day") < 15)
                        .withColumnRenamed('msisdn_emisor', 'msisdn')
                                      .groupBy("msisdn")
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_emisor_M-1"),
                                           sql_sum("importe_cargo").alias("importe_cargo_emisor_M-1"),
                                           count("importe_traspasado").alias("num_em_M-1"))
                                      .select(col("msisdn"), 
                                              col("importe_traspasado_emisor_M-1"),
                                              col("importe_cargo_emisor_M-1"),
                                             col("num_em_M-1")))

df_balance_emisor_M_2 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                         .withColumnRenamed('msisdn_emisor', 'msisdn')
                                      .groupBy("msisdn")
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_emisor_M-2"),
                                           sql_sum("importe_cargo").alias("importe_cargo_emisor_M-2"),
                                           count("importe_traspasado").alias("num_em_M-2"))
                                      .select(col("msisdn"), 
                                              col("importe_traspasado_emisor_M-2"),
                                              col("importe_cargo_emisor_M-2"),
                                             col("num_em_M-2")))

df_balance_emisor_M_3 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                         .withColumnRenamed('msisdn_emisor', 'msisdn')
                                      .groupBy("msisdn")
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_emisor_M-3"),
                                           sql_sum("importe_cargo").alias("importe_cargo_emisor_M-3"),
                                           count("importe_traspasado").alias("num_em_M-3"))
                                      .select(col("msisdn"), 
                                              col("importe_traspasado_emisor_M-3"),
                                              col("importe_cargo_emisor_M-3"),
                                             col("num_em_M-3")))

df_balance_emisor_M_4 = (spark.read.table("raw_es.prepaid_transfbalance")
              .withColumn('entry_ts', from_unixtime(unix_timestamp(concat_ws('/',"day", "month", "year"), 'dd/MM/yyy')))
                        .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                         .withColumnRenamed('msisdn_emisor', 'msisdn')
                                      .groupBy("msisdn")
                                      .agg(sql_sum("importe_traspasado").alias("importe_traspasado_emisor_M-4"),
                                           sql_sum("importe_cargo").alias("importe_cargo_emisor_M-4"),
                                           count("importe_traspasado").alias("num_em_M-4"))
                                      .select(col("msisdn"), 
                                              col("importe_traspasado_emisor_M-4"),
                                              col("importe_cargo_emisor_M-4"),
                                             col("num_em_M-4")))
                                       

In [214]:
df_balance_emisor_pre1 = (df_balance_emisor_M_1
                       .join(df_balance_emisor_M_2, on = 'msisdn' ,how = 'left'))

df_balance_emisor_pre2 = (df_balance_emisor_pre1
                            .join(df_balance_emisor_M_3, on = 'msisdn' ,how = 'left'))

df_balance_emisor = (df_balance_emisor_pre2
                       .join(df_balance_emisor_M_4, on = 'msisdn' ,how = 'left'))

### Información de Topups.

In [215]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

udf_parse_importe = udf(lambda x:int(x.replace("\+",""))/10000,IntegerType())

In [216]:
df_topups_M_1 = (spark.read.table("raw_es.billingtopsups_rechargescash")
                        .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                        .filter(col("day") <= 15) # El modelo se suele ejecutar el día 20/21 de cada mes, y el retraso con el que llega la información suele ser de 2 o 3 días.
                        .withColumnRenamed('ndc_msisdn', 'msisdn')
                       .withColumn('importe_int_M-1', regexp_replace('importe', "\\+", ''))
                       .withColumn('importe_int_corrected_M-1', col('importe_int_M-1')/10000)
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_int_corrected_M-1").alias("tu_amount_M-1"),
                               count("importe").alias("tu_num_M-1"))
                        .select(col("msisdn"),
                                col("tu_amount_M-1"),
                                col("tu_num_M-1"))
                       .withColumn("tu_bin_M-1", when(col("tu_num_M-1")>0, 1).otherwise(0))
                      )

df_topups_M_2 = (spark.read.table("raw_es.billingtopsups_rechargescash")
                            .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                       .withColumn('importe_int_M-2', regexp_replace('importe', "\\+", ''))
                       .withColumn('importe_int_corrected_M-2', col('importe_int_M-2')/10000)
                       .withColumnRenamed('ndc_msisdn', 'msisdn')
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_int_corrected_M-2").alias("tu_amount_M-2"),
                               count("importe").alias("tu_num_M-2"))
                        .select(col("msisdn"),
                                col("tu_amount_M-2"),
                                col("tu_num_M-2"))
                       .withColumn("tu_bin_M-2", when(col("tu_num_M-2")>0, 1).otherwise(0))
                      )

df_topups_M_3 = (spark.read.table("raw_es.billingtopsups_rechargescash")
                            .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                            .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                            .withColumn('importe_int_M-3', regexp_replace('importe', "\\+", ''))
                            .withColumn('importe_int_corrected_M-3', col('importe_int_M-3')/10000)
                            .withColumnRenamed('ndc_msisdn', 'msisdn')
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_int_corrected_M-3").alias("tu_amount_M-3"),
                               count("importe").alias("tu_num_M-3"))
                        .select(col("msisdn"),
                                col("tu_amount_M-3"),
                                col("tu_num_M-3"))
                       .withColumn("tu_bin_M-3", when(col("tu_num_M-3")>0, 1).otherwise(0))
                      )

df_topups_M_4 = (spark.read.table("raw_es.billingtopsups_rechargescash")
                        .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                       .withColumn('importe_int_M-4', regexp_replace('importe', "\\+", ''))
                       .withColumn('importe_int_corrected_M-4', col('importe_int_M-4')/10000)
                       .withColumnRenamed('ndc_msisdn', 'msisdn')
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_int_corrected_M-4").alias("tu_amount_M-4"),
                               count("importe").alias("tu_num_M-4"))
                        .select(col("msisdn"),
                                col("tu_amount_M-4"),
                                col("tu_num_M-4"))
                       .withColumn("tu_bin_M-4", when(col("tu_num_M-4")>0, 1).otherwise(0))
                      )

In [217]:
df_topups_pre1 = (df_topups_M_1
                       .join(df_topups_M_2, on = 'msisdn' ,how = 'left'))

df_topups_pre2 = (df_topups_pre1
                            .join(df_topups_M_3, on = 'msisdn' ,how = 'left'))

df_topups = (df_topups_pre2
                       .join(df_topups_M_4, on = 'msisdn' ,how = 'left'))


### Adelantos de saldo.

In [218]:
df_advance_solicitado_M_1 = (spark.read.table("raw_es.prepaid_advancebalance")
                        .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                        .filter(col('day') < 15) # El modelo se suele ejecutar el día 20/21 de cada mes, y el retraso con el que llega la información suele ser de 2 o 3 días.
                        .groupBy("msisdn")
                        .agg(sql_sum("importe_anticipo").alias("abal_amount_M-1"),
                               count("importe_anticipo").alias("abal_num_M-1"))
                        .select(col("msisdn"), 
                                col("abal_amount_M-1"),
                                col("abal_num_M-1"))
                        )

df_advance_solicitado_M_2 = (spark.read.table("raw_es.prepaid_advancebalance")
                        .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_anticipo").alias("abal_amount_M-2"),
                               count("importe_anticipo").alias("abal_num_M-2"))
                        .select(col("msisdn"), 
                                col("abal_amount_M-2"),
                                col("abal_num_M-2"))
                        )

df_advance_solicitado_M_3 = (spark.read.table("raw_es.prepaid_advancebalance")
                        .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                        .groupBy("msisdn")
                        .agg(sql_sum("importe_anticipo").alias("abal_amount_M-3"),
                               count("importe_anticipo").alias("abal_num_M-3"))
                        .select(col("msisdn"), 
                                col("abal_amount_M-3"),
                                col("abal_num_M-3"))
                        )

df_advance_solicitado_M_4 = (spark.read.table("raw_es.prepaid_advancebalance")
                        .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                       .groupBy("msisdn")
                        .agg(sql_sum("importe_anticipo").alias("abal_amount_M-4"),
                               count("importe_anticipo").alias("abal_num_M-4"))
                        .select(col("msisdn"), 
                                col("abal_amount_M-4"),
                                col("abal_num_M-4"))
                        )

In [219]:
df_advance_solicitado_pre1 = (df_advance_solicitado_M_1
                       .join(df_advance_solicitado_M_2, on = 'msisdn' ,how = 'left'))

df_advance_solicitado_pre2 = (df_advance_solicitado_pre1
                            .join(df_advance_solicitado_M_3, on = 'msisdn' ,how = 'left'))

df_advance_solicitado = (df_advance_solicitado_pre2
                       .join(df_advance_solicitado_M_4, on = 'msisdn' ,how = 'left'))

In [220]:
df_advance_recuperado_M_1 = (spark.read.table("raw_es.prepaid_advancebalance")
                       .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                       .filter(col('day') < 15) # El modelo se suele ejecutar el día 20/21 de cada mes, y el retraso con el que llega la información suele ser de 2 o 3 días.
                       .groupBy("msisdn")
                       .agg(sql_sum("imp_recuperado").alias("abal_payment_M-1"),
                            count("imp_recuperado").alias("abal_payment_num_M-1"))
                       .select(col("msisdn"), 
                               col("abal_payment_M-1"),
                               col("abal_payment_num_M-1")))

df_advance_recuperado_M_2 = (spark.read.table("raw_es.prepaid_advancebalance")
                       .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                       .groupBy("msisdn")
                       .agg(sql_sum("imp_recuperado").alias("abal_payment_M-2"),
                            count("imp_recuperado").alias("abal_payment_num_M-2"))
                       .select(col("msisdn"), 
                               col("abal_payment_M-2"),
                               col("abal_payment_num_M-2")))


df_advance_recuperado_M_3 = (spark.read.table("raw_es.prepaid_advancebalance")
                       .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                       .groupBy("msisdn")
                       .agg(sql_sum("imp_recuperado").alias("abal_payment_M-3"),
                            count("imp_recuperado").alias("abal_payment_num_M-3"))
                       .select(col("msisdn"), 
                               col("abal_payment_M-3"),
                               col("abal_payment_num_M-3")))

df_advance_recuperado_M_4 = (spark.read.table("raw_es.prepaid_advancebalance")
                       .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                        .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                       .groupBy("msisdn")
                       .agg(sql_sum("imp_recuperado").alias("abal_payment_M-4"),
                            count("imp_recuperado").alias("abal_payment_num_M-4"))
                       .select(col("msisdn"), 
                               col("abal_payment_M-4"),
                               col("abal_payment_num_M-4")))

In [221]:
df_advance_recuperado_pre1 = (df_advance_recuperado_M_1
                       .join(df_advance_recuperado_M_2, on = 'msisdn',how = 'left'))

df_advance_recuperado_pre2 = (df_advance_recuperado_pre1
                            .join(df_advance_recuperado_M_3, on = 'msisdn',how = 'left'))

df_advance_recuperado = (df_advance_recuperado_pre2
                       .join(df_advance_recuperado_M_4, on = 'msisdn',how = 'left'))


### Balances de saldo.

In [222]:
df_balance_M_1 = (spark.read.table("raw_es.prepaid_clientbalance")
                    .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                    .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                    .filter(col('day') < 15) # El modelo se suele ejecutar el día 20/21 de cada mes, y el retraso con el que llega la información suele ser de 2 o 3 días.
                    .groupBy('msisdn')
                    .agg(F.max('saldo').alias('max_saldo_M-1'),
                        F.min('saldo').alias('min_saldo_M-1'))
                    .withColumn('diff_saldo_M-1', col('max_saldo_M-1') - col ('min_saldo_M-1'))
                 )

df_balance_M_2 = (spark.read.table("raw_es.prepaid_clientbalance")
                    .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                    .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                    .groupBy('msisdn')
                    .agg(F.max('saldo').alias('max_saldo_M-2'),
                        F.min('saldo').alias('min_saldo_M-2'))
                    .withColumn('diff_saldo_M-2', col('max_saldo_M-2') - col ('min_saldo_M-2'))
                 )

df_balance_M_3 = (spark.read.table("raw_es.prepaid_clientbalance")
                    .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                    .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                    .groupBy('msisdn')
                    .agg(F.max('saldo').alias('max_saldo_M-3'),
                        F.min('saldo').alias('min_saldo_M-3'))
                    .withColumn('diff_saldo_M-3', col('max_saldo_M-3') - col ('min_saldo_M-3'))
                 )
                  
df_balance_M_4 = (spark.read.table("raw_es.prepaid_clientbalance")
                    .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                    .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                    .groupBy('msisdn')
                    .agg(F.max('saldo').alias('max_saldo_M-4'),
                        F.min('saldo').alias('min_saldo_M-4'))
                    .withColumn('diff_saldo_M-4', col('max_saldo_M-4') - col ('min_saldo_M-4'))
                 )

In [223]:
df_balance_pre1 = df_balance_M_1.join(df_balance_M_2, on = 'msisdn' , how = 'left')
df_balance_pre2 = df_balance_pre1.join(df_balance_M_3, on = 'msisdn' , how = 'left') 
df_balance = df_balance_pre2.join(df_balance_M_4, on = 'msisdn' , how = 'left').withColumn('diff_saldo_4meses', col('max_saldo_M-4') - col('max_saldo_M-1'))

In [224]:
df_balance = (df_balance
 .withColumn('DIFF_SALDO_M-1', col('max_saldo_M-1') - col('min_saldo_M-1'))
 .withColumn('DIFF_SALDO_M-2', col('max_saldo_M-2') - col('min_saldo_M-2'))
 .withColumn('DIFF_SALDO_M-3', col('max_saldo_M-3') - col('min_saldo_M-3'))
 .withColumn('DELTA_DIFF_SALDO', col('DIFF_SALDO_M-3') - col('DIFF_SALDO_M-1'))
)

## Consumo de voz y SMS.

In [225]:
df_voicesms_M_1 = (spark.read.table("raw_es.prepaid_trafficvoicesms")
                       .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                       .filter(col("day") <= 15)
                 .withColumn("numeroorigen_M-1", trim(col("numeroorigen")))
                 .withColumn("numerodestino_M-1", trim(col("numerodestino")))
                 .withColumn("vozsms_M-1", lower(trim(col("vozsms"))))      
                 .withColumn("voice_amount_M-1", when(lower(col("vozsms")) == 'voz', col("importecobrado")).otherwise(0))
                 .withColumn("sms_amount_M-1", when(lower(col("vozsms")) == 'sms', col("importecobrado")).otherwise(0))
              .groupBy("numeroorigen_M-1")
              .agg(sql_sum("voice_amount_M-1").alias("voice_amount_M-1"),
                   sql_sum("sms_amount_M-1").alias("sms_amount_M-1"),
                   sql_sum("importecobrado").alias("voicesms_amount_M-1"),
                   count(when((lower(col("vozsms_M-1")) == 'voz'), col("vozsms_M-1")).otherwise(None)).alias("voice_num_M-1"),
                   count(when((lower(col("vozsms_M-1")) == 'sms'), col("vozsms_M-1")).otherwise(None)).alias("sms_num_M-1"),
                   sql_sum(when((lower(col("vozsms_M-1")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_duration_M-1"),
                   avg(when((lower(col("vozsms")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_avg_duration_M-1"),
                   countDistinct(when((lower(col("vozsms_M-1")) == 'voz'), col("numerodestino_M-1")).otherwise(None)).alias("voice_num_distinct_rec_M-1"),
                   countDistinct(when((lower(col("vozsms_M-1")) == 'sms'), col("numerodestino_M-1")).otherwise(None)).alias("sms_num_distinct_rec_M-1"))
             .withColumnRenamed("numeroorigen_M-1", "msisdn")  
               )

df_voicesms_M_2 = (spark.read.table("raw_es.prepaid_trafficvoicesms")
                       .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                 .withColumn("numeroorigen_M-2", trim(col("numeroorigen")))
                 .withColumn("numerodestino_M-2", trim(col("numerodestino")))
                 .withColumn("vozsms_M-2", lower(trim(col("vozsms"))))      
                 .withColumn("voice_amount_M-2", when(lower(col("vozsms")) == 'voz', col("importecobrado")).otherwise(0))
                 .withColumn("sms_amount_M-2", when(lower(col("vozsms")) == 'sms', col("importecobrado")).otherwise(0))
              .groupBy("numeroorigen_M-2")
              .agg(sql_sum("voice_amount_M-2").alias("voice_amount_M-2"),
                   sql_sum("sms_amount_M-2").alias("sms_amount_M-2"),
                   sql_sum("importecobrado").alias("voicesms_amount_M-2"),
                   count(when((lower(col("vozsms_M-2")) == 'voz'), col("vozsms_M-2")).otherwise(None)).alias("voice_num_M-2"),
                   count(when((lower(col("vozsms_M-2")) == 'sms'), col("vozsms_M-2")).otherwise(None)).alias("sms_num_M-2"),
                   sql_sum(when((lower(col("vozsms_M-2")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_duration_M-2"),
                   avg(when((lower(col("vozsms_M-2")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_avg_duration_M-2"),
                   countDistinct(when((lower(col("vozsms_M-2")) == 'voz'), col("numerodestino_M-2")).otherwise(None)).alias("voice_num_distinct_rec_M-2"),
                   countDistinct(when((lower(col("vozsms_M-2")) == 'sms'), col("numerodestino_M-2")).otherwise(None)).alias("sms_num_distinct_rec_M-2"))
             .withColumnRenamed("numeroorigen_M-2", "msisdn")  
               )

df_voicesms_M_3 = (spark.read.table("raw_es.prepaid_trafficvoicesms")
                       .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                 .withColumn("numeroorigen_M-3", trim(col("numeroorigen")))
                 .withColumn("numerodestino_M-3", trim(col("numerodestino")))
                 .withColumn("vozsms_M-3", lower(trim(col("vozsms"))))      
                 .withColumn("voice_amount_M-3", when(lower(col("vozsms")) == 'voz', col("importecobrado")).otherwise(0))
                 .withColumn("sms_amount_M-3", when(lower(col("vozsms")) == 'sms', col("importecobrado")).otherwise(0))
              .groupBy("numeroorigen_M-3")
              .agg(sql_sum("voice_amount_M-3").alias("voice_amount_M-3"),
                   sql_sum("sms_amount_M-3").alias("sms_amount_M-3"),
                   sql_sum("importecobrado").alias("voicesms_amount_M-3"),
                   count(when((lower(col("vozsms_M-3")) == 'voz'), col("vozsms")).otherwise(None)).alias("voice_num_M-3"),
                   count(when((lower(col("vozsms_M-3")) == 'sms'), col("vozsms")).otherwise(None)).alias("sms_num_M-3"),
                   sql_sum(when((lower(col("vozsms_M-3")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_duration_M-3"),
                   avg(when((lower(col("vozsms_M-3")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_avg_duration_M-3"),
                   countDistinct(when((lower(col("vozsms_M-3")) == 'voz'), col("numerodestino_M-3")).otherwise(None)).alias("voice_num_distinct_rec_M-3"),
                   countDistinct(when((lower(col("vozsms_M-3")) == 'sms'), col("numerodestino_M-3")).otherwise(None)).alias("sms_num_distinct_rec_M-3"))
             .withColumnRenamed("numeroorigen_M-3", "msisdn")  
               )

df_voicesms_M_4 = (spark.read.table("raw_es.prepaid_trafficvoicesms")
                       .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                       .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                 .withColumn("numeroorigen_M-4", trim(col("numeroorigen")))
                 .withColumn("numerodestino_M-4", trim(col("numerodestino")))
                 .withColumn("vozsms_M-4", lower(trim(col("vozsms"))))      
                 .withColumn("voice_amount_M-4", when(lower(col("vozsms")) == 'voz', col("importecobrado")).otherwise(0))
                 .withColumn("sms_amount_M-4", when(lower(col("vozsms")) == 'sms', col("importecobrado")).otherwise(0))
              .groupBy("numeroorigen_M-4")
              .agg(sql_sum("voice_amount_M-4").alias("voice_amount_M-4"),
                   sql_sum("sms_amount_M-4").alias("sms_amount_M-4"),
                   sql_sum("importecobrado").alias("voicesms_amount_M-4"),
                   count(when((lower(col("vozsms_M-4")) == 'voz'), col("vozsms_M-4")).otherwise(None)).alias("voice_num_M-4"),
                   count(when((lower(col("vozsms_M-4")) == 'sms'), col("vozsms_M-4")).otherwise(None)).alias("sms_num_M-4"),
                   sql_sum(when((lower(col("vozsms_M-4")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_duration_M-4"),
                   avg(when((lower(col("vozsms_M-4")) == 'voz'), col("airduration")).otherwise(None)).alias("voice_avg_duration_M-4"),
                   countDistinct(when((lower(col("vozsms_M-4")) == 'voz'), col("numerodestino_M-4")).otherwise(None)).alias("voice_num_distinct_rec_M-4"),
                   countDistinct(when((lower(col("vozsms_M-4")) == 'sms'), col("numerodestino_M-4")).otherwise(None)).alias("sms_num_distinct_rec_M-4"))
             .withColumnRenamed("numeroorigen_M-4", "msisdn")  
               )


In [226]:
df_voicesms_pre1 = (df_voicesms_M_1
                       .join(df_voicesms_M_2, on = 'msisdn' ,how = 'left'))

df_voicesms_pre2 = (df_voicesms_pre1
                            .join(df_voicesms_M_3, on = 'msisdn' ,how = 'left'))

df_voicesms = (df_voicesms_pre2
                       .join(df_voicesms_M_4, on = 'msisdn' ,how = 'left'))

## Información de recargas.

In [227]:
recargas_cols = spark.read.table('raw_es.vf_pre_recargas').columns

In [228]:
recargas_cols.remove('partitioned_month')
recargas_cols.remove('year')
recargas_cols.remove('month')
recargas_cols.remove('day')

- Mes M-2:

In [229]:
df_recargas_M_2 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPREC_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_2[0:6]+"/year="+MONTH_BEFORE_M_2[0:4]+"/month="+str(int(MONTH_BEFORE_M_2[4:6]))+"/day=0"))

In [230]:
i = 0

cols_previas = df_recargas_M_2.columns

for c in recargas_cols:
    df_recargas_M_2 = df_recargas_M_2.withColumnRenamed(cols_previas[i], c)
    i += 1

In [231]:
df_recargas_M_2 = (df_recargas_M_2
                   .withColumn('cdmetodo_M-2', col('cdmetodo').cast(DoubleType()))
                   .withColumn('imporcaj_M-2', col('imporcaj').cast(DoubleType()))
                   .withColumn('importar_M-2', col('importar').cast(DoubleType()))
                   .withColumn('imporotr_M-2', col('imporotr').cast(DoubleType()))
                   .withColumn('acreccaj_M-2', col('acreccaj').cast(DoubleType()))
                   .withColumn('acrectar_M-2', col('acrectar').cast(DoubleType()))
                   .withColumn('acrecotr_M-2', col('acrecotr').cast(DoubleType()))
                  )

In [232]:
df_recargas_M_2 = (df_recargas_M_2
                   .groupBy('nif')
                   .agg(sum(col('importar_M-2')).alias('sum-importar_M-2'),
                        sum(col('imporcaj_M-2')).alias('sum-imporcaj_M-2'),
                        sum(col('imporotr_M-2')).alias('sum-imporotr_M-2'),
                        sum(col('acreccaj_M-2')).alias('sum-acreccaj_M-2'),
                        sum(col('acrectar_M-2')).alias('sum-acrectar_M-2'),
                        sum(col('acrecotr_M-2')).alias('sum-acrecotr_M-2'),
                        F.avg(col('cdmetodo_M-2')).alias('sum-cdmetodo_M-2'),
                       )
                  )

- Mes M-3:

In [233]:
df_recargas_M_3 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPREC_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_3[0:6]+"/year="+MONTH_BEFORE_M_3[0:4]+"/month="+str(int(MONTH_BEFORE_M_3[4:6]))+"/day=0"))

In [234]:
i = 0

cols_previas = df_recargas_M_3.columns

for c in recargas_cols:
    df_recargas_M_3 = df_recargas_M_3.withColumnRenamed(cols_previas[i], c)
    i += 1

In [235]:
df_recargas_M_3 = (df_recargas_M_3
                   .withColumn('cdmetodo_M-3', col('cdmetodo').cast(DoubleType()))
                   .withColumn('imporcaj_M-3', col('imporcaj').cast(DoubleType()))
                   .withColumn('importar_M-3', col('importar').cast(DoubleType()))
                   .withColumn('imporotr_M-3', col('imporotr').cast(DoubleType()))
                   .withColumn('acreccaj_M-3', col('acreccaj').cast(DoubleType()))
                   .withColumn('acrectar_M-3', col('acrectar').cast(DoubleType()))
                   .withColumn('acrecotr_M-3', col('acrecotr').cast(DoubleType()))
                  )

In [236]:
df_recargas_M_3 = (df_recargas_M_3
                   .groupBy('nif')
                   .agg(sum(col('importar_M-3')).alias('sum-importar_M-3'),
                        sum(col('imporcaj_M-3')).alias('sum-imporcaj_M-3'),
                        sum(col('imporotr_M-3')).alias('sum-imporotr_M-3'),
                        sum(col('acreccaj_M-3')).alias('sum-acreccaj_M-3'),
                        sum(col('acrectar_M-3')).alias('sum-acrectar_M-3'),
                        sum(col('acrecotr_M-3')).alias('sum-acrecotr_M-3'),
                        F.avg(col('cdmetodo_M-3')).alias('sum-cdmetodo_M-3'),
                       )
                  )

- Mes M-4:

In [237]:
df_recargas_M_4 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPREC_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_4[0:6]+"/year="+MONTH_BEFORE_M_4[0:4]+"/month="+str(int(MONTH_BEFORE_M_4[4:6]))+"/day=0"))

In [238]:
i = 0

cols_previas = df_recargas_M_4.columns

for c in recargas_cols:
    df_recargas_M_4 = df_recargas_M_4.withColumnRenamed(cols_previas[i], c)
    i += 1

In [239]:
df_recargas_M_4 = (df_recargas_M_4
                   .withColumn('cdmetodo_M-4', col('cdmetodo').cast(DoubleType()))
                   .withColumn('imporcaj_M-4', col('imporcaj').cast(DoubleType()))
                   .withColumn('importar_M-4', col('importar').cast(DoubleType()))
                   .withColumn('imporotr_M-4', col('imporotr').cast(DoubleType()))
                   .withColumn('acreccaj_M-4', col('acreccaj').cast(DoubleType()))
                   .withColumn('acrectar_M-4', col('acrectar').cast(DoubleType()))
                   .withColumn('acrecotr_M-4', col('acrecotr').cast(DoubleType()))
                  )

In [240]:
df_recargas_M_4 = (df_recargas_M_4
                   .groupBy('nif')
                   .agg(sum(col('importar_M-4')).alias('sum-importar_M-4'),
                        sum(col('imporcaj_M-4')).alias('sum-imporcaj_M-4'),
                        sum(col('imporotr_M-4')).alias('sum-imporotr_M-4'),
                        sum(col('acreccaj_M-4')).alias('sum-acreccaj_M-4'),
                        sum(col('acrectar_M-4')).alias('sum-acrectar_M-4'),
                        sum(col('acrecotr_M-4')).alias('sum-acrecotr_M-4'),
                        F.avg(col('cdmetodo_M-4')).alias('sum-cdmetodo_M-4'),
                       )
                  )

- Mes M-5:

In [241]:
df_recargas_M_5 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPREC_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_5[0:6]+"/year="+MONTH_BEFORE_M_5[0:4]+"/month="+str(int(MONTH_BEFORE_M_5[4:6]))+"/day=0"))

In [242]:
i = 0

cols_previas = df_recargas_M_5.columns

for c in recargas_cols:
    df_recargas_M_5 = df_recargas_M_5.withColumnRenamed(cols_previas[i], c)
    i += 1

In [243]:
df_recargas_M_5 = (df_recargas_M_5
                   .withColumn('cdmetodo_M-5', col('cdmetodo').cast(DoubleType()))
                   .withColumn('imporcaj_M-5', col('imporcaj').cast(DoubleType()))
                   .withColumn('importar_M-5', col('importar').cast(DoubleType()))
                   .withColumn('imporotr_M-5', col('imporotr').cast(DoubleType()))
                   .withColumn('acreccaj_M-5', col('acreccaj').cast(DoubleType()))
                   .withColumn('acrectar_M-5', col('acrectar').cast(DoubleType()))
                   .withColumn('acrecotr_M-5', col('acrecotr').cast(DoubleType()))
                  )

In [244]:
df_recargas_M_5 = (df_recargas_M_5
                   .groupBy('nif')
                   .agg(sum(col('importar_M-5')).alias('sum-importar_M-5'),
                        sum(col('imporcaj_M-5')).alias('sum-imporcaj_M-5'),
                        sum(col('imporotr_M-5')).alias('sum-imporotr_M-5'),
                        sum(col('acreccaj_M-5')).alias('sum-acreccaj_M-5'),
                        sum(col('acrectar_M-5')).alias('sum-acrectar_M-5'),
                        sum(col('acrecotr_M-5')).alias('sum-acrecotr_M-5'),
                        F.avg(col('cdmetodo_M-5')).alias('sum-cdmetodo_M-5'),
                       )
                  )

- Join de todas la información de recargas:

In [245]:
df_recargas_pre1 = df_recargas_M_2.join(df_recargas_M_3, on = 'nif', how = 'leftouter')

In [246]:
df_recargas_pre2 = df_recargas_pre1.join(df_recargas_M_4, on = 'nif', how = 'leftouter')

In [247]:
df_recargas_final = df_recargas_pre2.join(df_recargas_M_5, on = 'nif', how = 'leftouter')

## Información del 'Monthly Fact'.

- Mes M-2:  para hacer el tablón de entrenamiento de las predicciones de noviembre, no había datos del mes M-2 (julio), entonces he puesto M-3

In [248]:
df_monthly_fac_M_2 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPAMONTHFACT_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_2[0:6]+"/year="+MONTH_BEFORE_M_2[0:4]+"/month="+str(int(MONTH_BEFORE_M_2[4:6]))+"/day=0"))

In [249]:
previos = spark.read.table('raw_es.vf_pre_prepaid_monthly_fact').columns

In [250]:
previos.remove('msisdn')
previos.remove('partitioned_month')
previos.remove('year')
previos.remove('month')
previos.remove('day')

In [251]:
i = 1

df_monthly_fac_M_2 = df_monthly_fac_M_2.withColumnRenamed('_c0', 'msisdn')

mon_cols = df_monthly_fac_M_2.columns

for c in previos:
    df_monthly_fac_M_2 = df_monthly_fac_M_2.withColumnRenamed(mon_cols[i], c+'_M-2')
    i += 1

In [252]:
df_monthly_fac_M_2 = df_monthly_fac_M_2.drop(*['fx_ciclo', 'mes', 'partitioned_month', 'year', 'month', 'day'])

- Mes M-3:

In [254]:
df_monthly_fac_M_3 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPAMONTHFACT_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_4[0:6]+"/year="+MONTH_BEFORE_M_4[0:4]+"/month="+str(int(MONTH_BEFORE_M_4[4:6]))+"/day=0"))

In [255]:
i = 1

df_monthly_fac_M_3 = df_monthly_fac_M_3.withColumnRenamed('_c0', 'msisdn')

mon_cols = df_monthly_fac_M_3.columns

for c in previos:
    df_monthly_fac_M_3 = df_monthly_fac_M_3.withColumnRenamed(mon_cols[i], c+'_M-3')
    i += 1

In [256]:
df_monthly_fac_M_3 = df_monthly_fac_M_3.drop(*['fx_ciclo', 'mes', 'partitioned_month', 'year', 'month', 'day'])

- Mes M-4:

In [257]:
df_monthly_fac_M_4 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPAMONTHFACT_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_4[0:6]+"/year="+MONTH_BEFORE_M_4[0:4]+"/month="+str(int(MONTH_BEFORE_M_4[4:6]))+"/day=0"))

In [258]:
i = 1

df_monthly_fac_M_4 = df_monthly_fac_M_4.withColumnRenamed('_c0', 'msisdn')

mon_cols = df_monthly_fac_M_4.columns

for c in previos:
    df_monthly_fac_M_4 = df_monthly_fac_M_4.withColumnRenamed(mon_cols[i], c+'_M-4')
    i += 1

In [259]:
df_monthly_fac_M_4 = df_monthly_fac_M_4.drop(*['fx_ciclo', 'mes', 'partitioned_month', 'year', 'month', 'day'])

- Mes M-5:

In [260]:
df_monthly_fac_M_5 = (spark.read.option("delimiter", "\t").option("header", False).csv("/data/raw/vf_es/cvm/ES_CVM_PREPAMONTHFACT_DATALAB_M/1.1/csv/partitioned_month="
                                                                                  +MONTH_BEFORE_M_5[0:6]+"/year="+MONTH_BEFORE_M_5[0:4]+"/month="+str(int(MONTH_BEFORE_M_5[4:6]))+"/day=0"))

In [261]:
i = 1

df_monthly_fac_M_5 = df_monthly_fac_M_5.withColumnRenamed('_c0', 'msisdn')

mon_cols = df_monthly_fac_M_5.columns

for c in previos:
    df_monthly_fac_M_5 = df_monthly_fac_M_5.withColumnRenamed(mon_cols[i], c+'_M-5')
    i += 1

In [262]:
df_monthly_fac_M_5 = df_monthly_fac_M_5.drop(*['fx_ciclo', 'mes', 'partitioned_month', 'year', 'month', 'day'])

- Join de tablas del *`Monthly Fac`*:

In [263]:
df_monthlyfact_pre0 = df_monthly_fac_M_2.join(df_monthly_fac_M_3, on = 'msisdn', how = 'leftouter')

In [264]:
df_monthlyfact_pre1 = df_monthlyfact_pre0.join(df_monthly_fac_M_4, on = 'msisdn', how = 'leftouter')

In [265]:
df_monthlyfact_final = df_monthlyfact_pre1.join(df_monthly_fac_M_5, on = 'msisdn', how = 'leftouter')

In [266]:
monthly_fac_cols = df_monthlyfact_final.columns

monthly_fac_cols.remove('msisdn')

for c in monthly_fac_cols:
    df_monthlyfact_final = df_monthlyfact_final.withColumn(c, col(c).cast(DoubleType()))

## Consumo de datos.

In [267]:
df_data_consumed_M_1 = (spark.read.table("raw_es.prepaid_trafficdata")
                 .filter(col("year") == (int(MONTH_BEFORE_M_1[:4])))
                 .filter(col("month") == (int(MONTH_BEFORE_M_1[4:6])))
                 .filter(col("month") <= 15)
                 .withColumn("data_mb", col("volumen")/(1024*1024))
                 .groupBy("msisdn")
                 .agg(sql_sum("data_mb").alias("data_mb_M-1"),
                      sql_sum("CARGOREAL").alias("data_amount_M-1"),
                      count("data_mb").alias("num_conexions_M-1"))
                 .select(*["data_mb_M-1", "msisdn", "data_amount_M-1", "num_conexions_M-1"])
                       )

df_data_consumed_M_2 = (spark.read.table("raw_es.prepaid_trafficdata")
                 .filter(col("year") == (int(MONTH_BEFORE_M_2[:4])))
                 .filter(col("month") == (int(MONTH_BEFORE_M_2[4:6])))
                 .withColumn("data_mb", col("volumen")/(1024*1024))
                 .groupBy("msisdn")
                 .agg(sql_sum("data_mb").alias("data_mb_M-2"),
                      sql_sum("CARGOREAL").alias("data_amount_M-2"),
                      count("data_mb").alias("num_conexions_M-2"))
                 .select(*["data_mb_M-2", "msisdn", "data_amount_M-2", "num_conexions_M-2"])
                       )

df_data_consumed_M_3 = (spark.read.table("raw_es.prepaid_trafficdata")
                 .filter(col("year") == (int(MONTH_BEFORE_M_3[:4])))
                 .filter(col("month") == (int(MONTH_BEFORE_M_3[4:6])))
                 .withColumn("data_mb", col("volumen")/(1024*1024))
                 .groupBy("msisdn")
                 .agg(sql_sum("data_mb").alias("data_mb_M-3"),
                      sql_sum("CARGOREAL").alias("data_amount_M-3"),
                      count("data_mb").alias("num_conexions_M-3"))
                 .select(*["data_mb_M-3", "msisdn", "data_amount_M-3", "num_conexions_M-3"])
                       )

df_data_consumed_M_4 = (spark.read.table("raw_es.prepaid_trafficdata")
                 .filter(col("year") == (int(MONTH_BEFORE_M_4[:4])))
                 .filter(col("month") == (int(MONTH_BEFORE_M_4[4:6])))
                 .withColumn("data_mb", col("volumen")/(1024*1024))
                 .groupBy("msisdn")
                 .agg(sql_sum("data_mb").alias("data_mb_M-4"),
                      sql_sum("CARGOREAL").alias("data_amount_M-4"),
                      count("data_mb").alias("num_conexions_M-4"))
                 .select(*["data_mb_M-4", "msisdn", "data_amount_M-4", "num_conexions_M-4"])
                       )

In [268]:
df_data_consumed_pre1 = (df_data_consumed_M_1
                       .join(df_data_consumed_M_2, on = 'msisdn' ,how = 'left'))

df_data_consumed_pre2 = (df_data_consumed_pre1
                            .join(df_data_consumed_M_3, on = 'msisdn' ,how = 'left'))

df_data_consumed = (df_data_consumed_pre2
                       .join(df_data_consumed_M_4, on = 'msisdn' ,how = 'left'))

## Información de tarificación.

 - ### Ahora, trabajamos con los resultados agregados por mes. 

- Lectura de columnas:

In [269]:
tarificador_meses_previos = spark.read.table("raw_es.vf_pre_info_tarif")

In [270]:
tarificador_meses_previos = tarificador_meses_previos.drop(*['partitioned_month', 'year', 'month', 'day'])

- M-2:

In [271]:
tarificador_month_M_2 = (sqlContext.read.format('csv').options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPTAR_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_2[0:6]+'/year='+MONTH_BEFORE_M_2[0:4]+'/month='+str(int(MONTH_BEFORE_M_2[4:6]))))

In [272]:
tarificador_month_M_2 = tarificador_month_M_2.drop(*['day'])

In [273]:
i = 1

tarificador_month_M_2 = tarificador_month_M_2.withColumnRenamed('_c0', 'msisdn')

while i < len(tarificador_month_M_2.columns):
    tarificador_month_M_2 = tarificador_month_M_2.withColumnRenamed(tarificador_month_M_2.columns[i], tarificador_meses_previos.columns[i].upper()+'_previo_M_2')
    i += 1

- M-3:

In [274]:
tarificador_month_M_3= (sqlContext.read.format('csv').options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPTAR_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_3[0:6]+'/year='+MONTH_BEFORE_M_3[0:4]+'/month='+str(int(MONTH_BEFORE_M_3[4:6]))))

In [275]:
tarificador_month_M_3 = tarificador_month_M_3.drop(*['day'])

In [276]:
i = 1

tarificador_month_M_3 = tarificador_month_M_3.withColumnRenamed('_c0', 'msisdn')

while i < len(tarificador_month_M_3.columns):
    tarificador_month_M_3 = tarificador_month_M_3.withColumnRenamed(tarificador_month_M_3.columns[i], tarificador_meses_previos.columns[i].upper()+'_previo_M_3')
    i += 1

- M-4:

In [277]:
tarificador_month_M_4 = (sqlContext.read.format('csv').options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPTAR_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_4[0:6]+'/year='+MONTH_BEFORE_M_4[0:4]+'/month='+str(int(MONTH_BEFORE_M_4[4:6]))))

In [278]:
tarificador_month_M_4 = tarificador_month_M_4.drop(*['day'])

In [279]:
i = 1

tarificador_month_M_4 = tarificador_month_M_4.withColumnRenamed('_c0', 'msisdn')

while i < len(tarificador_month_M_4.columns):
    tarificador_month_M_4 = tarificador_month_M_4.withColumnRenamed(tarificador_month_M_4.columns[i], tarificador_meses_previos.columns[i].upper()+'_previo_M_4')
    i += 1

- M-5:

In [281]:
tarificador_month_M_5 = (sqlContext.read.format('csv').options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_PREPTAR_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_5[0:6]+'/year='+MONTH_BEFORE_M_5[0:4]+'/month='+str(int(MONTH_BEFORE_M_5[4:6]))))

In [282]:
tarificador_month_M_5 = tarificador_month_M_5.drop(*['day'])

In [283]:
i = 1

tarificador_month_M_5 = tarificador_month_M_5.withColumnRenamed('_c0', 'msisdn')

while i < len(tarificador_month_M_5.columns):
    tarificador_month_M_5 = tarificador_month_M_5.withColumnRenamed(tarificador_month_M_5.columns[i], tarificador_meses_previos.columns[i].upper()+'_previo_M_5')
    i += 1

    Join de toda la información de tarificación:

In [284]:
tarifas_pre = tarificador_month_M_2.join(tarificador_month_M_3, on = 'msisdn', how = 'inner')
tarifas_pre1 = tarifas_pre.join(tarificador_month_M_4, on = 'msisdn', how = 'inner')
df_tarifas = tarifas_pre1.join(tarificador_month_M_5, on = 'msisdn', how = 'inner')

In [285]:
df_tarifas.count()

1829856

## Información de la cartera `Yu`.

In [286]:
yu_meses_previos = spark.read.table("raw_es.vf_pre_cartera_yu_ba")

In [287]:
yu_meses_previos = yu_meses_previos.drop(*['partitioned_month', 'year', 'month', 'day']).withColumnRenamed('telefono', 'msisdn').columns

- Mes M-2:

In [288]:
df_yu_M_2 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_EXTRYUBAPORT_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_2[0:6]+'/year='+MONTH_BEFORE_M_2[0:4]+'/month='+str(int(MONTH_BEFORE_M_2[4:6])))
                   )

In [289]:
df_yu_M_2 = df_yu_M_2.drop(*['day'])

In [290]:
i = 1

df_yu_M_2 = df_yu_M_2.withColumnRenamed('_c0', 'msisdn')

while i < len(yu_meses_previos):
    df_yu_M_2 = df_yu_M_2.withColumnRenamed(df_yu_M_2.columns[i], yu_meses_previos[i].upper()+'_previo_M_2')
    i += 1

- Mes M-3:

In [291]:
df_yu_M_3 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_EXTRYUBAPORT_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_3[0:6]+'/year='+MONTH_BEFORE_M_3[0:4]+'/month='+str(int(MONTH_BEFORE_M_3[4:6])))
                   )

In [292]:
df_yu_M_3 = df_yu_M_3.drop(*['day'])

In [293]:
i = 1

df_yu_M_3 = df_yu_M_3.withColumnRenamed('_c0', 'msisdn')

while i < len(yu_meses_previos):
    df_yu_M_3 = df_yu_M_3.withColumnRenamed(df_yu_M_3.columns[i], yu_meses_previos[i].upper()+'_previo_M_3')
    i += 1

- Mes M-4:

In [294]:
df_yu_M_4 = (sqlContext.read.format('csv')
                    .options(header='false', inferSchema='true', delimiter = '\t')
                    .load('/data/raw/vf_es/cvm/ES_CVM_EXTRYUBAPORT_DATALAB_M/1.1/csv/partitioned_month='+MONTH_BEFORE_M_4[0:6]+'/year='+MONTH_BEFORE_M_4[0:4]+'/month='+str(int(MONTH_BEFORE_M_4[4:6])))
                   )

In [295]:
df_yu_M_4 = df_yu_M_4.drop(*['day'])

In [296]:
i = 1

df_yu_M_4 = df_yu_M_4.withColumnRenamed('_c0', 'msisdn')

while i < len(yu_meses_previos):
    df_yu_M_4 = df_yu_M_4.withColumnRenamed(df_yu_M_4.columns[i], yu_meses_previos[i].upper()+'_previo_M_4')
    i += 1

- Join de información sobre Comunidad Yu:

In [297]:
df_yu_previo1 = df_yu_M_2.join(df_yu_M_3, on = 'msisdn', how = 'leftouter')
df_yu_final = df_yu_previo1.join(df_yu_M_4, on = 'msisdn', how = 'leftouter')

- CUR:

In [298]:
cur_vega = spark.read.parquet('/data/raw/vf_es/billingtopsups/CUR_VEGA/1.0/parquet/year='+MONTH_BEFORE_M_2[:4]+'/month='+str(int(MONTH_BEFORE_M_2[4:6])))

In [299]:
test_cur_vega = cur_vega.withColumn('msisdn', expr("substring(MSISDN, 3, length(MSISDN)-1)"))

In [300]:
current_timestamp = datetime.datetime.now()

In [301]:
from pyspark.sql.functions import datediff, to_date, to_timestamp

cur_vega_pre = (test_cur_vega
            .withColumn('activacion', when(col('razon_concesion') == 'ACTIVACION', 1).otherwise(0))
            .withColumn('renovacion', when(col('razon_concesion') == 'RENOVACION', 1).otherwise(0))
            .withColumn('suscripcion', when(col('razon_concesion') == 'SUSCRIPCION', 1).otherwise(0))
            .withColumn('fx_concesion', to_timestamp("FECHA_CONCESION", "%Y-%m-%d %H:%M:%S"))
            .withColumn('fx_fin_validez', to_timestamp("FECHA_FIN_VALIDEZ", "%Y-%m-%d %H:%M:%S"))
            .withColumn('days_fx_concesion_2_fin_validez', datediff('fx_fin_validez','fx_concesion'))
           )

In [302]:
cur_vega = (cur_vega_pre
            .groupBy('msisdn')
            .agg(F.sum(col('activacion')).alias('num_activaciones'),
                 F.sum(col('renovacion')).alias('num_renovacion'),
                 F.sum(col('suscripcion')).alias('num_suscripcion'),
                 F.max(col('tipo_beneficio')).alias('tipo_beneficio'),
                 F.sum(col('days_fx_concesion_2_fin_validez')).alias('days_fx_concesion_2_fin_validez')
                )
           )

In [303]:
cur_dump = spark.read.parquet('/data/raw/vf_es/billingtopsups/CUR_DUMP/1.0/parquet/year='+MONTH_BEFORE_M_2[:4]+'/month='+str(int(MONTH_BEFORE_M_2[4:6])))

In [304]:
test_cur_dump = (cur_dump
                 .withColumn('msisdn', expr("substring(vfsid, 3, length(vfsid)-1)"))
                 .withColumn('fx_1stcall', to_timestamp("vf1stcalldate", "%Y-%m-%d %H:%M:%S"))
                 .withColumn('fx_1stactivation', to_timestamp("vf1stactdate", "%Y-%m-%d %H:%M:%S"))
                 .withColumn('fx_eslapsed', to_timestamp("vfeslapseddate", "%Y-%m-%d %H:%M:%S"))
                 .withColumn('fx_expiration', to_timestamp("vfesexpirationdate", "%Y-%m-%d %H:%M:%S"))
                 .withColumn('days_since_activation2call', datediff('fx_1stactivation','fx_1stcall'))
                )

In [305]:
cur_dump = (test_cur_dump
            .groupBy('msisdn')
            .agg(F.max(col('vfespreactreason')).alias('pre_activation_reason'),
                 F.sum(col('vfimeisv')).alias('imei_sv'),
                 F.max(col('vfplan')).alias('tarifa'),
                 F.sum(col('days_since_activation2call')).alias('days_since_activation2call'),

                )
           )

# Unimos tablas.

In [306]:
df_final_pre0 = df_prepago_labeled.join(cur_dump, on = 'msisdn', how = 'leftouter')

In [307]:
df_final_pre1 = df_final_pre0.join(cur_vega, on = 'msisdn', how = 'leftouter')

In [308]:
df_final_pre2 = df_final_pre1.join(df_tarifas, on = 'msisdn', how = "leftouter")

In [309]:
df_final_pre3 = df_final_pre2.join(df_yu_final, on = 'msisdn', how = "leftouter")

In [310]:
df_final_pre4 = df_final_pre3.join(df_monthlyfact_final, on = 'msisdn', how = 'leftouter')

In [311]:
df_final_pre5 = df_final_pre4.join(df_recargas_final, on = df_final_pre4['NIF_CLIENTE']==df_recargas_final['nif'], how= 'leftouter')

In [312]:
df_final_pre6 = df_final_pre5.join(df_balance_receptor, on='msisdn', how="leftouter")

In [313]:
df_final_pre7 = df_final_pre6.join(df_balance_emisor, on ='msisdn', how="leftouter")

In [314]:
df_final_pre8 = (df_final_pre7
    .withColumn("tbal_rec_amount", 1.21*col("importe_traspasado_receptor_M-1"))
    .withColumnRenamed("num_rec", "tbal_rec_num")
    .withColumn("tbal_tra_amount", 1.21*col("importe_traspasado_emisor_M-1") + 1.21*col("importe_cargo_emisor_M-1"))
    .withColumnRenamed("num_em", "tbal_tra_num"))

In [315]:
df_final_pre9 = df_final_pre8.join(df_topups, on = 'msisdn', how="left")

In [316]:
df_final_pre10 = df_final_pre9.join(df_voicesms, on = 'msisdn', how="left")

In [317]:
df_final_pre11 = df_final_pre10.join(df_data_consumed, on = 'msisdn', how="left")

In [318]:
df_final_pre12 = df_final_pre11.join(df_advance_solicitado, on = 'msisdn', how="left")

In [319]:
df_final_pre13 = df_final_pre12.join(df_balance, on ='msisdn', how = 'left')

In [320]:
df_final_pre14 = df_final_pre13.join(df_advance_recuperado, on = 'msisdn', how="left")

In [321]:
df_final = df_final_pre14.drop(*["MSISDN_tarif", "MSISDN_tarif_M_1", "MSISDN_tarif_M_2", "MSISDN_tarif_M_3", "MSISDN_tarif_M_4", "MSISDN_tarif_M_5", "NIF_FACTURACION_M-2",
                                     "msisdn_emisor","msisdn_receptor", 'MSISDN_tarif', "NIF_FACTURACION_M-1", 'Num_services_M-2', "NIF_FACTURACION_M-3", "Fx_ejec_M", "Num_services_M-1",
                                     "count(month)", "nationality", 'msisdn_pre', 'MSISDN_previo_M_2', "year_rec", "month_rec", "day_rec", "msisdn_rec", "MSISDN_balances",
                                     "year_advance", "month_advance", "day_advance", "msisdn_advance","year_data", "month_data", "day_data", "msisdn_data","year_data", "month_data", "day_data", "msisdn_data",
                                     "year_topups", "month_topups", "day_topups", "ndc_msisdn", 
                                    ])

# Añadimos ahora información de Netscout, CCC y otras fuentes de datos.

In [322]:
# Esta fecha ha de ser cambiada en función de la entrega a realizar.

ClosingDay = MONTH_BEFORE_M_2

ClosingDay_date = datetime.date(int(ClosingDay[:4]), int(ClosingDay[4:6]), int(ClosingDay[6:8]))

hdfs_partition_path = 'year=' + str(int(ClosingDay[:4])) + '/month=' + str(int(ClosingDay[4:6])) + '/day=' + str(int(ClosingDay[6:8]))

hdfs_write_path_common='/data/udf/vf_es/amdocs_ids/'

path_customer = hdfs_write_path_common +'customer/'+hdfs_partition_path
path_service = hdfs_write_path_common +'service/'+hdfs_partition_path
path_netscout_apps = hdfs_write_path_common +'netscout_apps/'+hdfs_partition_path
path_customer_agg = hdfs_write_path_common +'customer_agg/'+hdfs_partition_path

netscout_apps_load = spark.read.load(path_netscout_apps)
custAggServices=(spark.read.load(path_customer_agg))

## Información de CCC: M-2.

In [323]:
ClosingDay_M_2 = MONTH_BEFORE_M_2

hdfs_partition_path_M_2 = 'year=' + str(int(ClosingDay_M_2[:4])) + '/month=' + str(int(ClosingDay_M_2[4:6])) + '/day=' + str(int(ClosingDay_M_2[6:8]))
path_calls_to_competitor = '/data/attributes/vf_es/return_feed/call_to_competitor/'+hdfs_partition_path_M_2
path_ccc = hdfs_write_path_common +'call_centre_calls/' + hdfs_partition_path_M_2

competitor_calls_M_2 = (spark.read.load(path_calls_to_competitor))
df_ccc_load_M_2 = (spark.read.load(path_ccc))

In [324]:
df_ccc_load_M_2 = (df_ccc_load_M_2
                   .select(['Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process',
                            'Bucket_Sub_Bucket_Prepaid_balance_Top_up_process',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Negotiation',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Referrals',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Transfers',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Network',
                            'Raw_Productos_Voz','Raw_Resultado_Bajas',
                            'Raw_Provision_Resto','Raw_Provision_Movil',
                            'Bucket_Churn_Cancellations','Bucket_Prepaid_balance',
                            'Raw_Baja','Raw_Cierre', 'msisdn'
                           ])
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues', 'Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process', 'Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Prepaid_balance_Top_up_process', 'Bucket_Sub_Bucket_Prepaid_balance_Top_up_process-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Negotiation', 'Bucket_Sub_Bucket_Churn_Cancellations_Negotiation-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Referrals', 'Bucket_Sub_Bucket_Churn_Cancellations_Referrals-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Transfers', 'Bucket_Sub_Bucket_Churn_Cancellations_Transfers-M-2')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Network', 'Bucket_Sub_Bucket_Churn_Cancellations_Network-M-2')
                   .withColumnRenamed('Raw_Productos_Voz', 'Raw_Productos_Voz-M-2')
                   .withColumnRenamed('Raw_Resultado_Bajas', 'Raw_Resultado_Bajas-M-2')
                   .withColumnRenamed('Raw_Provision_Resto', 'Raw_Provision_Resto-M-2')
                   .withColumnRenamed('Raw_Provision_Movil', 'Raw_Provision_Movil-M-2')
                   .withColumnRenamed('Bucket_Churn_Cancellations', 'Bucket_Churn_Cancellations-M-2')
                   .withColumnRenamed('Bucket_Prepaid_balance', 'Bucket_Prepaid_balance-M-2')
                   .withColumnRenamed('Raw_Baja', 'Raw_Baja-M-2')
                   .withColumnRenamed('Raw_Cierre', 'Raw_Cierre-M-2')

                  )

## Información de CCC: M-3.

In [325]:
ClosingDay_M_3 = MONTH_BEFORE_M_3

hdfs_partition_path_M_3 = 'year=' + str(int(ClosingDay_M_3[:4])) + '/month=' + str(int(ClosingDay_M_3[4:6])) + '/day=' + str(int(ClosingDay_M_3[6:8]))
path_calls_to_competitor = '/data/attributes/vf_es/return_feed/call_to_competitor/'+hdfs_partition_path_M_3
path_ccc = hdfs_write_path_common +'call_centre_calls/' + hdfs_partition_path_M_3

competitor_calls_M_3 = (spark.read.load(path_calls_to_competitor))
df_ccc_load_M_3 = (spark.read.load(path_ccc))

In [326]:
df_ccc_load_M_3 = (df_ccc_load_M_3
                   .select(['Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process',
                            'Bucket_Sub_Bucket_Prepaid_balance_Top_up_process',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Negotiation',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Referrals',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Transfers',
                            'Bucket_Sub_Bucket_Churn_Cancellations_Network',
                            'Raw_Productos_Voz','Raw_Resultado_Bajas',
                            'Raw_Provision_Resto','Raw_Provision_Movil',
                            'Bucket_Churn_Cancellations','Bucket_Prepaid_balance',
                            'Raw_Baja','Raw_Cierre', 'msisdn'
                           ])
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues', 'Bucket_Sub_Bucket_Churn_Cancellations_Other_churn_issues-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process', 'Bucket_Sub_Bucket_Churn_Cancellations_Churn_cancellations_process-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Prepaid_balance_Top_up_process', 'Bucket_Sub_Bucket_Prepaid_balance_Top_up_process-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Negotiation', 'Bucket_Sub_Bucket_Churn_Cancellations_Negotiation-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Referrals', 'Bucket_Sub_Bucket_Churn_Cancellations_Referrals-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Transfers', 'Bucket_Sub_Bucket_Churn_Cancellations_Transfers-M-3')
                   .withColumnRenamed('Bucket_Sub_Bucket_Churn_Cancellations_Network', 'Bucket_Sub_Bucket_Churn_Cancellations_Network-M-3')
                   .withColumnRenamed('Raw_Productos_Voz', 'Raw_Productos_Voz-M-3')
                   .withColumnRenamed('Raw_Resultado_Bajas', 'Raw_Resultado_Bajas-M-3')
                   .withColumnRenamed('Raw_Provision_Resto', 'Raw_Provision_Resto-M-3')
                   .withColumnRenamed('Raw_Provision_Movil', 'Raw_Provision_Movil-M-3')
                   .withColumnRenamed('Bucket_Churn_Cancellations', 'Bucket_Churn_Cancellations-M-3')
                   .withColumnRenamed('Bucket_Prepaid_balance', 'Bucket_Prepaid_balance-M-3')
                   .withColumnRenamed('Raw_Baja', 'Raw_Baja-M-3')
                   .withColumnRenamed('Raw_Cierre', 'Raw_Cierre-M-3')

                  )

- Llamadas a competidores:

In [327]:
competitor_calls_M_2 = (competitor_calls_M_2
                        .withColumnRenamed('group', 'group_M-2')
                        .withColumnRenamed('total_duration', 'total_duration_M-2')
                        .withColumnRenamed('times_called', 'times_called_M-2')
                        .withColumnRenamed('event_date', 'event_date_M-2')
                        .withColumnRenamed('load_date', 'load_date_M-2')
                        .withColumnRenamed('competitor', 'competitor_M-2')
                       )

In [328]:
competitor_calls_M_3 = (competitor_calls_M_3
                        .withColumnRenamed('group', 'group_M-3')
                        .withColumnRenamed('total_duration', 'total_duration_M-3')
                        .withColumnRenamed('times_called', 'times_called_M-3')
                        .withColumnRenamed('event_date', 'event_date_M-3')
                        .withColumnRenamed('load_date', 'load_date_M-3')
                        .withColumnRenamed('competitor', 'competitor_M-3')
                       )

- Unimos todo lo anterior:

In [329]:
data_CAR_SRV=(serviceDF_load.select('msisdn', 'NUM_CLIENTE')
      .join(netscout_apps_load, 'msisdn', 'leftouter')
      .join(custAggServices, 'NUM_CLIENTE', 'leftouter')
      .join(df_ccc_load_M_2, 'msisdn', 'leftouter')
      .join(df_ccc_load_M_3, 'msisdn', 'leftouter')
      .join(competitor_calls_M_2, 'msisdn', 'leftouter')
      .join(competitor_calls_M_3, 'msisdn', 'leftouter')
             ).drop('NUM_CLIENTE')

In [330]:
data_CAR_SRV_prepared = (data_CAR_SRV.withColumn('ClosingDay',lit(ClosingDay)).drop(*['nacionalidad']))

In [331]:
prepaid_final = df_final.join(data_CAR_SRV_prepared, on = 'msisdn', how = 'inner')

-----

### IMPORTANTE: El atributo `month_of_analysis` se utiliza para separar las muestras de entrenamiento de las muestras de predicción después de haber ejecutado el General Model Trainer:
   - Para el IDS de entrenamiento: `month_of_analysis` = *Month of training = M-1*.
   - Para el IDS de predicción: `month_of_analysis` = *Month of prediction = M*.

In [332]:
prepaid_final = prepaid_final.withColumn('month_of_analysis', lit('Month of training = M-1').cast(StringType()))
prepaid_final = prepaid_final.withColumn('month_of_analysis', lit('Month of prediction = M').cast(StringType()))

---

In [333]:
drop_cols = [
 'TRATAMIENTO',
 'NOMBRE',
 'PRIM_APELLIDO',
 'SEG_APELLIDO',
 'CLASE_CLI_COD_CLASE_CLIENTE',
 'DIR_LINEA1',
 'DIR_LINEA2',
 'DIR_LINEA3',
 'COD_ESTADO_GENERAL',
 'NOM_COMPLETO',
 'DIR_FACTURA1',
 'DIR_FACTURA2',
 'DIR_FACTURA3',
 'DIR_FACTURA4',
 'TRAT_FACT',
 'NOMBRE_CLI_FACT',
 'APELLIDO1_CLI_FACT',
 'APELLIDO2_CLI_FACT',
 'DIR_NUM_DIRECCION',
 'NIF_CLIENTE',
 'FECHA_NACI',
 'METODO_PAGO',
 'PUBLICIDAD',
 'ENCUESTAS',
 'CTA_CORREO_CONTACTO',
 'CTA_CORREO',
 'FACTURA_CATALAN',
 'FACTURA_ELECTRONICA',
 'SUPEROFERTA',
 'NIF_FACTURACION',
 'X_PUBLICIDAD_EMAIL',
 'CICLO',
 'x_tipo_cuenta_corp',
 'x_antiguedad_cuenta',
 'x_datos_navegacion',
 'x_datos_trafico',
 'x_cesion_datos',
 'x_user_facebook',
 'x_user_twitter',
 'FLG_LORTAD',
 'FLG_ROBINSON',
 'X_FORMATO_FACTURA',
 'X_IDIOMA_FACTURA',
 'FECHA_MIGRACION',
 'ENCUESTAS2',
 'cta_correo_flag',
 'cta_correo_server',
 'Instancia_P',
 'OBJID',
 'TACADA',
 'FX_SRV_BASIC',
 'PRICE_SRV_BASIC',
 'RGU',
 'TIPO_SIM',
 'IMSI',
 'TARIFF',
 'FX_TARIFF',
 'DESC_TARIFF',
 'PRICE_TARIFF',
 'VOICE_TARIFF',
 'FX_VOICE_TARIFF',
 'PRICE_VOICE_TARIFF',
 'DATA',
 'FX_DATA',
 'PRICE_DATA',
 'DTO_LEV1',
 'FX_DTO_LEV1',
 'PRICE_DTO_LEV1',
 'DTO_LEV2',
 'FX_DTO_LEV2',
 'PRICE_DTO_LEV2',
 'DTO_LEV3',
 'FX_DTO_LEV3',
 'PRICE_DTO_LEV3',
 'DATA_ADDITIONAL',
 'FX_DATA_ADDITIONAL',
 'PRICE_DATA_ADDITIONAL',
 'OOB',
 'FX_OOB',
 'PRICE_OOB',
 'NETFLIX_NAPSTER',
 'FX_NETFLIX_NAPSTER',
 'PRICE_NETFLIX_NAPSTER',
 'ROAMING_BASIC',
 'FX_ROAMING_BASIC',
 'PRICE_ROAMING_BASIC',
 'ROAM_USA_EUR',
 'FX_ROAM_USA_EUR',
 'PRICE_ROAM_USA_EUR',
 'ROAM_ZONA_2',
 'FX_ROAM_ZONA_2',
 'PRICE_ROAM_ZONA_2',
 'CONSUM_MIN',
 'FX_CONSUM_MIN',
 'PRICE_CONSUM_MIN',
 'SIM_VF',
 'HOMEZONE',
 'FX_HOMEZONE',
 'PRICE_HOMEZONE',
 'MOBILE_HOMEZONE',
 'FBB_UPGRADE',
 'FX_FBB_UPGRADE',
 'PRICE_FBB_UPGRADE',
 'DECO_TV',
 'FX_DECO_TV',
 'PRICE_DECO_TV',
 'NUM_SERIE_DECO_TV',
 'OBJID_DECO_TV',
 'TV_CUOTA_ALTA',
 'FX_TV_CUOTA_ALTA',
 'PRICE_TV_CUOTA_ALTA',
 'TV_TARIFF',
 'FX_TV_TARIFF',
 'PRICE_TV_TARIFF',
 'TV_CUOT_CHARGES',
 'FX_TV_CUOT_CHARGES',
 'PRICE_TV_CUOT_CHARGES',
 'TV_PROMO',
 'FX_TV_PROMO',
 'PRICE_TV_PROMO',
 'TV_PROMO_USER',
 'FX_TV_PROMO_USER',
 'PRICE_TV_PROMO_USER',
 'TV_ABONOS',
 'FX_TV_ABONOS',
 'PRICE_TV_ABONOS',
 'TV_LOYALTY',
 'FX_TV_LOYALTY',
 'PRICE_TV_LOYALTY',
 'TV_SVA',
 'FX_TV_SVA',
 'PRICE_TV_SVA',
 'FOOTBALL_TV',
 'FX_FOOTBALL_TV',
 'PRICE_FOOTBALL_TV',
 'MOTOR_TV',
 'FX_MOTOR_TV',
 'PRICE_MOTOR_TV',
 'PVR_TV',
 'FX_PVR_TV',
 'PRICE_PVR_TV',
 'ZAPPER_TV',
 'FX_ZAPPER_TV',
 'PRICE_ZAPPER_TV',
 'TRYBUY_TV',
 'FX_TRYBUY_TV',
 'PRICE_TRYBUY_TV',
 'TRYBUY_AUTOM_TV',
 'FX_TRYBUY_AUTOM_TV',
 'PRICE_TRYBUY_AUTOM_TV',
 'CAMPO1',
 'CAMPO2',
 'CAMPO3',
 'flag_msisdn_err',
 'TV_TOTAL_CHARGES',
 'MOBILE_BAM_TOTAL_CHARGES',
 'msisdn_CAR',
 'Fecha_ejecucion'
]

In [334]:
msisdn_cols = [c for c in prepaid_final.columns if 'MSISDN_' in c]

In [335]:
prepaid_final = prepaid_final.drop(*drop_cols+msisdn_cols)

In [336]:
prepaid_final.groupBy('Churned').count().show()

+-------+-------+
|Churned|  count|
+-------+-------+
|   null|1521707|
+-------+-------+



---

## Escribimos los datos almacenados en la BDP.

**IMPORTANTE**: Cuando estemos almacenando el IDS de una nueva ejecución en una tabla de la BDP, es necesario utilizar el modo 'overwrite'. Después, para ensamblar las muestras de entrenamiento sobre las muestras de predicción, es necesario que utilicemos el modo 'append', para que no borre los registros anteriormente guardados.

In [337]:
prepaid_final.write.saveAsTable('tests_es.carnaum2_churn_prepago_ids', format='parquet', mode='append') # Use *mode = 'overwrite'* or 'append'

In [172]:
end_time = dt.datetime.now()
total_time = 2*str(int(end_time - start_time))
total_time_sec=total_time.total_seconds() 
print('Tiempo de ejecución ' + total_time+' segundos')

TypeError: int() argument must be a string or a number, not 'datetime.timedelta'

In [338]:
spark.stop()