In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 300)

import matplotlib.pyplot as plt

from red_agent.common.utils.hdfs_generic import *
import os

# --- Spark resource settings -------------------------------------------------
MAX_N_EXECUTORS = 4
MIN_N_EXECUTORS = 1
N_CORES_EXECUTOR = 4
EXECUTOR_IDLE_MAX_TIME = 120
EXECUTOR_MEMORY = '16g'
DRIVER_MEMORY = '8g'
N_CORES_DRIVER = 1
MEMORY_OVERHEAD = N_CORES_EXECUTOR * 2048
QUEUE = "root.datascience.normal"
MAX_PORT_RETRY = 100
# NOTE(review): N_CORES_EXECUTOR, N_CORES_DRIVER and MEMORY_OVERHEAD are not
# referenced by any option assembled in this cell -- confirm whether they
# were meant to be passed to Spark.

# --- Command-line options -----------------------------------------------------
# Start from whatever SPARK_COMMON_OPTS already holds in the environment and
# append each fragment in order.
_option_fragments = [
    os.environ.get('SPARK_COMMON_OPTS', ''),
    " --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY),
    " --conf spark.shuffle.manager=tungsten-sort",
    "  --queue %s" % QUEUE,
    " --conf spark.port.maxRetries=%s" % (MAX_PORT_RETRY),
    # Dynamic allocation configuration
    " --conf spark.dynamicAllocation.enabled=true",
    " --conf spark.shuffle.service.enabled=true",
    " --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS),
    " --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS),
    " --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME),
]
SPARK_COMMON_OPTS = "".join(_option_fragments)

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase: one nodes*.yaml file per entry, all located
# under <BDA_ENV>/scripts/properties/red_agent/ and passed as a single
# comma-separated --files argument.
_NODE_FILES = [
    "nodes.yaml",
    "nodes-de.yaml",
    "nodes-es.yaml",
    "nodes-ie.yaml",
    "nodes-it.yaml",
    "nodes-pt.yaml",
    "nodes-uk.yaml",
]
SPARK_COMMON_OPTS += " --files " + ",".join(
    ["{}/scripts/properties/red_agent/{}".format(BDA_ENV, _node_file)
     for _node_file in _NODE_FILES]
)

# Publish the assembled options through the environment. PYSPARK_SUBMIT_ARGS
# carries the same options followed by the "pyspark-shell" token.
# NOTE(review): SPARK_COMMON_OPTS is seeded from this same environment
# variable earlier in the cell, so re-running the cell in a live kernel
# appends every option a second time.
os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

# run_sc is not defined in this notebook -- presumably it comes from the
# star import of red_agent.common.utils.hdfs_generic; confirm.
sc, sparkSession, sqlContext = run_sc()
# Call form of print: same output under Python 2, and valid under Python 3.
print(sc.defaultParallelism)

2


In [3]:
# This literal_eval is needed since 
# we have to read from a textfile
# which is formatted as python objects.
# It is totally safe.
from ast import literal_eval

# Standard Library stuff:
from functools import partial
from datetime import date, timedelta, datetime

# Numpy stuff
from numpy import (nan as np_nan, round as np_round, int64 as np_int64)
import numpy as np


# Spark stuff
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.sql.functions import (udf, col, decode, when, lit, lower, concat,
                                   translate, count, sum as sql_sum, max as sql_max, min as sql_min,
                                   round, 
                                   mean, stddev, datediff,
                                   length,
                                   countDistinct,
                                   hour, date_format, collect_set, collect_list,
                                   year, month, dayofmonth,
                                   rank, expr, lag, coalesce, row_number,
                                   isnull, isnan,
                                   unix_timestamp,
                                   split
                                  )
from pyspark.sql.types import DoubleType, StringType, IntegerType, ArrayType, FloatType

from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.window import Window

import json
from collections import OrderedDict

from subprocess import Popen, PIPE
import datetime, calendar

# Import my own libraries
# from notebooks_utils import deliverTable

In [4]:
# Configure the session builder one setting at a time, then obtain the session.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists -- presumably the one started by run_sc() above; confirm.
_sessionBuilder = SparkSession.builder.appName("One Model Per Campaign")
_sessionBuilder = _sessionBuilder.master("yarn")
_sessionBuilder = _sessionBuilder.config("spark.submit.deployMode", "client")
_sessionBuilder = _sessionBuilder.config("spark.ui.showConsoleProgress", "true")
_sessionBuilder = _sessionBuilder.enableHiveSupport()
spark = _sessionBuilder.getOrCreate()

# sc = spark.sparkContext

## Preprocessing

In [5]:
# Calculate the current month and year so that campaigns from two months can be read.
# Both values are taken from a single timestamp: two separate now() calls
# could fall on opposite sides of a month/year rollover and yield a
# mismatched (year, month) pair.
_now = datetime.datetime.now()
currentMonth = _now.month
currentYear = _now.year

In [7]:
# Display the month number computed in the previous cell.
currentMonth

6

In [8]:
# Functions
def reduceLabelSet(label_set):
    """Collapse a collection of labels into the single highest-priority one.

    Priority, highest first: 'Target_Positive', 'Control_Positive',
    'Target_Negative', 'Control_Negative'. Positive responses therefore win
    over negative ones, and within the same sign Target wins over Control.

    :param label_set: container of label strings (anything supporting ``in``).
    :return: the first label from the priority list found in ``label_set``,
             or 'Ignore' when none of them is present.
    """
    labelsByPriority = (
        'Target_Positive',
        'Control_Positive',
        'Target_Negative',
        'Control_Negative',
    )
    for candidate in labelsByPriority:
        if candidate in label_set:
            return candidate
    return 'Ignore'

# UDFs
def _stripCampaignPrefix(campaignCode):
    """Drop the first two '_'-separated tokens of a campaign code.

    Example: 'AUTOMMES_PXXXC_BTS_XSELL_FUT' -> 'BTS_XSELL_FUT'.

    :param campaignCode: campaign code string, or None for a null cell.
    :return: the remaining tokens re-joined with '_'; None when the input is
             None (the original lambda raised AttributeError on null values).
    """
    if campaignCode is None:
        return None
    return '_'.join(campaignCode.split('_')[2:])

reduceCampaignCode = udf(_stripCampaignPrefix, StringType())
# This rebinds the name reduceLabelSet from the plain Python function to its
# Spark UDF wrapper, so the defining cell has to be re-run as a whole (def
# included) -- running this line twice would wrap a UDF in a UDF.
reduceLabelSet = udf(reduceLabelSet, StringType())

## Read table Contact Hist

In [10]:
# Raw campaign contact history. load() is given no explicit format, so Spark's
# default data source applies (the path suggests parquet files).
NifContactHist = (spark.read.load("/data/raw/vf_es/campaign/NifContactHist/1.0/parquet/"))

In [12]:
# Contact history restricted to campaigns whose code contains 'PXXX',
# with the channels below excluded and rows flagged as deleted removed.
_excludedCanalPatterns = ['%NBA%', '%PO%', '%PER%', '%PMG%', '%PROMPTOS%', '%Tienda%']

_contactRows = NifContactHist.where(col('campaigncode').like('%PXXX%'))
for _pattern in _excludedCanalPatterns:
    _contactRows = _contactRows.where(~col('canal').like(_pattern))
_contactRows = _contactRows.where(col('flag_borrado') == 0)

# If CampaignCode is AUTOMMES_PXXXC_BTS_XSELL_FUT
# with this function, only the string BTS_XSELL_FUT is kept
_contactCampaignType = reduceCampaignCode(col('CampaignCode'))

# Group derived from the cellcode prefix: CU -> Universal, CC -> Control,
# anything else -> Target.
_contactGrupo = (when(col('cellcode').startswith('CU'), 'Universal')
                 .when(col('cellcode').startswith('CC'), 'Control')
                 .otherwise('Target'))

NIF_Contact_hist = (
    _contactRows
    .withColumn('CampaignType', _contactCampaignType)
    .withColumn('Grupo', _contactGrupo)
    .select('year', 'month', 'day', 'cif_nif',
            'CampaignCode', 'CampaignType', 'Grupo',
            'creatividad', 'treatmentcode', 'canal',
            'contactdatetime')
)

In [14]:
# Show the first row as a sanity check.
# NOTE(review): the rendered row includes a customer identifier (cif_nif) --
# consider clearing the output before sharing the notebook.
NIF_Contact_hist.head()

Row(year=2017, month=10, day=25, cif_nif=u'32444203Y', CampaignCode=u'20161001_PXXXT_REM__FUNC_ECARE', CampaignType=u'REM__FUNC_ECARE', Grupo=u'Target', creatividad=u'SMS_REM__FUNC_ECARE', treatmentcode=u'148435407', canal=u'SMS', contactdatetime=datetime.datetime(2017, 10, 25, 11, 15, 50))

### Take the most common campaigns from two months ago

In [8]:
# Campaign codes of the contacts made two calendar months before the current
# month, ordered from most to least frequent.
# The target (year, month) is computed on a running month index so that
# January and February roll back into the previous year; a plain
# `currentMonth - 2` would produce month 0 or -1 and match no rows.
_contactYear, _contactMonthIndex = divmod(currentYear * 12 + (currentMonth - 1) - 2, 12)
_contactMonth = _contactMonthIndex + 1

orderedContactCampaignCodes = \
(NIF_Contact_hist
 .where(col('year') == _contactYear)
 .where(col('month') == _contactMonth)
 .groupBy(col('CampaignCode'))
 .count()
 .orderBy(col('count').desc())
 .select(col('CampaignCode'))
 .collect()
)

In [9]:
# Most frequent contact campaign code: first row of the count-descending list.
# NOTE(review): indexing [0] raises IndexError when the query returned no rows.
firstCampaignCode = orderedContactCampaignCodes[0].CampaignCode

In [10]:
# Display the selected contact campaign code.
firstCampaignCode

u'20180319_PXXXT_PORT_TRANS_VFTV'

## Read table Response Hist

In [11]:
# Raw campaign response history. load() is given no explicit format, so Spark's
# default data source applies (the path suggests parquet files).
NifResponseHist = (spark.read.load("/data/raw/vf_es/campaign/NifResponseHist/1.0/parquet/"))

In [12]:
# Response history restricted to campaigns whose code contains 'PXXX'
# and to rows not flagged as deleted.
_responseRows = (NifResponseHist
                 .where(col('campaigncode').like('%PXXX%'))
                 .where(col('flag_borrado') == 0))

# If CampaignCode is AUTOMMES_PXXXC_BTS_XSELL_FUT
# with this function, only the string BTS_XSELL_FUT is kept
_responseCampaignType = reduceCampaignCode(col('CampaignCode'))

# Group derived from the cellcode prefix: CU -> Universal, CC -> Control,
# anything else -> Target.
_responseGrupo = (when(col('cellcode').startswith('CU'), 'Universal')
                  .when(col('cellcode').startswith('CC'), 'Control')
                  .otherwise('Target'))

_responseOutputColumns = ['year', 'month', 'day', 'cif_nif',
                          'CampaignCode', 'CampaignType', 'Grupo',
                          'creatividad', 'treatmentcode', 'canal',
                          'responsedatetime']

NIF_Response_hist = (
    _responseRows
    .withColumn('CampaignType', _responseCampaignType)
    .withColumn('Grupo', _responseGrupo)
    .select(*_responseOutputColumns)
)

In [13]:
# Campaign codes of the responses recorded in the calendar month before the
# current one, ordered from most to least frequent.
# The target (year, month) is computed on a running month index so that
# January rolls back into December of the previous year; a plain
# `currentMonth - 1` would produce month 0 and match no rows.
_responseYear, _responseMonthIndex = divmod(currentYear * 12 + (currentMonth - 1) - 1, 12)
_responseMonth = _responseMonthIndex + 1

orderedResponseCampaignCodes = \
(NIF_Response_hist
 .where(col('year') == _responseYear)
 .where(col('month') == _responseMonth)
 .groupBy(col('CampaignCode'))
 .count()
 .orderBy(col('count').desc())
 .select(col('CampaignCode'))
 .collect()
)

In [14]:
# Most frequent response campaign code: first row of the count-descending list.
# NOTE(review): indexing [0] raises IndexError when the query returned no rows.
firstResponseCampaignCode = orderedResponseCampaignCodes[0].CampaignCode

In [15]:
# Build driver-materialised samples (at most 200000 rows each) of the
# responses and contacts belonging to the most frequent response campaign.
# The source schema is passed explicitly to createDataFrame: without it Spark
# re-infers types from the Python rows, which fails on an empty result or an
# all-null column and widens integer columns to long.
_responsesOfCampaign = NIF_Response_hist.where(col('CampaignCode')==str(firstResponseCampaignCode))
sample_NIF_Response_hist = spark.createDataFrame(_responsesOfCampaign.take(200000),
                                                 schema=_responsesOfCampaign.schema)

# Contacts are filtered by the *response* campaign code so that both samples
# refer to the same campaign.
_contactsOfCampaign = NIF_Contact_hist.where(col('CampaignCode')==str(firstResponseCampaignCode))
sample_NIF_Contact_hist = spark.createDataFrame(_contactsOfCampaign.take(200000),
                                                schema=_contactsOfCampaign.schema)

In [16]:
# Show the first sampled contact row as a sanity check.
sample_NIF_Contact_hist.head()

Row(year=2018, month=6, day=4, cif_nif=u'29467236W', CampaignCode=u'AUTOMMES_PXXXC_CH_N2', CampaignType=u'CH_N2', Grupo=u'Target', creatividad=u'CONV_FIBRA_NO_HR', treatmentcode=u'148613732', canal=u'SAT', contactdatetime=datetime.datetime(2018, 6, 4, 18, 49, 39))

In [17]:
# Show the first sampled response row as a sanity check.
sample_NIF_Response_hist.head()

Row(year=2018, month=6, day=7, cif_nif=u'66463538H', CampaignCode=u'AUTOMMES_PXXXC_CH_N2', CampaignType=u'CH_N2', Grupo=u'Target', creatividad=u'SOLO_MOVIL_HUELLA_MM', treatmentcode=u'148601394', canal=u'SAT', responsedatetime=datetime.datetime(2018, 6, 7, 0, 0))

In [18]:
# Responses reduced to the join keys plus the response timestamp: every other
# column is dropped and exact duplicates are removed.
_responseKeys = sample_NIF_Response_hist
for _unused in ['year', 'month', 'Grupo', 'canal', 'day', 'CampaignType', 'creatividad']:
    _responseKeys = _responseKeys.drop(col(_unused))
_responseKeys = _responseKeys.drop_duplicates()

# Left outer join keeps every contact; contacts with no matching response get
# a null responsedatetime.
_contactsWithResponses = sample_NIF_Contact_hist.join(
    _responseKeys,
    ['cif_nif', 'CampaignCode', 'treatmentcode'],
    how='left_outer'
)

# 1 when a response exists for the contact, 0 otherwise.
_isResponder = when(col('responsedatetime').isNotNull(), 1).otherwise(0)

# Label = <Grupo>_<Positive|Negative> for Control and Target contacts;
# every other group is labelled 'Ignore'.
_inControl = col('Grupo') == 'Control'
_inTarget = col('Grupo') == 'Target'
_responded = col('EsRespondedor') == 1
_notResponded = col('EsRespondedor') == 0
_label = (when(_inControl & _notResponded, 'Control_Negative')
          .when(_inControl & _responded, 'Control_Positive')
          .when(_inTarget & _notResponded, 'Target_Negative')
          .when(_inTarget & _responded, 'Target_Positive')
          .otherwise('Ignore'))

nif_with_labels = (
    _contactsWithResponses
    .withColumn('EsRespondedor', _isResponder)
    .withColumn('Label', _label)
    .select('year', 'month', 'day', 'cif_nif', 'CampaignCode', 'CampaignType', 'Grupo',
            'creatividad', 'canal', 'contactdatetime', 'Label')
    .orderBy('year', 'month', 'day', 'cif_nif')
)

In [19]:
# Collect all possible responses in label_set
# Sometimes, under a same CampaignCode, there can be some contradictory responses
_labelsPerContactDay = (
    nif_with_labels
    .groupBy('year', 'month', 'day', 'cif_nif')
    .agg(collect_list('Label').alias('label_set'))
)

# One row per (year, month, day, cif_nif): the campaign code is stamped as a
# constant and the collected labels are reduced to a single one by the
# reduceLabelSet UDF. The result is cached for reuse.
labeled_datamart_nif_level = (
    _labelsPerContactDay
    .withColumn('CampaignCode', lit(str(firstResponseCampaignCode)))
    .withColumn('Label', reduceLabelSet(col('label_set')))
    .drop(col('label_set'))
    .cache()
)

In [28]:
# Show the first labelled row as a sanity check.
labeled_datamart_nif_level.head()

Row(year=2018, month=5, day=3, cif_nif=u'00002490X', CampaignCode=u'AUTOMMES_PXXXC_CH_N2', Label=u'Target_Positive')

## Read attributes from customer base

In [20]:
# Customer attributes read from a metastore table.
# NOTE(review): the table name ends in 20180331 -- presumably a fixed
# snapshot date; confirm it is the intended one for the campaigns analysed.
customerAttributes = (spark.read.table('tests_es.rbl_ids_srv_20180331'))

In [21]:
# Names of the customer-base columns that start with 'GNV'.
colsGNV_CustomerAttributes = [column
                              for column in customerAttributes.columns
                              if column.startswith('GNV')]
# One collect_list aggregate per GNV column, in the same order as the names.
collectList_colsGNV_CustomerAttributes = [collect_list(col(column))
                                          for column in colsGNV_CustomerAttributes]

In [22]:
def clean_GNV_attributes(column_GNV_values):
    """Average the collected values of one GNV attribute.

    :param column_GNV_values: list of numeric values (as produced by
        collect_list), possibly empty, or None.
    :return: -1.0 when there are no values (empty list or None), otherwise
        the arithmetic mean as a plain Python float.
    """
    # `np.float` was only an alias of the builtin float and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    if column_GNV_values is None or len(column_GNV_values) == 0:
        return -1.0
    # float() turns the NumPy scalar into a plain Python float, as the
    # original .tolist() call did.
    return float(np.mean(column_GNV_values))
    
# Spark UDF wrapper around clean_GNV_attributes; its result is declared as FloatType.
clean_GNV_attributes_UDF = udf(clean_GNV_attributes,(FloatType()))

In [23]:
# Per-customer lists of GNV attribute values, restricted to customers that
# appear in the labelled datamart.
# The restriction is a left semi join against the distinct cif_nif values: it
# filters customers without adding any label column and without multiplying
# rows. The previous inner join pulled year/month/day/cif_nif/CampaignCode/
# Label in here, and the later join with labeled_datamart_nif_level under
# "Join attributes with labeled data" then duplicated those columns and
# repeated each customer once per pair of label rows.
_labelledCustomers = labeled_datamart_nif_level.select(col('cif_nif')).distinct()

customerAggregatedAttributes = \
(customerAttributes
 .select(['NIF_CLIENTE']+
         colsGNV_CustomerAttributes
        )
 .groupBy(col('NIF_CLIENTE'))
 .agg(*(collectList_colsGNV_CustomerAttributes))
 .join(_labelledCustomers,
       col('NIF_CLIENTE') == col('cif_nif'),
       how='leftsemi'
      )
 .orderBy(col('NIF_CLIENTE'))
)

In [24]:
# Columns of the aggregated frame whose name contains 'GNV'. A substring test
# is used (not startswith) because the aggregate columns are named like
# collect_list(GNV_hour_0_W_MOU).
colsGNV_CustomerAggregatedAttributes = [(column) for column in customerAggregatedAttributes.columns if 'GNV' in column]

In [25]:
# Replace every collected-list column by its averaged value, stored under the
# bare attribute name.
for column in colsGNV_CustomerAggregatedAttributes:
    # collect_list(GNV_hour_0_W_MOU)  -> GNV_hour_0_W_MOU
    cleanName = column.split('(')[1][:-1]
    averagedValue = clean_GNV_attributes_UDF(column)
    withAverage = customerAggregatedAttributes.withColumn(cleanName, averagedValue)
    customerAggregatedAttributes = withAverage.drop(column)

## Join attributes with labeled data

In [29]:
# Inner join of the aggregated attributes with the labelled datamart,
# matching NIF_CLIENTE against cif_nif.
_sameCustomer = customerAggregatedAttributes.NIF_CLIENTE == labeled_datamart_nif_level.cif_nif
customerAggregatedAttributesLabeled = customerAggregatedAttributes.join(
    labeled_datamart_nif_level,
    _sameCustomer,
    how='inner'
)

## H2O

In [32]:
import h2o
from h2o.automl import H2OAutoML

In [31]:
# Connect to an H2O instance; per the log below, a local server is started
# when none is found at localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_79"; Java(TM) SE Runtime Environment (build 1.7.0_79-b15); Java HotSpot(TM) 64-Bit Server VM (build 24.79-b02, mixed mode)
  Starting server from /opt/cloudera/parcels/Anaconda-2.5.0/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmppcac4C
  JVM stdout: /tmp/tmppcac4C/h2o_adesant3_started_from_python.out
  JVM stderr: /tmp/tmppcac4C/h2o_adesant3_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.16.0.4
H2O cluster version age:,5 months and 2 days !!!
H2O cluster name:,H2O_from_python_adesant3_jse8b1
H2O cluster total nodes:,1
H2O cluster free memory:,26.60 Gb
H2O cluster total cores:,56
H2O cluster allowed cores:,56
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [115]:
# Per-msisdn table loaded with Spark's default data source (no format given).
# NOTE(review): the path is under another user's /tmp directory -- it may not
# be persistent; confirm a stable location.
ccc_msisdn = (spark.read.load("/tmp/bbergua/ccc/msisdn/"))


In [122]:
# Count the rows per combination of the three counters below, keeping only
# rows where both Incidencia_Provision_Movil and Consulta_Tecnica_Movil are
# strictly positive.
_bothCountersPositive = ((col('Incidencia_Provision_Movil') > 0) &
                         (col('Consulta_Tecnica_Movil') > 0))
_countersToGroupBy = ['Incidencia_Provision_Movil',
                      'Voice_and_mobile_data_incidences_and_support_Referrals',
                      'Consulta_Tecnica_Movil']
_combinationCounts = (ccc_msisdn
                      .where(_bothCountersPositive)
                      .groupBy(*_countersToGroupBy)
                      .count())
_combinationCounts.show()

+--------------------------+------------------------------------------------------+----------------------+-----+
|Incidencia_Provision_Movil|Voice_and_mobile_data_incidences_and_support_Referrals|Consulta_Tecnica_Movil|count|
+--------------------------+------------------------------------------------------+----------------------+-----+
|                        52|                                                     0|                     1|    1|
|                        13|                                                     0|                    19|    1|
|                        14|                                                     0|                    23|    1|
|                        51|                                                     0|                    23|    1|
|                       129|                                                    82|                  3548|    1|
|                        33|                                                     0|             

In [119]:
# Print the column names and types of ccc_msisdn.
ccc_msisdn.printSchema()

root
 |-- msisdn: string (nullable = true)
 |-- Pagar_menos: long (nullable = true)
 |-- Incidencia_Provision_Neba: long (nullable = true)
 |-- Incidencia_Provision_Fibra: long (nullable = true)
 |-- Incidencia_Provision_DSL: long (nullable = true)
 |-- Incidencia_Tecnica: long (nullable = true)
 |-- Incidencia_SGI: long (nullable = true)
 |-- Incidencia_Resto: long (nullable = true)
 |-- Incidencia_Provision_Movil: long (nullable = true)
 |-- Resultado_No_Aplica: long (nullable = true)
 |-- Resultado_Informacion: long (nullable = true)
 |-- Resultado_Solucionado: long (nullable = true)
 |-- Resultado_Retenido: long (nullable = true)
 |-- Resultado_No_Retenido: long (nullable = true)
 |-- Resultado_Escalo: long (nullable = true)
 |-- Resultado_Envio_tecnico: long (nullable = true)
 |-- Resultado_Transferencia: long (nullable = true)
 |-- Resultado_Abono: long (nullable = true)
 |-- Resultado_Bajas: long (nullable = true)
 |-- Resultado_Reclamacion: long (nullable = true)
 |-- Device_up