In [1]:
from common.src.main.python.utils.hdfs_generic import *
import os

MAX_N_EXECUTORS=15
MIN_N_EXECUTORS=1
N_CORES_EXECUTOR=4
EXECUTOR_IDLE_MAX_TIME=120
EXECUTOR_MEMORY='16g'
DRIVER_MEMORY='16g'
N_CORES_DRIVER=1
MEMORY_OVERHEAD=N_CORES_EXECUTOR*2048
QUEUE="root.datascience.normal"
BDA_CORE_VERSION="1.0.0"

SPARK_COMMON_OPTS=os.environ.get('SPARK_COMMON_OPTS', '')
SPARK_COMMON_OPTS+=" --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY)
SPARK_COMMON_OPTS+=" --conf spark.shuffle.manager=tungsten-sort"
SPARK_COMMON_OPTS+="  --queue %s" % QUEUE

# Dynamic allocation configuration
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.shuffle.service.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME)

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase
SPARK_COMMON_OPTS+=" --files \
{}/scripts/properties/red_agent/nodes.properties,\
{}/scripts/properties/red_agent/nodes-de.properties,\
{}/scripts/properties/red_agent/nodes-es.properties,\
{}/scripts/properties/red_agent/nodes-ie.properties,\
{}/scripts/properties/red_agent/nodes-it.properties,\
{}/scripts/properties/red_agent/nodes-pt.properties,\
{}/scripts/properties/red_agent/nodes-uk.properties".format(*[BDA_ENV]*7)

os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

print os.environ.get('SPARK_COMMON_OPTS', '')
print os.environ.get('PYSPARK_SUBMIT_ARGS', '')

sc, sparkSession, sqlContext = run_sc()
print sc.defaultParallelism

 --queue root.datascience.normal  --conf spark.port.maxRetries=50  --conf spark.network.timeout=10000000  --conf spark.executor.heartbeatInterval=60  --conf spark.yarn.executor.memoryOverhead=2G  --conf spark.sql.broadcastTimeout=1200  --master yarn --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.kryoserializer.buffer.max=1g --py-files /var/SP/data/home/bbergua/artifacts/bda-core-ra-complete-assembly-2.0.0.jar,/var/SP/data/home/bbergua/artifacts/common.zip,/var/SP/data/home/bbergua/artifacts/graphframes.zip,/var/SP/data/home/bbergua/artifacts/scripts.zip,/var/SP/data/home/bbergua/artifacts/xgboost4j-spark-2.1.1-0.7-jar-with-dependencies.jar --files /var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-de.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-es.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-ie.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-it.properties,/var/SP/dat

In [2]:
import re
import subprocess
# Spark utils
from pyspark.sql.functions import array_contains, col, collect_set, concat, lit, lpad, size, struct, trim, udf, when
from pyspark.sql.types import IntegerType, StringType

In [3]:
spark = (SparkSession.builder
         .appName("VF-ES Master customers services")
         .master("yarn")
         .config("spark.submit.deployMode", "client")
         .config("spark.ui.showConsoleProgress", "true")
         .enableHiveSupport()
         .getOrCreate()
         )
print 'Spark version:', spark.version

Spark version: 2.1.0.cloudera1


Define some useful functions for generating customer's segment (Prepaid, Mobile-only, Convergent, and others)

In [4]:
# Replace column names of the type 'fun(colname)' by 'fun_colname'
# Also replace any character not in [a-zA-Z0-9_.] with '_'
def fix_column_names(df):
        names = df.schema.names

        for n in names:
            m = re.search('([^()]*)\(([^()]*)\)', n)
            if m is not None:
                # print m.group(0), '->', m.group(1) + '_' + m.group(2)
                df = df.withColumnRenamed(n, m.group(1) + '_' + m.group(2))

            m = re.sub('[^a-zA-Z0-9_.]', '_', n)
            if n != m:
                df = df.withColumnRenamed(n, m)

        return df

# First of all, we need to aggregate MSISDNs by NIF for Oracle customers, and by NUM_CLIENTE and then by NIF for Ono customers
def calculate_vfpos_segment_by_id():
        flags_fields = ['flagvoz', 'flagadsl', 'flagftth'] #, 'flaglpd', 'flaghz', 'flagtivo', 'flagvfbox', 'flagfutbol', 'flagmotor']
        
        data = spark.table('raw_es.vf_pos_ac_final').select(['x_num_ident', 'x_id_red', 'partitioned_month']+flags_fields)
        for col in flags_fields:
            data = data.withColumn(col, data[col].cast(IntegerType()))

        data_by_id = data.groupBy(['x_num_ident', 'partitioned_month']).sum(*flags_fields)
        data_by_id = fix_column_names(data_by_id)

        # Calculate Mobile-Only
        # flags_fixed = ['sum_' + c for c in flags_fields if c != 'flagvoz']
        # mo_condition = (data_by_id['sum_flagvoz'] > 0)
        # for flag in flags_fixed:
        #     if flag in data_by_id.columns:
        #         # print 'Adding fixed flag', flag, 'to mo_condition'
        #         mo_condition = mo_condition & (data_by_id[flag] == 0)
        mo_condition = (data_by_id['sum_flagvoz'] > 0) & (data_by_id['sum_flagadsl'] == 0) | (data_by_id['sum_flagftth'] == 0)
        data_by_id = data_by_id.withColumn('is_mobile_only', when(mo_condition, True).otherwise(False))

        # Calculate Convergent
        # co_condition = None
        # for flag in flags_fixed:
        #     if flag in data_by_id.columns:
        #         # print 'Adding flag', flag, 'to co_condition'
        #         if co_condition is None:
        #             co_condition = (data_by_id[flag] > 0)
        #         else:
        #             co_condition = co_condition | (data_by_id[flag] > 0)
        # 
        # co_condition = (data_by_id['sum_flagvoz'] > 0) & co_condition
        co_condition = (data_by_id['sum_flagadsl'] > 0) | (data_by_id['sum_flagftth'] > 0)
        data_by_id = data_by_id.withColumn('is_convergent', when(co_condition, True).otherwise(False))
        
        data_by_id = data_by_id.withColumn('SEGMENTACION',   when(mo_condition, lit('Mobile-Only'))
                                                            .when(co_condition, lit('Convergent'))
                                                            .otherwise(         lit('Other')))

        data_by_id = data_by_id.withColumnRenamed('x_num_ident', 'nif')
        data_by_id = data_by_id.select(['nif', 'partitioned_month', 'SEGMENTACION']) # , 'is_mobile_only', 'is_convergent'

        # data_by_id.filter('is_mobile_only==FALSE AND is_convergent==FALSE').groupby('partitioned_month').count().sort('partitioned_month').show()
        # data_by_id.groupby(['partitioned_month', 'SEGMENTACION']).count().sort(['partitioned_month', 'SEGMENTACION']).show(60)

        return data_by_id

def calculate_vf_segment_by_id():
    data_vfpre_by_id = spark.table('raw_es.vf_pre_ac_final').select(['num_documento_comprador', 'partitioned_month']).distinct().withColumnRenamed('num_documento_comprador', 'nif').withColumn('SEGMENTACION', lit('Prepaid')).select('nif', 'partitioned_month', 'SEGMENTACION')
    data_vfpos_by_id = calculate_vfpos_segment_by_id()
    data_by_id = data_vfpre_by_id.union(data_vfpos_by_id)
    
    return data_by_id

def calculate_vfpos_segment_by_msisdn():
    data_by_msisdn = spark.table('raw_es.vf_pos_ac_final').select(['x_num_ident', 'x_id_red', 'partitioned_month']).withColumnRenamed('x_id_red', 'msisdn').withColumnRenamed('x_num_ident', 'nif')
    data_by_id = calculate_vfpos_segment_by_id()
    data_by_msisdn = data_by_msisdn.join(data_by_id, ['nif', 'partitioned_month'])
    data_by_msisdn = data_by_msisdn.select('msisdn', 'nif', 'partitioned_month', 'SEGMENTACION')
    
    return data_by_msisdn

def calculate_vf_segment_by_msisdn():
    data_vfpre_by_msisdn = spark.table('raw_es.vf_pre_ac_final').select(['msisdn', 'num_documento_comprador', 'partitioned_month']).distinct().withColumnRenamed('num_documento_comprador', 'nif').withColumn('SEGMENTACION', lit('Prepaid')).select('msisdn', 'nif', 'partitioned_month', 'SEGMENTACION')
    data_vfpos_by_msisdn = calculate_vfpos_segment_by_msisdn()
    data_by_msisdn = data_vfpre_by_msisdn.union(data_vfpos_by_msisdn)
    
    return data_by_msisdn

# TODO: Ono

In [10]:
spark.table('raw_es.vf_pos_ac_final').groupby('partitioned_month').count().sort('partitioned_month', ascending=False).show()

+-----------------+-------+
|partitioned_month|  count|
+-----------------+-------+
|           201801|7054125|
|           201712|7453117|
|           201711|7607784|
|           201710|7585021|
|           201709|7530387|
|           201708|7484317|
|           201707|7466648|
|           201706|7508354|
|           201705|7415714|
|           201704|7420572|
|           201703|7425614|
|           201702|7449182|
|           201701|7414137|
|           201612|7338397|
|           201611|7269925|
|           201610|7268701|
|           201609|6921939|
|           201608|6900407|
+-----------------+-------+



In [11]:
spark.table('raw_es.vf_pre_ac_final').groupby('partitioned_month').count().sort('partitioned_month', ascending=False).show()

+-----------------+-------+
|partitioned_month|  count|
+-----------------+-------+
|           201801|2708572|
|           201712|2730651|
|           201711|2777228|
|           201710|2825192|
|           201709|2821122|
|           201708|2821362|
|           201707|2884347|
|           201706|2874127|
|           201705|2871628|
|           201704|2886848|
|           201703|2916068|
|           201702|2965506|
|           201701|3022831|
|           201612|3030962|
|           201611|3049511|
|           201610|3082074|
|           201609|3115148|
|           201608|3111525|
+-----------------+-------+



In [5]:
master_by_msisdn = calculate_vf_segment_by_msisdn()
master_by_msisdn.groupby('partitioned_month', 'SEGMENTACION').count() \
                .sort('partitioned_month', 'count', ascending=False).show()

+-----------------+------------+-------+
|partitioned_month|SEGMENTACION|  count|
+-----------------+------------+-------+
|           201801| Mobile-Only|6921064|
|           201801|     Prepaid|2708572|
|           201801|  Convergent| 133061|
|           201712| Mobile-Only|7315910|
|           201712|     Prepaid|2730651|
|           201712|  Convergent| 137207|
|           201711| Mobile-Only|7471965|
|           201711|     Prepaid|2777228|
|           201711|  Convergent| 135819|
|           201710| Mobile-Only|7452674|
|           201710|     Prepaid|2825192|
|           201710|  Convergent| 132347|
|           201709| Mobile-Only|7398248|
|           201709|     Prepaid|2821122|
|           201709|  Convergent| 132139|
|           201708| Mobile-Only|7353473|
|           201708|     Prepaid|2821362|
|           201708|  Convergent| 130844|
|           201707| Mobile-Only|7335495|
|           201707|     Prepaid|2884347|
+-----------------+------------+-------+
only showing top

In [6]:
master_by_msisdn.write.mode('overwrite').format('parquet').save('/tmp/bbergua/master_customers_services/msisdn/')
subprocess.call('hdfs dfs -chmod -R o+rx /tmp/bbergua/master_customers_services/msisdn/',  shell=True)
subprocess.call('hdfs dfs -chmod    o+r  /tmp/bbergua/master_customers_services/msisdn/*', shell=True)

0

In [7]:
master_by_id = calculate_vf_segment_by_id()
master_by_id.groupby('partitioned_month', 'SEGMENTACION').count() \
            .sort('partitioned_month', 'count', ascending=False).show()

+-----------------+------------+-------+
|partitioned_month|SEGMENTACION|  count|
+-----------------+------------+-------+
|           201801| Mobile-Only|2874140|
|           201801|     Prepaid|2247113|
|           201801|  Convergent|  55616|
|           201712| Mobile-Only|3228366|
|           201712|     Prepaid|2267022|
|           201712|  Convergent|  57290|
|           201711| Mobile-Only|3410091|
|           201711|     Prepaid|2303454|
|           201711|  Convergent|  55737|
|           201710| Mobile-Only|3433925|
|           201710|     Prepaid|2339810|
|           201710|  Convergent|  53180|
|           201709| Mobile-Only|3452036|
|           201709|     Prepaid|2329992|
|           201709|  Convergent|  53063|
|           201708| Mobile-Only|3468005|
|           201708|     Prepaid|2324060|
|           201708|  Convergent|  52475|
|           201707| Mobile-Only|3486883|
|           201707|     Prepaid|2376711|
+-----------------+------------+-------+
only showing top

In [12]:
master_by_id_seg = master_by_id.select('nif', 'partitioned_month', 'SEGMENTACION') \
    .groupby('nif', 'partitioned_month').agg(collect_set('SEGMENTACION').alias('SEGMENTACION'))
#master_by_id_seg.show()
master_by_id_seg.groupby('partitioned_month', 'SEGMENTACION').count() \
                .sort('partitioned_month', 'count', ascending=False).show()

+-----------------+--------------------+-------+
|partitioned_month|        SEGMENTACION|  count|
+-----------------+--------------------+-------+
|           201801|       [Mobile-Only]|2634156|
|           201801|           [Prepaid]|2001019|
|           201801|[Mobile-Only, Pre...| 239984|
|           201801|        [Convergent]|  49506|
|           201801|[Convergent, Prep...|   6110|
|           201712|       [Mobile-Only]|2963508|
|           201712|           [Prepaid]|1995780|
|           201712|[Mobile-Only, Pre...| 264858|
|           201712|        [Convergent]|  50906|
|           201712|[Convergent, Prep...|   6384|
|           201711|       [Mobile-Only]|3134593|
|           201711|           [Prepaid]|2021636|
|           201711|[Mobile-Only, Pre...| 275498|
|           201711|        [Convergent]|  49417|
|           201711|[Convergent, Prep...|   6320|
|           201710|       [Mobile-Only]|3151842|
|           201710|           [Prepaid]|2051523|
|           201710|[

In [8]:
master_by_id.write.mode('overwrite').format('parquet').save('/tmp/bbergua/master_customers_services/id/')
subprocess.call('hdfs dfs -chmod -R o+rx /tmp/bbergua/master_customers_services/id/',  shell=True)
subprocess.call('hdfs dfs -chmod    o+r  /tmp/bbergua/master_customers_services/id/*', shell=True)

0