In [2]:
from common.src.main.python.utils.hdfs_generic import *
import os

MAX_N_EXECUTORS=15
MIN_N_EXECUTORS=1
N_CORES_EXECUTOR=4
EXECUTOR_IDLE_MAX_TIME=120
EXECUTOR_MEMORY='16g'
DRIVER_MEMORY='16g'
N_CORES_DRIVER=1
MEMORY_OVERHEAD=N_CORES_EXECUTOR*2048
QUEUE="root.datascience.normal"
BDA_CORE_VERSION="1.0.0"

SPARK_COMMON_OPTS=os.environ.get('SPARK_COMMON_OPTS', '')
SPARK_COMMON_OPTS+=" --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY)
SPARK_COMMON_OPTS+=" --conf spark.shuffle.manager=tungsten-sort"
SPARK_COMMON_OPTS+=" --queue %s" % QUEUE

# Dynamic allocation configuration
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.shuffle.service.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME)

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase
SPARK_COMMON_OPTS+=" --files \
{}/scripts/properties/red_agent/nodes.properties,\
{}/scripts/properties/red_agent/nodes-de.properties,\
{}/scripts/properties/red_agent/nodes-es.properties,\
{}/scripts/properties/red_agent/nodes-ie.properties,\
{}/scripts/properties/red_agent/nodes-it.properties,\
{}/scripts/properties/red_agent/nodes-pt.properties,\
{}/scripts/properties/red_agent/nodes-uk.properties".format(*[BDA_ENV]*7)

os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

print os.environ.get('SPARK_COMMON_OPTS', '')
print os.environ.get('PYSPARK_SUBMIT_ARGS', '')

sc, sparkSession, sqlContext = run_sc()
print sc.defaultParallelism

 --queue root.datascience.normal  --conf spark.port.maxRetries=50  --conf spark.network.timeout=10000000  --conf spark.executor.heartbeatInterval=60  --conf spark.yarn.executor.memoryOverhead=2G  --conf spark.sql.broadcastTimeout=1200  --master yarn --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.kryoserializer.buffer.max=1g --py-files /var/SP/data/home/bbergua/artifacts/bda-core-ra-complete-assembly-2.0.0.jar,/var/SP/data/home/bbergua/artifacts/common.zip,/var/SP/data/home/bbergua/artifacts/graphframes.zip,/var/SP/data/home/bbergua/artifacts/scripts.zip,/var/SP/data/home/bbergua/artifacts/xgboost4j-spark-2.1.1-0.7-jar-with-dependencies.jar --files /var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-de.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-es.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-ie.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-it.properties,/var/SP/dat

In [3]:
import re
import subprocess
# Spark utils
from pyspark.sql.functions import array_contains, col, collect_set, concat, lit, lpad, size, struct, trim, udf, when
from pyspark.sql.types import IntegerType, StringType

In [4]:
spark = (SparkSession.builder
         .appName("VF-ES NPS & TNPS")
         .master("yarn")
         .config("spark.submit.deployMode", "client")
         .config("spark.ui.showConsoleProgress", "true")
         .enableHiveSupport()
         .getOrCreate()
         )
print 'Spark version:', spark.version

Spark version: 2.1.0.cloudera1


Define a useful function to rename columns

In [5]:
# Replace column names of the type 'fun(colname)' by 'fun_colname'
# Also replace any character not in [a-zA-Z0-9_.] with '_'
def fix_column_names(df):
        names = df.schema.names

        for n in names:
            m = re.search('([^()]*)\(([^()]*)\)', n)
            if m is not None:
                # print m.group(0), '->', m.group(1) + '_' + m.group(2)
                df = df.withColumnRenamed(n, m.group(1) + '_' + m.group(2))

            m = re.sub('[^a-zA-Z0-9_.]', '_', n)
            if n != m:
                df = df.withColumnRenamed(n, m)

        return df

# Generate TNPS by MSISDN

First of all, read TNPS data, and select only those VDNs that map to the question "¿Recomendarías Vodafone ...?". Then take the answer field and convert it to numerical.

In [67]:
tnps = spark.table('raw_es.tnps').distinct().filter('year == 2018')
#tnps.show()
# TODO: Can I trust in year, month, and day? Or do I have to regenerate them using FechaLLamYDILO?

nums_pregunta_recomendaria = ['4155.0', '4161.0', '4167.0', '4173.0',
                              '4179.0', '4185.0', '4191.0', '4197.0',
                              '5001.0', '5018.0', '5190.0', '5774.0',
                              '5775.0', '5776.0', '5805.0', '5818.0',
                              '5821.0', '5825.0', '5835.0', '5847.0',
                              '5860.0', '5894.0', '5910.0', '5974.0',
                              '6025.0', '6034.0', '6064.0', '6066.0',
                              '6128.0', '6191.0', '6260.0', '6286.0',
                              '6295.0', '6303.0', '6308.0', '6319.0',
                              '6473.0', '6595.0']
# tnps.filter(tnps['Num_Pregunta'] == '4155.0').select('Num_Pregunta', 'Literal').show()
# tnps.filter(tnps['Num_Pregunta'] == '5190.0').select('Num_Pregunta', 'Literal').show()

tnps_nps = tnps.filter(tnps['Num_Pregunta'].isin(nums_pregunta_recomendaria))
vdns = [x.VDN for x in tnps_nps.select('VDN').distinct().collect()]

tnps_nps = tnps_nps.withColumnRenamed('SERIAL_NUMBER', 'msisdn')
tnps_nps = tnps_nps.withColumn('partitioned_month', concat(tnps_nps.year, lpad(tnps_nps.month, 2, '0')))
#tnps_nps.groupby('year', 'month', 'partitioned_month').count().sort('year', 'month', 'partitioned_month').show()
#tnps_nps.select('Respuesta').distinct().show()
#tnps_nps.select('Respuesta').groupby('Respuesta').count().sort('count').show()
tnps_nps = tnps_nps.filter('Respuesta != "ERROR"').withColumn('Respuesta_Num',
                                              when(tnps_nps.Respuesta.like('CERO'),   lit(0))
                                             .when(tnps_nps.Respuesta.like('UNO'),    lit(1))
                                             .when(tnps_nps.Respuesta.like('DOS'),    lit(2))
                                             .when(tnps_nps.Respuesta.like('TRES'),   lit(3))
                                             .when(tnps_nps.Respuesta.like('CUATRO'), lit(4))
                                             .when(tnps_nps.Respuesta.like('CINCO'),  lit(5))
                                             .when(tnps_nps.Respuesta.like('SEIS'),   lit(6))
                                             .when(tnps_nps.Respuesta.like('SIETE'),  lit(7))
                                             .when(tnps_nps.Respuesta.like('OCHO'),   lit(8))
                                             .when(tnps_nps.Respuesta.like('NUEVE'),  lit(9))
                                             .when(tnps_nps.Respuesta.like('DIEZ'),   lit(10)))
#tnps_nps.select('Respuesta_Num').groupby('Respuesta_Num').count().sort('count').show()
tnps_nps.select('partitioned_month').groupby('partitioned_month').count() \
        .sort('partitioned_month').show(50)

+-----------------+------+
|partitioned_month| count|
+-----------------+------+
|           201801|906071|
|           201802|753627|
|           201803|262604|
+-----------------+------+



Finally, pivot on VDN column, and create new column with the minimum VDN

In [68]:
#print 'Pivoting TNPS on VDN ...'
tnps_pivoted = tnps_nps.groupby('msisdn', 'partitioned_month') \
                        .pivot('VDN', values=vdns).min('Respuesta_Num')

#print 'Appending TNPS prefix to columns ...'
for c in tnps_pivoted.columns:
    if c not in ['msisdn', 'year', 'month', 'day', 'partitioned_month']:
        tnps_pivoted = tnps_pivoted.withColumnRenamed(c, 'TNPS_VDN_' + c)

min_vdn = udf(lambda row: min(filter(lambda x: x is not None, row)) if len(filter(lambda x: x is not None, row))>0 else None, IntegerType())
tnps_pivoted = tnps_pivoted.withColumn('min_VDN', min_vdn(struct([tnps_pivoted[x] for x in tnps_pivoted.columns if x.startswith('TNPS_VDN_')])))#.fillna(0)
# tnps_pivoted.show()
# tnps_pivoted.select('partitioned_month').groupby('partitioned_month').count().sort('partitioned_month').show(50)

tnps_pivoted = tnps_pivoted.withColumnRenamed('min_VDN', 'TNPS01')
tnps_pivoted = tnps_pivoted.withColumn('TNPS',
                         when(tnps_pivoted['TNPS01'].isin(10, 9), 'PROMOTER')
                        .when(tnps_pivoted['TNPS01'].isin(8, 7),  'NEUTRAL')
                        .when(tnps_pivoted['TNPS01'].isin(6, 5, 4, 3, 2, 1, 0), 'DETRACTOR'))
                        #.otherwise('False'))
tnps_pivoted = tnps_pivoted.withColumn('TNPS4',
                         when(tnps_pivoted['TNPS01'].isin(10, 9),      'PROMOTER')
                        .when(tnps_pivoted['TNPS01'].isin(8, 7),       'NEUTRAL')
                        .when(tnps_pivoted['TNPS01'].isin(6, 5, 4),    'SOFT DETRACTOR')
                        .when(tnps_pivoted['TNPS01'].isin(3, 2, 1, 0), 'HARD DETRACTOR'))
                        #.otherwise(False))
tnps_pivoted.groupby('partitioned_month').count().sort('partitioned_month').show(50)

+-----------------+------+
|partitioned_month| count|
+-----------------+------+
|           201801|861516|
|           201802|720718|
|           201803|254844|
+-----------------+------+



Add SEGMENTACION

In [69]:
# Add SEGMENTACION
if not 'master_by_msisdn' in globals():
    print 'Reading master_customers_services by MSISDN from HDFS'
    master_by_msisdn = spark.read.parquet('/tmp/bbergua/master_customers_services/msisdn/')

tnps_pivoted_segment = tnps_pivoted.join(master_by_msisdn, ['msisdn', 'partitioned_month'], 'left_outer')
tnps_pivoted_segment.groupby('partitioned_month').count().sort('partitioned_month').show(50)

# tnps_pivoted_segment.select('partitioned_month').groupby('partitioned_month').count().sort('partitioned_month').show(50)
# self.tnps = tnps_pivoted_segment.select(['msisdn', 'partitioned_month', 'TNPS01', 'TNPS', 'TNPS4'])
tnps = tnps_pivoted_segment

tnps.groupby(['partitioned_month', 'SEGMENTACION']).count().sort(['partitioned_month', 'count'], ascending=False).show()
tnps.select(['msisdn', 'partitioned_month', 'SEGMENTACION', 'TNPS01', 'TNPS', 'TNPS4']).show()

Reading master_customers_services by MSISDN from HDFS
+-----------------+------+
|partitioned_month| count|
+-----------------+------+
|           201801|861516|
|           201802|720718|
|           201803|254844|
+-----------------+------+

+-----------------+------------+------+
|partitioned_month|SEGMENTACION| count|
+-----------------+------------+------+
|           201803|        null|254844|
|           201802|        null|720718|
|           201801|        null|475252|
|           201801| Mobile-Only|322591|
|           201801|     Prepaid| 60071|
|           201801|  Convergent|  3602|
+-----------------+------------+------+

+---------+-----------------+------------+------+---------+--------------+
|   msisdn|partitioned_month|SEGMENTACION|TNPS01|     TNPS|         TNPS4|
+---------+-----------------+------------+------+---------+--------------+
|600026079|           201803|        null|     4|DETRACTOR|SOFT DETRACTOR|
|600032825|           201802|        null|     8|  NEUT

Finally, write TNPS data by MSISDN to HDFS

In [None]:
tnps.write.mode('overwrite').format('parquet').save('/tmp/bbergua/tnps/msisdn/')
subprocess.call('hdfs dfs -chmod -R o+rx /tmp/bbergua/tnps/msisdn/',  shell=True)
subprocess.call('hdfs dfs -chmod    o+r  /tmp/bbergua/tnps/msisdn/*', shell=True)

# Generate TNPS by NIF (client's id)

In [6]:
if not 'tnps' in globals():
    print 'Reading TNPS by MSISDN from HDFS'
    tnps = spark.read.parquet('/tmp/bbergua/tnps/msisdn/')

Reading TNPS by MSISDN from HDFS


Now, aggregate TNPS by client's id (NIF)

In [7]:
tnps_pivoted_segment = tnps
#nif_segment = tnps_pivoted.select('nif', 'partitioned_month', 'SEGMENTACION').groupby('nif', 'partitioned_month').agg(collect_list('SEGMENTACION').alias('SEGMENTACION'))
nif_segment = tnps_pivoted_segment.select('nif', 'partitioned_month', 'SEGMENTACION') \
    .groupby('nif', 'partitioned_month').agg(collect_set('SEGMENTACION').alias('SEGMENTACION'))
#nif_segment.show()
nif_segment.groupby('partitioned_month', 'SEGMENTACION').count() \
            .sort('partitioned_month', 'count', ascending=False).show()

+-----------------+--------------------+------+
|partitioned_month|        SEGMENTACION| count|
+-----------------+--------------------+------+
|           201803|                  []|     1|
|           201802|                  []|     1|
|           201801|        [Convergent]|208201|
|           201801|       [Mobile-Only]| 92707|
|           201801|           [Prepaid]| 45076|
|           201801|             [Other]|  9332|
|           201801|[Convergent, Prep...|   204|
|           201801|[Mobile-Only, Pre...|   126|
|           201801|    [Other, Prepaid]|    28|
|           201801|                  []|     1|
|           201712|        [Convergent]|183865|
|           201712|       [Mobile-Only]| 91861|
|           201712|           [Prepaid]| 37079|
|           201712|             [Other]|  8732|
|           201712|[Convergent, Prep...|   157|
|           201712|[Mobile-Only, Pre...|    98|
|           201712|    [Other, Prepaid]|    24|
|           201712|                  []|

In [8]:
nif_segment = nif_segment.na.drop(subset='nif')
nif_segment.groupby('partitioned_month', 'SEGMENTACION').count() \
            .sort('partitioned_month', 'count', ascending=False).show()

+-----------------+--------------------+------+
|partitioned_month|        SEGMENTACION| count|
+-----------------+--------------------+------+
|           201801|        [Convergent]|208201|
|           201801|       [Mobile-Only]| 92707|
|           201801|           [Prepaid]| 45076|
|           201801|             [Other]|  9332|
|           201801|[Convergent, Prep...|   204|
|           201801|[Mobile-Only, Pre...|   126|
|           201801|    [Other, Prepaid]|    28|
|           201712|        [Convergent]|183865|
|           201712|       [Mobile-Only]| 91861|
|           201712|           [Prepaid]| 37079|
|           201712|             [Other]|  8732|
|           201712|[Convergent, Prep...|   157|
|           201712|[Mobile-Only, Pre...|    98|
|           201712|    [Other, Prepaid]|    24|
|           201711|        [Convergent]|209407|
|           201711|       [Mobile-Only]|114014|
|           201711|           [Prepaid]| 45753|
|           201711|             [Other]|

In [9]:
nif_segment.printSchema()
#nif_segment.filter(array_contains(nif_segment.SEGMENTACION, 'Vodafone') & array_contains(nif_segment.SEGMENTACION, 'Prepaid')).show()
#nif_segment.filter(array_contains(nif_segment.SEGMENTACION, 'Vodafone') & array_contains(nif_segment.SEGMENTACION, 'Prepaid')).groupby('partitioned_month').count().sort('partitioned_month').show()
#nif_segment.where(size(col('SEGMENTACION')) >= 2).show()

root
 |-- nif: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- SEGMENTACION: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [10]:
nif_segment = nif_segment.withColumn('SEGMENTACION_Prepaid', 
                       when(array_contains(nif_segment['SEGMENTACION'], 'Prepaid'), 'Prepaid'))
nif_segment.where(size(col('SEGMENTACION')) >= 2).show()

+---------+-----------------+--------------------+--------------------+
|      nif|partitioned_month|        SEGMENTACION|SEGMENTACION_Prepaid|
+---------+-----------------+--------------------+--------------------+
|00961294V|           201704|[Convergent, Prep...|             Prepaid|
|01120010Q|           201709|    [Other, Prepaid]|             Prepaid|
|03915755X|           201609|[Convergent, Prep...|             Prepaid|
|06455634M|           201708|[Convergent, Prep...|             Prepaid|
|06617751N|           201701|    [Other, Prepaid]|             Prepaid|
|09251143L|           201708|[Mobile-Only, Pre...|             Prepaid|
|09279809T|           201609|[Convergent, Prep...|             Prepaid|
|12578367C|           201612|[Convergent, Prep...|             Prepaid|
|18755519G|           201610|[Convergent, Prep...|             Prepaid|
|20594773E|           201609|[Convergent, Prep...|             Prepaid|
|23846081A|           201704|[Mobile-Only, Pre...|             P

In [11]:
#def drop_prepaid_from_array__(arr):
#    if len(arr) == 1:
#        return arr[0]
#    elif len(arr) > 1:
#        arr.remove('Prepaid')
#        return arr[0]
#    return None
def drop_prepaid_from_array_(arr):
    if len(arr) > 1:
        arr.remove('Prepaid')
        return arr[0]
    return arr

drop_prepaid_from_array = udf(drop_prepaid_from_array_, StringType())
#nif_segment = nif_segment.withColumn('SEGMENTACION_', drop_prepaid_from_array('SEGMENTACION'))
nif_segment = nif_segment.withColumn('SEGMENTACION_', 
                                     when(size(col('SEGMENTACION')) == 1, col('SEGMENTACION').getItem(0))\
                                    .when(size(col('SEGMENTACION')) >= 1, drop_prepaid_from_array('SEGMENTACION'))\
                                    .otherwise(None))

In [12]:
#nif_segment.show()
#nif_segment.where(col('SEGMENTACION') == None).show()
#nif_segment.where(size(col('SEGMENTACION')) <= 0).show()
nif_segment.where(size(col('SEGMENTACION')) == 1).show()
nif_segment.where(size(col('SEGMENTACION')) >= 2).show()

+---------+-----------------+-------------+--------------------+-------------+
|      nif|partitioned_month| SEGMENTACION|SEGMENTACION_Prepaid|SEGMENTACION_|
+---------+-----------------+-------------+--------------------+-------------+
|00000707Y|           201710|[Mobile-Only]|                null|  Mobile-Only|
|00004188P|           201710|    [Prepaid]|             Prepaid|      Prepaid|
|00005932U|           201702|      [Other]|                null|        Other|
|00007852K|           201704| [Convergent]|                null|   Convergent|
|00008261U|           201704|[Mobile-Only]|                null|  Mobile-Only|
|00008549G|           201707| [Convergent]|                null|   Convergent|
|00018304R|           201609|    [Prepaid]|             Prepaid|      Prepaid|
|00022294W|           201608|[Mobile-Only]|                null|  Mobile-Only|
|00029868F|           201707|[Mobile-Only]|                null|  Mobile-Only|
|00030568Q|           201706| [Convergent]|         

In [13]:
nif_segment = nif_segment.drop('SEGMENTACION').withColumnRenamed('SEGMENTACION_', 'SEGMENTACION')
nif_segment.show()

+---------+-----------------+--------------------+------------+
|      nif|partitioned_month|SEGMENTACION_Prepaid|SEGMENTACION|
+---------+-----------------+--------------------+------------+
|00000707Y|           201710|                null| Mobile-Only|
|00004188P|           201710|             Prepaid|     Prepaid|
|00005932U|           201702|                null|       Other|
|00007852K|           201704|                null|  Convergent|
|00008261U|           201704|                null| Mobile-Only|
|00008549G|           201707|                null|  Convergent|
|00018304R|           201609|             Prepaid|     Prepaid|
|00022294W|           201608|                null| Mobile-Only|
|00029868F|           201707|                null| Mobile-Only|
|00030568Q|           201706|                null|  Convergent|
|00038161V|           201703|                null|  Convergent|
|00038737X|           201705|                null| Mobile-Only|
|00040124H|           201708|           

In [18]:
tnps_pivoted_by_id = tnps_pivoted_segment.groupby('nif', 'partitioned_month').min()
tnps_pivoted_by_id = fix_column_names(tnps_pivoted_by_id)
#tnps_pivoted_by_id.printSchema()
tnps_pivoted_by_id = tnps_pivoted_by_id.withColumnRenamed('min_TNPS01', 'TNPS01')
tnps_pivoted_by_id = tnps_pivoted_by_id.withColumn('TNPS',
                     when(tnps_pivoted_by_id['TNPS01'].isin(10, 9), 'PROMOTER')
                    .when(tnps_pivoted_by_id['TNPS01'].isin(8, 7),  'NEUTRAL')
                    .when(tnps_pivoted_by_id['TNPS01'].isin(6, 5, 4, 3, 2, 1, 0), 'DETRACTOR'))
                    #.otherwise('False'))
tnps_pivoted_by_id = tnps_pivoted_by_id.withColumn('TNPS4',
                     when(tnps_pivoted_by_id['TNPS01'].isin(10, 9),      'PROMOTER')
                    .when(tnps_pivoted_by_id['TNPS01'].isin(8, 7),       'NEUTRAL')
                    .when(tnps_pivoted_by_id['TNPS01'].isin(6, 5, 4),    'SOFT DETRACTOR')
                    .when(tnps_pivoted_by_id['TNPS01'].isin(3, 2, 1, 0), 'HARD DETRACTOR'))
                    #.otherwise(False))
tnps_pivoted_by_id.select(['nif', 'partitioned_month', 'TNPS01', 'TNPS', 'TNPS4']).show()#tnps_pivoted_by_id.printSchema()
#tnps_pivoted_by_id.printSchema()

+---------+-----------------+------+---------+--------------+
|      nif|partitioned_month|TNPS01|     TNPS|         TNPS4|
+---------+-----------------+------+---------+--------------+
|13305780J|           201611|     0|DETRACTOR|HARD DETRACTOR|
|61492319I|           201710|    10| PROMOTER|      PROMOTER|
|34273028K|           201610|     8|  NEUTRAL|       NEUTRAL|
|22409905A|           201702|     0|DETRACTOR|HARD DETRACTOR|
|27046725H|           201609|     7|  NEUTRAL|       NEUTRAL|
|02678627O|           201704|    10| PROMOTER|      PROMOTER|
|K3867172I|           201705|     9| PROMOTER|      PROMOTER|
|95242039Q|           201609|    10| PROMOTER|      PROMOTER|
|Z9347531A|           201612|     9| PROMOTER|      PROMOTER|
|15421375Y|           201706|     8|  NEUTRAL|       NEUTRAL|
|19175986Q|           201706|     0|DETRACTOR|HARD DETRACTOR|
|25924404R|           201609|    10| PROMOTER|      PROMOTER|
|59424407P|           201705|    10| PROMOTER|      PROMOTER|
|7172209

Join with SEGMENTACION, previously calculated

In [19]:
tnps_pivoted_by_id = tnps_pivoted_by_id.join(nif_segment, ['nif', 'partitioned_month'], 'left_outer')
tnps_pivoted_by_id.select(['nif', 'partitioned_month', 'SEGMENTACION_Prepaid', 'SEGMENTACION', 'TNPS01', 'TNPS', 'TNPS4']).show(10)
#tnps_pivoted_by_id.printSchema()

+---------+-----------------+--------------------+------------+------+---------+--------------+
|      nif|partitioned_month|SEGMENTACION_Prepaid|SEGMENTACION|TNPS01|     TNPS|         TNPS4|
+---------+-----------------+--------------------+------------+------+---------+--------------+
|00000707Y|           201710|                null| Mobile-Only|    10| PROMOTER|      PROMOTER|
|00004188P|           201710|             Prepaid|     Prepaid|     7|  NEUTRAL|       NEUTRAL|
|00005932U|           201702|                null|       Other|     5|DETRACTOR|SOFT DETRACTOR|
|00007852K|           201704|                null|  Convergent|    10| PROMOTER|      PROMOTER|
|00008261U|           201704|                null| Mobile-Only|     8|  NEUTRAL|       NEUTRAL|
|00008549G|           201707|                null|  Convergent|     9| PROMOTER|      PROMOTER|
|00018304R|           201609|             Prepaid|     Prepaid|     8|  NEUTRAL|       NEUTRAL|
|00022294W|           201608|           

In [20]:
tnps_pivoted_by_id.filter('SEGMENTACION == NULL').count()

0

Finally, write TNPS data by Id (NIF) to HDFS

In [21]:
tnps_pivoted_by_id.write.mode('overwrite').format('parquet').save('/tmp/bbergua/tnps/id/')
subprocess.call('hdfs dfs -chmod -R o+rx /tmp/bbergua/tnps/id/',  shell=True)
subprocess.call('hdfs dfs -chmod    o+r  /tmp/bbergua/tnps/id/*', shell=True)

0