In [4]:
from common.src.main.python.utils.hdfs_generic import *
import os

# --- Spark sizing / submission parameters -----------------------------------
MAX_N_EXECUTORS = 15           # spark.dynamicAllocation.maxExecutors
MIN_N_EXECUTORS = 1            # spark.dynamicAllocation.minExecutors
N_CORES_EXECUTOR = 4           # only used to derive MEMORY_OVERHEAD in this cell
EXECUTOR_IDLE_MAX_TIME = 120   # spark.dynamicAllocation.executorIdleTimeout
EXECUTOR_MEMORY = '32g'
DRIVER_MEMORY = '16g'
N_CORES_DRIVER = 1             # NOTE(review): never passed to spark-submit in this cell
MEMORY_OVERHEAD = N_CORES_EXECUTOR * 2048  # NOTE(review): computed but never passed to spark-submit
QUEUE = "root.datascience.normal"
BDA_CORE_VERSION = "1.0.0"     # NOTE(review): not referenced in this cell

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Property files shipped with the job through --files (bda-core-ra codebase).
_NODE_PROPERTY_FILES = (
    "nodes.properties",
    "nodes-de.properties",
    "nodes-es.properties",
    "nodes-ie.properties",
    "nodes-it.properties",
    "nodes-pt.properties",
    "nodes-uk.properties",
)

# The option string starts from whatever SPARK_COMMON_OPTS already holds in the
# environment and appends this notebook's settings, fragment by fragment.
# NOTE(review): the result is written back to the same environment variable
# below, so re-running this cell in a live kernel appends every option again.
_option_fragments = [
    os.environ.get('SPARK_COMMON_OPTS', ''),
    " --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY),
    " --conf spark.shuffle.manager=tungsten-sort",
    "  --queue %s" % QUEUE,
    # Dynamic allocation configuration
    " --conf spark.dynamicAllocation.enabled=true",
    " --conf spark.shuffle.service.enabled=true",
    " --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS),
    " --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS),
    " --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME),
    " --conf spark.port.maxRetries=100",
    # Attach bda-core-ra property files
    " --files " + ",".join(
        "{}/scripts/properties/red_agent/{}".format(BDA_ENV, _name)
        for _name in _NODE_PROPERTY_FILES),
]
SPARK_COMMON_OPTS = "".join(_option_fragments)

# Publish the options through the environment; PYSPARK_SUBMIT_ARGS is the same
# string followed by the "pyspark-shell" token.
os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

# Function-call form of print: same output on Python 2, and valid on Python 3.
print(os.environ.get('SPARK_COMMON_OPTS', ''))
print(os.environ.get('PYSPARK_SUBMIT_ARGS', ''))

# Create the Spark entry points through the project helper `run_sc`
# (presumably provided by the star import from hdfs_generic — TODO confirm).
sc, sparkSession, sqlContext = run_sc()
# Function-call form of print: same output on Python 2, and valid on Python 3.
print(sc.defaultParallelism)


 --master yarn --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.kryoserializer.buffer.max=1g --py-files /var/SP/data/home/adesant3/artifacts/bda-core-ra-complete-assembly-2.0.0.jar,/var/SP/data/home/adesant3/artifacts/common.zip,/var/SP/data/home/adesant3/artifacts/graphframes.zip,/var/SP/data/home/adesant3/artifacts/scripts.zip,/var/SP/data/home/adesant3/artifacts/xgboost4j-spark-2.1.1-0.7-jar-with-dependencies.jar --files /var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-de.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-es.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-ie.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-it.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-pt.properties,/var/SP/data/home/adesant3/scripts/properties/red_agent/nodes-uk.properties,/var/SP/data

In [5]:
# Spark utils
from pyspark.sql.functions import (udf, col, decode, when, lit, lower, 
                                   translate, count, sum as sql_sum, max as sql_max, 
                                   isnull,
                                   substring, size, length,
                                   desc)
from pyspark.sql.types import DoubleType, StringType, IntegerType

In [6]:
# Build the Hive-enabled session that the rest of the notebook uses as `spark`.
# NOTE(review): `run_sc()` above already returned a `sparkSession`; presumably
# `getOrCreate()` reuses it — confirm the appName/config set here take effect.
_sessionBuilder = SparkSession.builder.appName("Maps from Netperform")
_sessionBuilder = _sessionBuilder.master("yarn")
_sessionBuilder = _sessionBuilder.config("spark.submit.deployMode", "client")
_sessionBuilder = _sessionBuilder.config("spark.ui.showConsoleProgress", "true")
spark = _sessionBuilder.enableHiveSupport().getOrCreate()

In [7]:
# Delivery parameters: everything below is derived from `nameMonth`.
nameMonth = '201805'
# Month values kept when filtering the source tables.
# NOTE(review): the filters in this notebook use `month` only (no `year`).
monthsHistoricData = [2, 3, 4]

# Hive tables before / after de-anonymisation.
_tableTemplate = 'tests_es.ads_tmp_{}_maps_{}'
nameTableAnonymized = _tableTemplate.format(nameMonth, 'notprepared')
nameTableDeanonymized = _tableTemplate.format(nameMonth, 'prepared')

# CSV export path and name of the final delivery file.
nameFileHDFS = '{}_hdfs_maps.csv'.format(nameMonth)
nameFileDelivery = 'delivery_{}_maps.csv'.format(nameMonth)

In [8]:
# NetPerform raw table restricted to the months listed in `monthsHistoricData`.
_inHistoricMonths = col("month").isin(monthsHistoricData)
netPerform = spark.read.table("raw_es.netperform_1_1").where(_inHistoricMonths)

In [None]:
%%capture output

# Show the number of NetPerform rows per month. The %%capture cell magic stores
# what this cell prints in `output` instead of rendering it.
# NOTE(review): `output` is reassigned further down by `p.communicate(...)`.
netPerform.groupBy('month').count().show()

In [13]:
# NOTE(review): scratch assignment; `a` is not referenced anywhere else in the
# visible notebook — candidate for removal.
a = 3

# Extract information from NetPerform in relation to maps apps consumption

## Read Data from NetPerform

First of all, I keep the information concerning maps per user (id_client). There are three different types of information attributes that are relevant:
- **Application Starts Hourly:** Number of times, per hour, a user has started an application. It can also be regarded as the number of times a user has entered the application. The *value* associated with this attribute is a counter (integer).
    - The hourly attribute is used (and not the per-day one) because the per-day attribute is almost empty.
- **Application Usage Time Hourly:** Number of minutes, per hour, a user has been actively using an application. The *value* associated with this attribute is in minutes.
    - The hourly attribute is used (and not the per-day one) because the per-day attribute is almost empty.
- **App Data Traffic - Daily (DL+UL):** Amount of data, in kb, both downloaded and uploaded when using this application. The *value* associated with this attribute is in kb.

In addition, the apps associated with maps are:
- Google Maps
- Waze
- Google Earth
- Coyote
- TomTom

In [6]:
# Measurement types kept: hourly starts, hourly usage time and daily traffic.
_measurementType = col('measurement_type_name')
_isRelevantMeasurement = (_measurementType.like("%Application Starts Hourly%") |
                          _measurementType.like("%Application Usage Time Hourly%") |
                          _measurementType.like("%App Data Traffic - Daily (DL+UL)%"))

# App identifiers treated as maps apps. Note that '%maps%' keeps every
# identifier containing the substring "maps".
_appIdentifier = col('app_identifier')
_isMapsApp = (_appIdentifier.like('%maps%') |
              _appIdentifier.like('%com.waze%') |
              _appIdentifier.like('%com.google.earth%') |
              _appIdentifier.like('%coyotesystems%') |
              _appIdentifier.like('%com.tomtom%'))

mapsUsage = (netPerform
             .select('year', 'month', 'day', 'id_client',
                     'measurement_type_name', 'app_identifier', 'value')
             .where(_isRelevantMeasurement)
             .where(_isMapsApp))

For each *measurement_type_name*, values are accumulated with the *sum* operator. The idea is to *cumsum* all the values (without taking averages or counting how many values were added up).

In [7]:
# Obtain the aggregation per customer, year, month, day and type of measurement.
# At this moment, the app information is lost as we are making no distinction among the apps.
# One row per (customer, year, month, day, measurement type) holding the summed
# `value`. The app identifier is dropped here: all maps apps are pooled together.
_usageGroupingKeys = ['id_client', 'year', 'month', 'day', 'measurement_type_name']
mapsUsagePerCustomerAndDay = (
    mapsUsage
    .groupBy(_usageGroupingKeys)
    .sum('value')
    .withColumnRenamed('sum(value)', 'SumValuePerDay')
)

## Read Netperform Mapping Table 

From the table *raw_es.netperform_mapping*, we will only keep the columns *msisdn* and *client_id* in order to join with the *netperform* table.

In [8]:
# Load data from netperform mapping, taking only the needed attributes (especially msisdn)
# NetPerform mapping table (client_id -> msisdn), reduced to the columns needed
# for the join and to the same months as the usage data.
_mappingColumns = ['year', 'month', 'day', 'client_id', 'msisdn']
netPerformMapping = (
    spark.read.table('raw_es.netperform_mapping')
    .select(*_mappingColumns)
    .where(col('month').isin(monthsHistoricData))
)

Now, let's join both tables so that it is possible to connect information from netperform with customerbase (msisdn).

In [9]:
# Alias are used to avoid duplicates names when joining two tables
# Give both sides of the upcoming join an explicit alias so that columns sharing
# a name (year, month, day) can be addressed as '<alias>.<column>'.
alias_mapsUsagePerCustomerAndDay, alias_netPerformMapping = (
    mapsUsagePerCustomerAndDay.alias('mapsUsagePerCustomerAndDay'),
    netPerformMapping.alias('netPerformMapping'),
)

In [10]:
# Join netperform_mapping with data from netperform so that at this moment the data is at a msisdn level.
# Attach the msisdn to every usage row: inner join on client id and on the
# (year, month, day) of the record.
_usageSide = alias_mapsUsagePerCustomerAndDay
_mappingSide = alias_netPerformMapping
_sameClientAndDate = ((_usageSide.id_client == _mappingSide.client_id) &
                      (_usageSide.year == _mappingSide.year) &
                      (_usageSide.month == _mappingSide.month) &
                      (_usageSide.day == _mappingSide.day))

# Columns kept after the join, addressed through the aliases.
_msisdnLevelColumns = ['netPerformMapping.msisdn',
                       'mapsUsagePerCustomerAndDay.SumValuePerDay',
                       'mapsUsagePerCustomerAndDay.measurement_type_name',
                       'mapsUsagePerCustomerAndDay.month',
                       'mapsUsagePerCustomerAndDay.day']

mapsUsagePerMsisdn = (_usageSide
                      .join(_mappingSide, _sameClientAndDate, how='inner')
                      .select(*_msisdnLevelColumns))

Another alternative to drop duplicates may be just to drop every single column.

In [None]:
# Pivot the table so that the measurement_type_name are columns.
# Turn each measurement_type_name into its own column, one row per
# (month, day, msisdn).
# TODO (original author): the sum may be redundant if every pivot cell holds a
# single value.
_pivotedPerDay = (mapsUsagePerMsisdn
                  .groupBy('month', 'day', 'msisdn')
                  .pivot('measurement_type_name')
                  .agg(sql_sum('SumValuePerDay')))

# Missing measurements for a given day are replaced by the default value 1.
mapsUsagePerMsisdnPivot = _pivotedPerDay.fillna(1)

The former cell is very important: by aggregating by month, day and msisdn, it is possible to include a default value instead of null. That value is 1. If the aggregation had been done in a single command (aggregating directly at the *msisdn* level), the *na.fill* command would have filled with 1 at the *msisdn* level.

In [None]:
# Aggregate all the information at a msisdn level, forgetting about month and day.
mapsUsagePerMsisdnDelivery = \
(mapsUsagePerMsisdnPivot.groupBy('msisdn').sum('App Data Traffic - Daily (DL+UL)',
                                                      'Application Starts Hourly',
                                                      'Application Usage Time Hourly'
                                                     ))

In [None]:
# Number of rows (one per msisdn) in the aggregated delivery data.
mapsUsagePerMsisdnDelivery.count()

According to the documentation, the function *withColumn(colName, col)* returns a new DataFrame by adding a column or replacing the existing column that has the same name. Thus, it is possible to use the same name.

In [14]:
# Friendly names for the aggregated measurement columns.
_totalColumnNames = [('sum(App Data Traffic - Daily (DL+UL))', 'Total_data_kb'),
                     ('sum(Application Starts Hourly)', 'Total_starts'),
                     ('sum(Application Usage Time Hourly)', 'Total_time')]

# Replace msisdn by the 9 characters starting at position 3 (withColumn with an
# existing name overwrites that column).
_deliveryRows = mapsUsagePerMsisdnDelivery.withColumn('msisdn', substring('msisdn', 3, 9))
for _oldName, _newName in _totalColumnNames:
    _deliveryRows = _deliveryRows.withColumnRenamed(_oldName, _newName)

# Keep only rows whose trimmed msisdn is present and exactly 9 characters long.
_deliveryRows = _deliveryRows.where(col('msisdn').isNotNull())
_deliveryRows = _deliveryRows.where(length(col('msisdn')) == 9)

# Persist as a parquet table, replacing any previous content.
_deliveryRows.write.format('parquet').mode('overwrite').saveAsTable(nameTableAnonymized)

In [15]:
from subprocess import Popen, PIPE

In [23]:
# Run the external de-anonymisation script: it reads `nameTableAnonymized` and
# writes `nameTableDeanonymized`, mapping msisdn through DE_MSISDN.
_deanonymizeCommand = ['sh',
                       '/home/jsotovi2/desanonimizar.sh',
                       '--fields',
                       'msisdn=DE_MSISDN',
                       '--overwrite',
                       nameTableAnonymized,
                       nameTableDeanonymized]
p = Popen(_deanonymizeCommand, stdin=PIPE, stdout=PIPE, stderr=PIPE)

# The script asks for confirmation ([s/n]) before overwriting the target table;
# answer "s". A bytes literal is accepted by communicate() on Python 2 and 3.
output, err = p.communicate(input=b"s\n")
rc = p.returncode

# Fail loudly: otherwise the next cells would silently read a stale table.
if rc != 0:
    raise RuntimeError("desanonimizar.sh exited with code %s: %s" % (rc, err))

In [24]:
# Display the stdout captured from the de-anonymisation script.
output

'[info] Unravel Sensor 4.2.1056/1.3.9-unravel-1705 initializing.\n\xc2\xbfEst\xc3\xa1s seguro de que quieres sobreescribir la tabla tests_es.ads_tmp_201804_maps_prepared? [s/n]: \xc2\xa1Completado!\n'

In [25]:
# Display the stderr captured from the de-anonymisation script.
err



In [26]:
# Load the table produced by the de-anonymisation script.
mapsDeliveryTable = spark.read.table(nameTableDeanonymized)

In [27]:
# Print the column names and types of the de-anonymised table.
mapsDeliveryTable.printSchema()

root
 |-- msisdn_anon: string (nullable = true)
 |-- Total_data_kb: double (nullable = true)
 |-- Total_starts: double (nullable = true)
 |-- Total_time: double (nullable = true)
 |-- msisdn: string (nullable = true)



In [28]:
# Export the de-anonymised table as CSV under the path `nameFileHDFS`.
# No `header` or `mode` option is passed here.
# NOTE(review): presumably this fails when the path already exists (e.g. on a
# re-run) — confirm, or pass an explicit mode.
mapsDeliveryTable.write.csv(nameFileHDFS)

In [34]:
# Copy the exported CSV directory from HDFS to the local download folder.
p = Popen(['hadoop',
           'fs',
           '-copyToLocal',
           nameFileHDFS,
           '/var/SP/data/home/adesant3/data/download/.'],
          stdin=PIPE, stdout=PIPE, stderr=PIPE)

# Wait for the copy to finish before going on: the process was previously
# started and never awaited, so the following shell step could run against a
# partial (or missing) download and a failed copy went unnoticed.
copyOutput, copyErr = p.communicate()
if p.returncode != 0:
    raise RuntimeError("hadoop fs -copyToLocal exited with code %s: %s"
                       % (p.returncode, copyErr))

In [35]:
# Concatenate the downloaded CSV part files into the single delivery file.
# The export (`mapsDeliveryTable.write.csv(nameFileHDFS)`) passes no header
# option, so the part files hold data rows only; the former `sed 1d` therefore
# deleted the first data record. Plain concatenation keeps every row.
_downloadedParts = '/var/SP/data/home/adesant3/data/download/' + nameFileHDFS + '/*.csv'
_deliveryPath = '/var/SP/data/home/adesant3/data/delivery/' + nameFileDelivery

# With shell=True the command is passed as one string (the shell expands the
# glob and performs the redirection).
p = Popen('cat ' + _downloadedParts + ' > ' + _deliveryPath,
          stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)

In [36]:
# Wait for the process held in `p` to finish and collect its stdout / stderr.
output, err = p.communicate()

In [37]:
# Display the stdout collected from the last `p.communicate()` call.
output

''

In [38]:
# Display the stderr collected from the last `p.communicate()` call.
err

''