# Table of Contents
* [Preamble](#Preamble)
* [Load TNPS data by MSISDN](#Load-TNPS-data-by-MSISDN)
* [Load TNPS data by NIF](#Load-TNPS-data-by-NIF)
* [Load Call Centre Calls data by MSISDN](#Load-Call-Centre-Calls-data-by-MSISDN)
* [Load Call Centre Calls data by NIF](#Load-Call-Centre-Calls-data-by-NIF)
* [TNPS Oracle stack](#TNPS-Oracle-stack)
    * [Oracle Prepaid segment](#Oracle-Prepaid-segment)
        * [Oracle Prepaid segment - Data Preparation](#Oracle-Prepaid-segment---Data-Preparation)
        * [Oracle Prepaid segment - Training](#Oracle-Prepaid-segment---Training)
        * [Oracle Prepaid segment - Predictions](#Oracle-Prepaid-segment---Predictions)
    * [Oracle Non-Prepaid segments](#Oracle-Non-Prepaid-segments)
        * [Load Oracle Postpaid](#Load-Oracle-Postpaid)
        * [Oracle Non-Prepaid segment - Data Preparation](#Oracle-Non-Prepaid-segment---Data-Preparation)
        * [Oracle Non-Prepaid segment - Training](#Oracle-Non-Prepaid-segment---Training)
        * [Oracle Non-Prepaid segment - Predictions](#Oracle-Non-Prepaid-segment---Predictions)
    * [Write predictions to HDFS](#Write-predictions-to-HDFS)

# Preamble

In [1]:
'''
# Run this on the shell to start Jupyter Notebook

kinit ${USER}@INTERNAL.VODAFONE.COM -k -t .ssh/${USER}.keytab
klist

export BDA_USER_HOME=/var/SP/data/home/${USER}

. ${BDA_USER_HOME}/scripts/properties/common.sh
cd ${BDA_USER_HOME}

ipython notebook --port=65011 --no-browser --ip=127.0.0.1
'''

'\n# Run this on the shell to start Jupyter Notebook\nkinit ${USER}@INTERNAL.VODAFONE.COM -k -t .ssh/${USER}.keytab\nklist\n\nexport BDA_USER_HOME=/var/SP/data/home/${USER}\n\n. ${BDA_USER_HOME}/scripts/properties/common.sh\ncd ${BDA_USER_HOME}\n\nipython notebook --port=65011 --no-browser --ip=127.0.0.1\n'

In [1]:
# NOTE(review): `run_sc` (used at the end of this cell) and `SparkSession` (used in the
# next cell) are never imported explicitly in the visible notebook; presumably they are
# provided by this star import - confirm and switch to explicit imports.
from common.src.main.python.utils.hdfs_generic import *
import os

# Spark resource settings interpolated into SPARK_COMMON_OPTS below.
MAX_N_EXECUTORS=15
MIN_N_EXECUTORS=1
N_CORES_EXECUTOR=4             # only used to derive MEMORY_OVERHEAD in this cell
EXECUTOR_IDLE_MAX_TIME=120
EXECUTOR_MEMORY='16g'
DRIVER_MEMORY='16g'
# NOTE(review): N_CORES_DRIVER, MEMORY_OVERHEAD and BDA_CORE_VERSION are defined here but
# are not referenced again in this cell, so they do not reach the Spark options built below.
N_CORES_DRIVER=1
MEMORY_OVERHEAD=N_CORES_EXECUTOR*2048
QUEUE="root.datascience.normal"
BDA_CORE_VERSION="1.0.0"

# The default directory is chosen from a platform-dependent list, but the user of the
# application can control the directory location by setting the TMPDIR, TEMP or TMP
# environment variables.
# TMPDIR is pointed at the user's home before tempfile.gettempdir() is called below,
# so the order of these statements matters.
USER=os.environ.get('USER', '')
os.environ['TMPDIR'] = '/var/SP/data/home/'+USER+'/tmp/'
#os.environ['TEMP'] = '/var/SP/data/home/'+USER+'/tmp/'
#os.environ['TMP'] = '/var/SP/data/home/'+USER+'/tmp/'
import tempfile
print 'Default Temp dir:', tempfile.gettempdir() # prints the current temporary directory

# Start from whatever SPARK_COMMON_OPTS the shell environment already exports and
# append this notebook's options to it.
SPARK_COMMON_OPTS=os.environ.get('SPARK_COMMON_OPTS', '')
SPARK_COMMON_OPTS+=" --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY)
SPARK_COMMON_OPTS+=" --conf spark.port.maxRetries=500"
SPARK_COMMON_OPTS+=" --conf spark.shuffle.manager=tungsten-sort"
SPARK_COMMON_OPTS+=" --queue %s" % QUEUE

# Dynamic allocation configuration
SPARK_COMMON_OPTS+=" --conf spark.shuffle.service.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME)

# Base directory of the user's BDA checkout; an empty string if BDA_USER_HOME is not set,
# in which case the --files paths below become absolute paths starting at "/scripts/...".
BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Ship the red_agent node properties files with the job (--files).
# The string has seven "{}" placeholders, each filled with BDA_ENV.
SPARK_COMMON_OPTS+=" --files \
{}/scripts/properties/red_agent/nodes.properties,\
{}/scripts/properties/red_agent/nodes-de.properties,\
{}/scripts/properties/red_agent/nodes-es.properties,\
{}/scripts/properties/red_agent/nodes-ie.properties,\
{}/scripts/properties/red_agent/nodes-it.properties,\
{}/scripts/properties/red_agent/nodes-pt.properties,\
{}/scripts/properties/red_agent/nodes-uk.properties".format(*[BDA_ENV]*7)

# Publish the final option string through the environment before the context is created.
os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

print os.environ.get('SPARK_COMMON_OPTS', '')
print os.environ.get('PYSPARK_SUBMIT_ARGS', '')

# run_sc() returns the SparkContext, a SparkSession and a SQLContext (in that order).
sc, sparkSession, sqlContext = run_sc()
print sc.defaultParallelism

Default Temp dir: /var/SP/data/home/bbergua/tmp
 --queue root.datascience.normal  --conf spark.port.maxRetries=50  --conf spark.network.timeout=10000000  --conf spark.executor.heartbeatInterval=60  --conf spark.yarn.executor.memoryOverhead=2G  --conf spark.sql.broadcastTimeout=1200  --master yarn --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.kryoserializer.buffer.max=1g --py-files /var/SP/data/home/bbergua/artifacts/bda-core-ra-complete-assembly-2.0.0.jar,/var/SP/data/home/bbergua/artifacts/common.zip,/var/SP/data/home/bbergua/artifacts/graphframes.zip,/var/SP/data/home/bbergua/artifacts/scripts.zip,/var/SP/data/home/bbergua/artifacts/xgboost4j-spark-2.1.1-0.7-jar-with-dependencies.jar --files /var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-de.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-es.properties,/var/SP/data/home/bbergua/scripts/properties/red_agent/nodes-ie.properties,/var/SP/data/home/bbergua/scripts/prop

In [2]:
spark = (SparkSession.builder
         .appName("VF-ES Oracle TNPS model")
         .master("yarn")
         .config("spark.submit.deployMode", "client")
         .config("spark.ui.showConsoleProgress", "true")
         .enableHiveSupport()
         .getOrCreate()
         )
print 'Spark version:', spark.version

Spark version: 2.1.0.cloudera1


In [100]:
#spark.stop()

In [3]:
import h2o
from h2o.backend.connection import H2OConnectionError
#if h2o.cluster() is not None:
#    print 'Stopping H2O'
#    h2o.cluster().shutdown()
#if h2o.cluster() is not None:
#    print 'Stopping H2O'
#    h2o.cluster().shutdown()
try:
    #h2o.cluster().show_status()
    h2o.cluster().shutdown()
    print 'Stopped running H2O instance'
except H2OConnectionError as e:
    print e
except:
    pass

In [4]:
import h2o

# Start (or attach to) the local H2O server.
# nthreads=-1 places no explicit cap on the number of cores; the JVM heap is limited to 14G.
# Larger heaps (28G-56G) were tried in earlier revisions of this cell.
h2o.init(max_mem_size="14G", nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_79"; Java(TM) SE Runtime Environment (build 1.7.0_79-b15); Java HotSpot(TM) 64-Bit Server VM (build 24.79-b02, mixed mode)
  Starting server from /opt/cloudera/parcels/Anaconda-2.5.0/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/SP/data/home/bbergua/tmp/tmpkz3eP2
  JVM stdout: /var/SP/data/home/bbergua/tmp/tmpkz3eP2/h2o_bbergua_started_from_python.out
  JVM stderr: /var/SP/data/home/bbergua/tmp/tmpkz3eP2/h2o_bbergua_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.16.0.2
H2O cluster version age:,4 months and 10 days !!!
H2O cluster name:,H2O_from_python_bbergua_t0ami7
H2O cluster total nodes:,1
H2O cluster free memory:,12.44 Gb
H2O cluster total cores:,56
H2O cluster allowed cores:,56
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [None]:
import h2o
from h2o.backend.connection import H2OConnectionError
try:
    h2o.cluster().show_status()
except H2OConnectionError as e:
    print e

In [5]:
# Configuration shared by the loading, data-preparation and training cells below
month = '201712' # Compared against the partitioned_month column when loading data; None disables the month filter
label_preds = 'TNPS' # This is the label that we want for output
label_model = 'TNPS2DET' # This is the label used for training the model, and making predictions
#label_model = 'TNPS2PRO'
#filter_cond = 'TNPS4 == "PROMOTER" OR TNPS4 == "HARD DETRACTOR"'
filter_cond = None # Optional row filter (SQL expression string); NOTE(review): not passed to any call in the visible cells

In [6]:
import re
import subprocess
import sys
import time
# Spark utils
from pyspark.sql.functions import array_contains, bround, col, collect_set, concat, lit, lpad, size, struct, trim, udf, when
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

Define some useful functions

In [7]:
# Replace column names of the type 'fun(colname)' by 'fun_colname'
# Also replace any character not in [a-zA-Z0-9_.] with '_'
def fix_column_names(df):
    """Return `df` with column names restricted to the characters [a-zA-Z0-9_.].

    For every column the new name is computed in two steps and applied with a
    single rename:

    1. If the name contains a 'fun(colname)' pattern, it becomes 'fun_colname'.
       Only the first match found by re.search is used and any text outside
       that match is discarded (same as the previous implementation).
    2. Every remaining character outside [a-zA-Z0-9_.] is replaced by '_'.

    Previously step 2 was computed from the original name and applied as a
    second rename of a column that step 1 had already renamed, so names such
    as 'count(DISTINCT id)' ended up as 'count_DISTINCT id' with the invalid
    character still in place.

    :param df: object exposing `schema.names` and `withColumnRenamed(old, new)`
               (a Spark DataFrame in this notebook).
    :return: the DataFrame with the renamed columns; `df` itself if nothing changed.
    """
    function_call = re.compile(r'([^()]*)\(([^()]*)\)')
    invalid_chars = re.compile(r'[^a-zA-Z0-9_.]')

    for old_name in df.schema.names:
        new_name = old_name

        found = function_call.search(old_name)
        if found is not None:
            new_name = found.group(1) + '_' + found.group(2)

        # Sanitise the name produced by the previous step, not the original one
        new_name = invalid_chars.sub('_', new_name)

        if new_name != old_name:
            df = df.withColumnRenamed(old_name, new_name)

    return df

# Add a prefix to column names of a Dataset
def add_prefix_columns(df, prefix):
    print 'Appending', prefix, 'prefix to columns ...'
    for c in df.columns:
        if c not in ['msisdn', 'nif', 'SEGMENTACION', 'year', 'month', 'day', 'partitioned_month']:
            df = df.withColumnRenamed(c, prefix + '_' + c)
    
    return df

In [8]:
def balance_train_data(all_train_pred_df, label='label'):
    if 'PRED' in all_train_pred_df.columns:
        all_train_pred_df = all_train_pred_df.where('PRED = False').drop('PRED')

    # Get counts by label
    counts = all_train_pred_df.groupBy(label).count().orderBy(label).cache()
    # counts.show()
    counts_dict = dict(counts.rdd.map(lambda x: (x[label], x['count'])).collect())
    
    # Calculate the rate to apply to classes
    fractions_dict = {}
    for l in counts_dict.keys():
        fractions_dict[l] = float(min(counts_dict.values()))/counts_dict[l]
    print 'fractions:', fractions_dict
    
    #all_train_pred_df.select(['TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4']).show(1)
    #all_train_pred_df = all_train_pred_df.fillna('', subset=['TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4'])
    #all_train_pred_df = all_train_pred_df.fillna('')
    #all_train_pred_df = all_train_pred_df.fillna(0)
    #print all_train_pred_df.count()

    # Columns with missing values:
    #   AC_x_fecha_nacimiento, AC_min_llam_ultmes, AC_num_sms_ultmes, AC_ult3meses_total, AC_media_ult3meses, AC_diasdesdeultrecarga, AC_numrecargasult3meses_total, AC_codigo_postal, AC_cod_golden, AC_fecha_beneficio_activo, 
    
    #print all_train_pred_df.select("AC_fecha_ejecucion").filter(col("AC_fecha_ejecucion").isNull()).count()
    #all_train_pred_df.groupby("AC_fecha_ejecucion").count().show()
    #all_train_pred_df = all_train_pred_df.na.drop(how='any')
    #print all_train_pred_df.count()
    #all_train_pred_df.withColumn("AC_fecha_ejecucion", when(col("AC_fecha_ejecucion").isNull(), "__SOME_PLACEHOLDER__").otherwise(col("AC_fecha_ejecucion")))
    #print all_train_pred_df.select("AC_fecha_ejecucion").filter(col("AC_fecha_ejecucion").isNull()).count()
    #all_train_pred_df.groupby("AC_fecha_ejecucion").count().show()
    balanced = all_train_pred_df.sampleBy(label, fractions=fractions_dict, seed=1234)
    #print balanced.count()
    #balanced = balanced.na.drop(how='any')
    #print balanced.count()
    #balanced.groupBy(label).count().orderBy(label).show()
    #balanced.select(['TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4']).show(1)
    #balanced.withColumn("AC_fecha_ejecucion", when(col("AC_fecha_ejecucion").isNull(), "__SOME_PLACEHOLDER__").otherwise(col("AC_fecha_ejecucion")))
    #for c in balanced.columns:
    #    print c
    #    print balanced.select(c).filter(col(c).isNull()).count()
        #balanced.select(c).take(10)
    #print balanced.select("AC_fecha_ejecucion").filter(col("AC_fecha_ejecucion").isNull()).count()
    #balanced.show(1)

    return balanced

def prepare_training_data(df, balance=True, filter_cond=None, repartition_by=None):
    print 'Class count before cleaning:'
    df = df.dropna(subset=label_model)
    df_count = df.count()
    df_groupby = df.groupby(label_model).count()
    df_proportions = df_groupby.withColumn('proportion', 100*df_groupby['count']/df_count).cache()
    df_proportions_dict = dict(df_proportions.select(label_model, 'proportion').collect())
    df_proportions.withColumn('proportion', bround(df_proportions['proportion'], 2)).show()
    
    #df.groupby(label_model).count().show()
    #df = df.na.drop(how='any')
    df = df.drop('msisdn', 'nif')
    
    # If required, filter out rows
    if filter_cond is not None:
        print 'Filtering by', filter_cond
        df = df.filter(filter_cond)

    # Repartition/Coalesce
    # This is due to bug SPARK-18528 which is fixed in versions 2.1.1, and 2.2.0 ( https://issues.apache.org/jira/browse/SPARK-18528 )
    if repartition_by is not None:
        df = df.repartition(repartition_by)
    else:
        df = df.coalesce(1)
    
    #print 'Class count before balancing:'
    #df_before_count = df.count()
    #df_before_groupby = df.groupby(label_model).count()
    #df_before_proportions = df_before_groupby.withColumn('proportion', bround(100*df_before_groupby['count']/df_before_count, 2)).cache()
    #df_before_proportions.show()
    ##df_before_proportions_dict = dict(df_before_proportions.select(label_model, 'proportion').collect())
    
    # If required, balance classes
    balanced = df
    if balance:
        balanced = balance_train_data(df, label_model)
        print 'Class count after balancing:'
        balanced_count = balanced.count()
        balanced_groupby = balanced.groupby(label_model).count()
        balanced_groupby.withColumn('proportion', bround(100*balanced_groupby['count']/balanced_count, 2)).show()
    
    balanced.printSchema()
    #balanced.show(1)
    
    return balanced, df_proportions_dict

def generate_pandas_data(df):
    print time.ctime()
    start = time.time()

    print 'Converting to Pandas'
    df_pd = df.toPandas()

    #cols_to_reencode = []
    print 'Re-encoding String variables to UTF-8'
    #for col in cols_to_reencode:
    #    if col in df_pd.columns:
    #        df_pd[col] = df_pd[col].str.encode('utf-8')
    for col in df_pd.columns.values.tolist():
        if df_pd[col].dtype.kind == 'O':
            #print 'Re-encoding to UTF-8', col
            df_pd[col] = df_pd[col].str.encode('utf-8')

    #df_pd.head()
    
    end = time.time()
    print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

    return df_pd

# Load TNPS data by MSISDN

In [33]:
#del tnps_msisdn
if not 'tnps_msisdn' in globals():
    print 'Reading TNPS by MSISDN from HDFS for month =', month
    tnps_msisdn = spark.read.parquet('/tmp/bbergua/tnps/msisdn/')
    if month is not None:
        tnps_msisdn = tnps_msisdn.filter('partitioned_month == "%s"' % month)
    tnps_msisdn = tnps_msisdn.withColumn('TNPS2DET', 
                                         when(tnps_msisdn['TNPS'] == 'DETRACTOR', tnps_msisdn['TNPS'])
                                         .otherwise('NON DETRACTOR'))
    tnps_msisdn = tnps_msisdn.withColumn('TNPS2PRO', 
                                         when(tnps_msisdn['TNPS'] == 'PROMOTER', tnps_msisdn['TNPS'])
                                         .otherwise('NON PROMOTER'))
    tnps_msisdn = tnps_msisdn.select(['msisdn', 'nif', 'partitioned_month', 'SEGMENTACION',
                                      'TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4'])
    tnps_msisdn.printSchema()

# Load TNPS data by NIF

In [9]:
#del tnps_id
if not 'tnps_id' in globals():
    print 'Reading TNPS by Id (NIF) from HDFS for month =', month
    tnps_id = spark.read.parquet('/tmp/bbergua/tnps/id/')
    if month is not None:
        tnps_id = tnps_id.filter('partitioned_month == "%s"' % month)
    tnps_id = tnps_id.withColumn('TNPS2DET', 
                                 when(tnps_id['TNPS'] == 'DETRACTOR', tnps_id['TNPS'])
                                 .otherwise('NON DETRACTOR'))
    tnps_id = tnps_id.withColumn('TNPS2PRO', 
                                 when(tnps_id['TNPS'] == 'PROMOTER', tnps_id['TNPS'])
                                 .otherwise('NON PROMOTER'))
    tnps_id = tnps_id.select(['nif', 'partitioned_month', 'SEGMENTACION_Prepaid', 'SEGMENTACION',
                              'TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4'])
    tnps_id.printSchema()

Reading TNPS by Id (NIF) from HDFS for month = 201712
root
 |-- nif: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- SEGMENTACION_Prepaid: string (nullable = true)
 |-- SEGMENTACION: string (nullable = true)
 |-- TNPS01: integer (nullable = true)
 |-- TNPS: string (nullable = true)
 |-- TNPS2DET: string (nullable = true)
 |-- TNPS2PRO: string (nullable = true)
 |-- TNPS4: string (nullable = true)



# Load Call Centre Calls data by MSISDN

In [35]:
#del ccc_msisdn
if not 'ccc_msisdn' in globals():
    print 'Reading CCC by MSISDN from HDFS for month =', month
    ccc_msisdn = spark.read.parquet('/tmp/bbergua/ccc/msisdn/')
    if month is not None:
        ccc_msisdn = ccc_msisdn.filter('partitioned_month == "%s"' % month)
    ccc_msisdn = add_prefix_columns(ccc_msisdn, 'CCC')
    ccc_msisdn.groupby('partitioned_month').count().show()
    ccc_msisdn.printSchema()

# Load Call Centre Calls data by NIF

In [10]:
#del ccc_id
if not 'ccc_id' in globals():
    print 'Reading CCC by Id (NIF) from HDFS for month =', month
    ccc_id = spark.read.parquet('/tmp/bbergua/ccc/id/')
    if month is not None:
        ccc_id = ccc_id.filter('partitioned_month == "%s"' % month)
    ccc_id = add_prefix_columns(ccc_id, 'CCC')
    ccc_id.groupby('partitioned_month').count().show()
    ccc_id.printSchema()

Reading CCC by Id (NIF) from HDFS for month = 201712
Appending CCC prefix to columns ...
+-----------------+-------+
|partitioned_month|  count|
+-----------------+-------+
|           201712|1115576|
+-----------------+-------+

root
 |-- nif: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- CCC_Pagar_menos: long (nullable = true)
 |-- CCC_Incidencia_Provision_Neba: long (nullable = true)
 |-- CCC_Incidencia_Provision_Fibra: long (nullable = true)
 |-- CCC_Incidencia_Provision_DSL: long (nullable = true)
 |-- CCC_Incidencia_Tecnica: long (nullable = true)
 |-- CCC_Incidencia_SGI: long (nullable = true)
 |-- CCC_Incidencia_Resto: long (nullable = true)
 |-- CCC_Incidencia_Provision_Movil: long (nullable = true)
 |-- CCC_Resultado_No_Aplica: long (nullable = true)
 |-- CCC_Resultado_Informacion: long (nullable = true)
 |-- CCC_Resultado_Solucionado: long (nullable = true)
 |-- CCC_Resultado_Retenido: long (nullable = true)
 |-- CCC_Resultado_No_Retenido: lo

# TNPS Oracle stack

## Oracle Prepaid segment

### Oracle Prepaid segment - Data Preparation

In [37]:
##print 'ACC CAR rows:', vf_pre.filter('partitioned_month == "%s"' % month).count()
#print 'Accenture CAR distinct MSISDN:', spark.table('udf_es.pre_explicativas_4m_' + month).select('msisdn').distinct().count()
#print 'Accenture CAR distinct NIF:   ', spark.table('udf_es.pre_explicativas_4m_' + month).select('nif').distinct().count()
##print 'AC_FINAL_PRE rows:', spark.table('raw_es.vf_pre_ac_final').filter('partitioned_month == "%s"' % month).count()
#print 'AC_FINAL_PREP distinct MSISDN:', spark.table('raw_es.vf_pre_ac_final').filter('partitioned_month == "%s"' % month).select('msisdn').distinct().count()
#print 'AC_FINAL_PREP distinct NIF:   ', spark.table('raw_es.vf_pre_ac_final').filter('partitioned_month == "%s"' % month).select('num_documento_comprador').distinct().count()

In [38]:
def prepare_dataset_prepaid():
    # Load Oracle AC Final Prepaid
    #del vf_pre
    #if not 'vf_pre' in globals():
    #print 'Load Oracle Prepaid CAR from Hive for month =', month
    #vf_pre = spark.table('udf_es.pre_explicativas_4m_' + month)
    print 'Load Oracle AC Final Prepaid from Hive for month =', month
    vf_pre = spark.table('raw_es.vf_pre_ac_final')
    if month is not None:
        vf_pre = vf_pre.filter('partitioned_month == "%s"' % month)
    vf_pre.groupby('partitioned_month').count().show()
    vf_pre = vf_pre.drop('num_documento_cliente', 'sfid_canje')
    vf_pre = vf_pre.withColumnRenamed('num_documento_comprador', 'nif')
    vf_pre = vf_pre.withColumn('tipo_documento_cliente', when(vf_pre['tipo_documento_cliente'] == 'Tarj.Residente',  'Tarj. Residente')
                                                        .when(vf_pre['tipo_documento_cliente'] == 'NIE',             'Tarj. Residente')
                                                        .when(vf_pre['tipo_documento_cliente'] == 'T_RES',           'Tarj. Residente')
                                                        .when(vf_pre['tipo_documento_cliente'] == 'TARJ. RESIDENTE', 'Tarj. Residente')
                                                        .when(vf_pre['tipo_documento_cliente'] == 'PASAPORTE',       'Pasaporte')
                                                        .when(vf_pre['tipo_documento_cliente'] == 'GMT+1',           '')
                                                        .otherwise(vf_pre['tipo_documento_cliente']))
    vf_pre = add_prefix_columns(vf_pre, 'AC')
    #vf_pre = vf_pre.withColumnRenamed('nif', 'AC_nif')
    #vf_pre.printSchema()

    # Join Prepaid data with Call Centre Calls
    print 'Join Prepaid data with Call Centre Calls'
    vf_pre_ccc = vf_pre.join(ccc_msisdn, ['msisdn', 'partitioned_month'], 'left_outer')
    ccc_columns = [x for x in ccc_msisdn.columns if x not in ['msisdn', 'partitioned_month']]
    vf_pre_ccc = vf_pre_ccc.fillna(0, ccc_columns)
    #vf_pre_ccc.groupby('partitioned_month').count().show()
    #vf_pre_ccc.printSchema()

    # Join Prepaid data with TNPS (to get the label)
    #tnps_msisdn.show()
    print 'Join TNPS with Prepaid data'
    tnps_msisdn_prepaid = tnps_msisdn.filter('SEGMENTACION == "Prepaid"').drop('SEGMENTACION').select(['msisdn', 'partitioned_month', 'TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4'])
    #tnps_msisdn_prepaid.groupby('partitioned_month').count().show()
    tnps_msisdn_prepaid.groupby('TNPS4').count().show()
    vf_pre_ccc_tnps = vf_pre_ccc.join(tnps_msisdn_prepaid, ['msisdn', 'partitioned_month'], 'left_outer')
    #vf_pre_ccc_tnps.groupby('partitioned_month').count().show()
    
    # Now, generate dataset for model training
    vf_pre = vf_pre_ccc_tnps.drop('year', 'month', 'day')
    vf_pre = fix_column_names(vf_pre)

    # Calculate proportions of TNPS4 levels
    vf_pre_count = vf_pre.na.drop(subset='TNPS4').count()
    vf_pre_groupby = vf_pre.na.drop(subset='TNPS4').groupby('TNPS4').count()
    vf_pre_proportions = vf_pre_groupby.withColumn('proportion', 100*vf_pre_groupby['count']/vf_pre_count).cache()
    vf_pre_proportions_dict = dict(vf_pre_proportions.select('TNPS4', 'proportion').collect())
    vf_pre_proportions.withColumn('proportion', bround(vf_pre_proportions['proportion'], 2)).show()
    
    vf_pre.printSchema()
    
    return vf_pre, vf_pre_proportions_dict

In [39]:
# Build the Prepaid dataset (AC Final Prepaid + CCC + TNPS labels) and the
# percentage of each TNPS4 level among the labelled rows.
vf_pre, vf_pre_proportions = prepare_dataset_prepaid()

Load Oracle AC Final Prepaid from Hive for month = 201712
+-----------------+-------+
|partitioned_month|  count|
+-----------------+-------+
|           201712|2730651|
+-----------------+-------+

Appending AC prefix to columns ...
Join Prepaid data with Call Centre Calls
Join TNPS with Prepaid data
+--------------+-----+
|         TNPS4|count|
+--------------+-----+
|HARD DETRACTOR| 6972|
|       NEUTRAL| 7111|
|      PROMOTER|30105|
|SOFT DETRACTOR| 4827|
+--------------+-----+

+--------------+-----+----------+
|         TNPS4|count|proportion|
+--------------+-----+----------+
|HARD DETRACTOR| 6972|     14.22|
|       NEUTRAL| 7111|     14.51|
|      PROMOTER|30105|     61.42|
|SOFT DETRACTOR| 4827|      9.85|
+--------------+-----+----------+

root
 |-- msisdn: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- AC_fecha_ejecucion: string (nullable = true)
 |-- nif: string (nullable = true)
 |-- AC_fx_1llamada: string (nullable = true)
 |-- AC_fx_activ

In [40]:
#spark.table('raw_es.vf_pre_ac_final').filter('partitioned_month == "%s"' % month).groupby('tipo_documento_cliente').count().sort('count', ascending=False).show()
#vf_pre.groupby('AC_tipo_documento_cliente').count().sort('count', ascending=False).show()

In [41]:
vf_pre_proportions

{u'HARD DETRACTOR': 14.224217076405182,
 u'NEUTRAL': 14.507803733550954,
 u'PROMOTER': 61.419973477506886,
 u'SOFT DETRACTOR': 9.84800571253698}

### Oracle Prepaid segment - Training

In [42]:
# Default arguments are used: classes are balanced, no row filter is applied and the
# data is coalesced into a single partition.
vf_pre_training, vf_pre_training_proportions = prepare_training_data(vf_pre)

Class count before cleaning:
+-------------+-----+----------+
|     TNPS2DET|count|proportion|
+-------------+-----+----------+
|NON DETRACTOR|37216|     75.93|
|    DETRACTOR|11799|     24.07|
+-------------+-----+----------+

fractions: {u'NON DETRACTOR': 0.31704105760963025, u'DETRACTOR': 1.0}
Class count after balancing:
+-------------+-----+----------+
|     TNPS2DET|count|proportion|
+-------------+-----+----------+
|    DETRACTOR|11799|     50.17|
|NON DETRACTOR|11720|     49.83|
+-------------+-----+----------+

root
 |-- partitioned_month: string (nullable = true)
 |-- AC_fecha_ejecucion: string (nullable = true)
 |-- AC_fx_1llamada: string (nullable = true)
 |-- AC_fx_activacion: string (nullable = true)
 |-- AC_codigo_plan_precios: string (nullable = true)
 |-- AC_fx_ultima_recarga: string (nullable = true)
 |-- AC_estado_servicio: string (nullable = true)
 |-- AC_tipo_documento_cliente: string (nullable = true)
 |-- AC_tipo_documento_comprador: string (nullable = true)
 |--

We need to properly encode columns that contain special characters. For example, columns *AC_nacionalidad* (AC_FINAL_PREPAID) or *CAR_seg_nacionalidad* (Accenture CAR) contain the 'ñ' character. Other columns also contain special characters, for example *AC_num_documento_cliente* and *AC_nif* (AC_FINAL_PREPAID).

In [44]:
print time.ctime()
start = time.time()
print vf_pre_training.count()
vf_pre_training.write.mode('overwrite').format('parquet').save('/tmp/bbergua/tmp/vf_pre_training-'+month)
end = time.time()
print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

Mon Apr  9 09:45:54 2018
23519
Process took: 2.62 minutes


In [45]:
# Ask HDFS for its NameNode hosts and turn each of them into an 'hdfs://<host>' URI.
# Example shell session:
# $ hdfs getconf -confKey fs.defaultFS
# hdfs://nameservice1
# $ hdfs getconf -namenodes
# vgddp350hr.dc.sedc.internal.vodafone.com vgddp351hr.dc.sedc.internal.vodafone.com
# The command is run without a shell (argument list), and split() with no argument is
# used so that repeated blanks or a trailing newline never produce an empty host.
out = subprocess.check_output(['hdfs', 'getconf', '-namenodes'])
nodes = ['hdfs://'+n for n in out.split()]

In [46]:
# One candidate URI per NameNode for the parquet directory holding the training set of
# this month; the entries are index-aligned with `nodes`.
path_vf_pre_training = [n+'/tmp/bbergua/tmp/vf_pre_training-'+month for n in nodes]
# Display the first candidate as a sanity check
path_vf_pre_training[0]

'hdfs://vgddp350hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pre_training-201712'

In [47]:
i=0
from h2o.backend.connection import H2OServerError
while i < len(nodes):
    try:
        vf_pre_training_df_h2o = h2o.import_file(path=path_vf_pre_training[i])
        vf_pre_training_df_h2o.nrows
        break
    except H2OServerError as e:
        i=i+1
        #print e
#subprocess.call('hdfs dfs -rm -r /tmp/bbergua/tmp/vf_pre_training-'+month, shell=True)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
# Disabled alternative to the HDFS import above: collect the training set into pandas
# and upload it to H2O from the driver. The `if False:` guard means this block never
# runs; it is kept for reference only.
if False:
    vf_pre_training_pd = generate_pandas_data(vf_pre_training)
    # Copy to H2O

    # Remove every object currently stored in the H2O cluster before uploading
    h2o.remove_all()
    #print balanced_pd.columns.values.tolist()
    #os.environ["PYTHONIOENCODING"] = "UTF-8"
    print 'Copying data to H2O'
    vf_pre_training_df_h2o = h2o.H2OFrame(vf_pre_training_pd, destination_frame='data_df_h2o.hex', header=1)
    h2o.ls()
    vf_pre_training_df_h2o.head()

In [90]:
# Convert the following columns of the H2O frame to numeric, one after another.
# NOTE(review): the first five look like yyyymmdd dates and the rest like counters or
# amounts - confirm against the source table.
for numeric_column in ['AC_fx_1llamada',
                       'AC_fx_activacion',
                       'AC_fx_ultima_recarga',
                       'AC_x_fecha_nacimiento',
                       'AC_fecha_beneficio_activo',
                       'AC_num_prepago',
                       'AC_num_pospago',
                       'AC_num_total',
                       'AC_min_llam_ultmes',
                       'AC_num_sms_ultmes',
                       'AC_ult3meses_total',
                       'AC_media_ult3meses',
                       'AC_diasdesdeultrecarga',
                       'AC_numrecargasult3meses_total']:
    vf_pre_training_df_h2o[numeric_column] = vf_pre_training_df_h2o[numeric_column].asnumeric()

In [91]:
vf_pre_training_df_h2o.describe()

Rows:23519
Cols:101




Unnamed: 0,partitioned_month,AC_fecha_ejecucion,AC_fx_1llamada,AC_fx_activacion,AC_codigo_plan_precios,AC_fx_ultima_recarga,AC_estado_servicio,AC_tipo_documento_cliente,AC_tipo_documento_comprador,AC_nacionalidad,AC_x_fecha_nacimiento,AC_tac_fac,AC_num_prepago,AC_num_pospago,AC_num_total,AC_min_llam_ultmes,AC_num_sms_ultmes,AC_ult3meses_total,AC_media_ult3meses,AC_diasdesdeultrecarga,AC_numrecargasult3meses_total,AC_codigo_postal,AC_cod_golden,AC_cobertura_4g,AC_lortad,AC_deuda,AC_flag_huella_ono,AC_flag_4g_aperturas,AC_flag_4g_nodos,AC_flag_huella_vf,AC_flag_huella_neba,AC_flag_huella_euskaltel,AC_flag_cobertura_adsl,AC_flag_beneficio_activo,AC_fecha_beneficio_activo,CCC_Pagar_menos,CCC_Incidencia_Provision_Neba,CCC_Incidencia_Provision_Fibra,CCC_Incidencia_Provision_DSL,CCC_Incidencia_Tecnica,CCC_Incidencia_SGI,CCC_Incidencia_Resto,CCC_Incidencia_Provision_Movil,CCC_Resultado_No_Aplica,CCC_Resultado_Informacion,CCC_Resultado_Solucionado,CCC_Resultado_Retenido,CCC_Resultado_No_Retenido,CCC_Resultado_Escalo,CCC_Resultado_Envio_tecnico,CCC_Resultado_Transferencia,CCC_Resultado_Abono,CCC_Resultado_Bajas,CCC_Resultado_Reclamacion,CCC_Desactivacion_BA_Movil_TV,CCC_Desactivacion_TV,CCC_Desactivacion_Movil,CCC_Desactivacion_Total,CCC_Desactivacion_NET,CCC_Desactivacion_Fijo,CCC_Desactivacion_USB,CCC_Desactivacion_Resto,CCC_Ofrecimiento,CCC_Transferencia,CCC_Cobro,CCC_Precios,CCC_Portabilidad_Inversa,CCC_Portabilidad,CCC_Informacion,CCC_Cierre,CCC_Alta,CCC_Factura,CCC_Averia_DSL,CCC_Averia_Fibra,CCC_Averia_TV,CCC_Averia_Resto,CCC_Averia_Neba,CCC_Averia_Modem_Router,CCC_Baja,CCC_Consulta_Tecnica_TV,CCC_Consulta_Tecnica_Fibra,CCC_Consulta_Tecnica_Neba,CCC_Consulta_Tecnica_DSL,CCC_Consulta_Tecnica_Movil,CCC_Consulta_Tecnica_Modem_Router,CCC_Consulta_Tecnica_Resto,CCC_Consulta_Ficha,CCC_Consulta_Resto,CCC_Productos_Voz,CCC_Productos_Datos,CCC_Productos_Resto,CCC_Provision_Neba,CCC_Provision_Fibra,CCC_Provision_DSL,CCC_Provision_Resto,CCC_Provision_Movil,TNPS01,TNPS,TNPS2DET,TN
PS2PRO,TNPS4
type,enum,enum,int,int,enum,int,enum,enum,enum,enum,int,enum,int,int,int,int,int,int,int,int,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,enum,enum,enum
mins,,,17530101.0,19700101.0,,17530101.0,,,,,0.0,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
mean,,,20126383.582,20139657.2485,,19985446.9337,,,,,26.8795441983,,1.48735065266,0.196054254007,1.68340490667,2849.36153748,17.6866363366,397.847102343,313.502487351,87.5212381479,19.3187635529,,,,,,,,,,,,,,8.48479952379,0.0,0.000170075258302,8.50376291509e-05,0.000170075258302,0.000170075258302,0.0,8.50376291509e-05,0.0,0.00880139461712,0.226922913389,0.137038139377,0.00667545388835,0.00403928738467,0.00348654279519,0.000552744589481,0.00943917683575,0.000212594072877,0.0,0.00229601598707,0.000680301033207,0.000212594072877,0.00939665802117,0.0,0.00212594072877,0.000637782218632,0.000212594072877,0.000340150516604,0.00136060206641,0.137803478039,0.00608019048429,0.0,4.25188145754e-05,0.0342276457332,0.123559675156,0.0167098941282,0.0307411029381,0.0114800799354,0.00255112887453,0.00204090309962,0.000552744589481,0.00233853480165,0.000765338662358,8.50376291509e-05,0.00361409923891,4.25188145754e-05,0.00046770696033,0.000127556443726,0.000722819847783,0.0273821165866,0.000722819847783,0.000297631702028,0.0,0.0,4.25188145754e-05,0.000255112887453,0.240443896424,0.000170075258302,0.000297631702028,0.000722819847783,0.000340150516604,4.25188145754e-05,5.92559207449,,,,
maxs,,,20171231.0,20171231.0,,20180101.0,,,,,90.0,,61.0,12.0,61.0,8405.0,280.0,839.0,839.0,181.0,53.0,,,,,,,,,,,,,,29.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,7.0,70.0,29.0,5.0,2.0,2.0,1.0,4.0,1.0,0.0,2.0,2.0,1.0,5.0,0.0,2.0,1.0,1.0,1.0,7.0,37.0,5.0,0.0,1.0,10.0,27.0,9.0,5.0,7.0,6.0,3.0,2.0,2.0,3.0,1.0,5.0,1.0,1.0,1.0,2.0,6.0,2.0,1.0,0.0,0.0,1.0,1.0,55.0,1.0,1.0,4.0,7.0,1.0,10.0,,,,
sigma,,,189969.677317,42600.4172406,,675328.283518,,,,,27.9944700636,,1.08566619884,0.71654466334,1.29046680409,2756.131618,50.9817614404,236.896321929,295.027202114,48.5351643623,15.7957254394,,,,,,,,,,,,,,9.53461767749,0.0,0.013040458703,0.00922138890287,0.013040458703,0.013040458703,0.0,0.00922138890287,0.0,0.126640625737,0.891272430434,0.645304828085,0.102679459599,0.0685818675845,0.0631251298425,0.0235045219682,0.106730811988,0.0145793660475,0.0,0.0487430297226,0.0276569769834,0.0145793660475,0.117902433676,0.0,0.0487507440348,0.0252468325553,0.0145793660475,0.0184404249604,0.0583077909327,0.781906402694,0.0945244997852,0.0,0.0065206452576,0.2317812376,0.507069487605,0.166558871857,0.206285096958,0.139992039065,0.0725675454661,0.0545186033622,0.026880197971,0.0508752296807,0.0368791831781,0.00922138890287,0.076238199179,0.0065206452576,0.0216219353532,0.0112936086424,0.0284142453144,0.217482163032,0.0284142453144,0.0172498049008,0.0,0.0,0.0065206452576,0.0159705557091,0.820305535829,0.013040458703,0.0172498049008,0.0374520767357,0.0461076503175,0.0065206452576,3.96568636489,,,,
zeros,,,0,0,,0,,,,,11072,,0,21193,0,2815,2815,1995,1995,1995,1995,,,,,,,,,,,,,,9975,23519,23515,23517,23515,23515,23519,23517,23519,23368,20050,21607,23397,23432,23443,23506,23313,23514,23519,23466,23504,23514,23340,23519,23472,23504,23514,23511,23495,21781,23399,23519,23518,22865,21361,23202,22908,23309,23476,23481,23508,23467,23507,23517,23449,23518,23508,23516,23503,23034,23503,23512,23519,23519,23518,23513,19699,23515,23512,23507,23517,23518,4890,,,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,201712,20171231,20171225.0,20171222.0,PPIB7,20171222.0,AC,NIF,NIF,España,68.0,35929005,1.0,0.0,1.0,0.0,0.0,499.0,2.0,12.0,2.0,68533,,1,0,0,0,0,0,0,0,0,,1,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,DETRACTOR,DETRACTOR,NON PROMOTER,HARD DETRACTOR
1,201712,20171231,20060808.0,20060808.0,PPFCL,20170805.0,AC,Pasaporte,TARJ. RESIDENTE,Rumania,82.0,35155405,2.0,0.0,2.0,1.0,1.0,1.0,1.0,57.0,1.0,41226,015050227,0,0,1,0,0,0,0,0,0,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,PROMOTER,NON DETRACTOR,PROMOTER,PROMOTER
2,201712,20171231,20171228.0,20171215.0,PPFCL,17530101.0,AC,NIF,NIF,España,24.0,35989504,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,02337,,0,0,0,0,0,0,0,0,0,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,NEUTRAL,NON DETRACTOR,NON PROMOTER,NEUTRAL


In [92]:
#data_df_h2o.types

In [93]:
# Split the data into train (70%), test (15%) and validation (the remaining ~15%).
# A fixed seed makes the random split reproducible after a kernel restart.
train, test, valid = vf_pre_training_df_h2o.split_frame(ratios=[.7, .15], seed=1234)

In [94]:
# Response column and predictor list.
# Predictors are all columns of the training frame except the ones listed
# below (msisdn/nif, the TNPS* columns, AC_cod_golden and AC_tac_fac).
y = label_model
_non_predictor_columns = (
    'msisdn', 'nif',
    'TNPS01', 'TNPS2DET', 'TNPS2PRO', 'TNPS', 'TNPS4',
    'AC_cod_golden', 'AC_tac_fac',
)
x = [_column for _column in train.columns
     if _column not in _non_predictor_columns]

In [95]:
# Inspect the H2O column type of the response column
train.types[y]

u'enum'

In [96]:
# For binary classification the response should be a factor, so convert
# the response column in each of the three splits.
for _split in (train, test, valid):
    _split[y] = _split[y].asfactor()

In [97]:
# Re-check the response column type after the asfactor() conversion
train.types[y]

u'enum'

In [98]:
from h2o.automl import H2OAutoML

# Time-boxed AutoML run: 60-second budget and a fixed seed.
# `test` is passed as the leaderboard frame, i.e. it is used to rank the models.
aml = H2OAutoML(max_runtime_secs=60, seed=1234)
aml.train(x=x, y=y, training_frame=train, leaderboard_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [99]:
# View the AutoML Leaderboard (one row per trained model, with its auc and logloss)
lb = aml.leaderboard
lb

model_id,auc,logloss
StackedEnsemble_AllModels_0_AutoML_20180409_103218,0.548555,0.689099
StackedEnsemble_BestOfFamily_0_AutoML_20180409_103218,0.548505,0.689104
GLM_grid_0_AutoML_20180409_103218_model_0,0.54594,0.690217
GBM_grid_0_AutoML_20180409_103218_model_0,0.539046,0.692264
DRF_0_AutoML_20180409_103218,0.531176,0.706489
XRT_0_AutoML_20180409_103218,0.529478,0.70633




In [100]:
# The leader model is stored here; displaying it prints its metrics on the
# train, validation and cross-validation data
aml.leader

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_0_AutoML_20180409_103218
No model summary for this model


ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.215825318863
RMSE: 0.464570036553
LogLoss: 0.624151737299
Null degrees of freedom: 13104
Residual degrees of freedom: 13100
Null deviance: 18167.1396811
Residual deviance: 16359.0170346
AIC: 16369.0170346
AUC: 0.916394002913
Gini: 0.832788005826
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.500244202923: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,5260.0,1321.0,0.2007,(1321.0/6581.0)
NON DETRACTOR,930.0,5594.0,0.1426,(930.0/6524.0)
Total,6190.0,6915.0,0.1718,(2251.0/13105.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5002442,0.8325024,191.0
max f2,0.4733915,0.8911676,238.0
max f0point5,0.5193943,0.8554511,157.0
max accuracy,0.5060606,0.8305990,181.0
max precision,0.7484017,1.0,0.0
max recall,0.2697283,1.0,384.0
max specificity,0.7484017,1.0,0.0
max absolute_mcc,0.5117139,0.6614098,172.0
max min_per_class_accuracy,0.5044461,0.8284795,184.0


Gains/Lift Table: Avg response rate: 49.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100725,0.6150401,1.9935193,1.9935193,0.9924242,0.9924242,0.0200797,0.0200797,99.3519267,99.3519267
,2,0.0200687,0.6033881,1.9934031,1.9934614,0.9923664,0.9923954,0.0199264,0.0400061,99.3403101,99.3461405
,3,0.0300649,0.5946779,2.0087370,1.9985403,1.0,0.9949239,0.0200797,0.0600858,100.8736971,99.8540337
,4,0.0400610,0.5882340,2.0087370,2.0010846,1.0,0.9961905,0.0200797,0.0801655,100.8736971,100.1084640
,5,0.0500572,0.5837185,2.0087370,2.0026128,1.0,0.9969512,0.0200797,0.1002452,100.8736971,100.2612773
,6,0.1000382,0.5670059,1.9934031,1.9980114,0.9923664,0.9946606,0.0996321,0.1998774,99.3403101,99.8011450
,7,0.1500191,0.5557066,1.9627354,1.9862587,0.9770992,0.9888098,0.0980993,0.2979767,96.2735361,98.6258734
,8,0.2,0.5460068,1.9228673,1.9704169,0.9572519,0.9809233,0.0961067,0.3940834,92.2867299,97.0416922
,9,0.3000382,0.5302448,1.7972910,1.9126936,0.8947368,0.9521872,0.1797977,0.5738811,79.7290974,91.2693596




ModelMetricsBinomialGLM: stackedensemble
** Reported on validation data. **

MSE: 0.247906306579
RMSE: 0.497901904575
LogLoss: 0.6889597582
Null degrees of freedom: 3324
Residual degrees of freedom: 3320
Null deviance: 4609.84831375
Residual deviance: 4581.58239203
AIC: 4591.58239203
AUC: 0.552125829478
Gini: 0.104251658955
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.376539531802: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,21.0,1621.0,0.9872,(1621.0/1642.0)
NON DETRACTOR,5.0,1678.0,0.003,(5.0/1683.0)
Total,26.0,3299.0,0.489,(1626.0/3325.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3765395,0.6736251,377.0
max f2,0.3282766,0.8369825,390.0
max f0point5,0.4207338,0.5643660,345.0
max accuracy,0.5067993,0.5428571,176.0
max precision,0.7176405,1.0,0.0
max recall,0.1659443,1.0,399.0
max specificity,0.7176405,1.0,0.0
max absolute_mcc,0.5067993,0.0866781,176.0
max min_per_class_accuracy,0.5036770,0.5401070,184.0


Gains/Lift Table: Avg response rate: 50.62 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0102256,0.5923444,1.0459264,1.0459264,0.5294118,0.5294118,0.0106952,0.0106952,4.5926392,4.5926392
,2,0.0201504,0.5821958,1.1973568,1.1205115,0.6060606,0.5671642,0.0118835,0.0225787,19.7356812,12.0511524
,3,0.0300752,0.5737932,1.3769603,1.2051396,0.6969697,0.61,0.0136661,0.0362448,37.6960334,20.5139632
,4,0.04,0.5669623,1.3170925,1.2329174,0.6666667,0.6240602,0.0130719,0.0493167,31.7092494,23.2917409
,5,0.0502256,0.5629761,1.1040334,1.2066776,0.5588235,0.6107784,0.0112894,0.0606061,10.4033414,20.6677554
,6,0.1001504,0.5498778,1.0830309,1.1450399,0.5481928,0.5795796,0.0540701,0.1146762,8.3030876,14.5039871
,7,0.1500752,0.5415496,1.1306366,1.1402484,0.5722892,0.5771543,0.0564468,0.1711230,13.0636629,14.0248411
,8,0.2,0.5349718,1.1306366,1.1378491,0.5722892,0.5759398,0.0564468,0.2275698,13.0636629,13.7849079
,9,0.3001504,0.5232571,1.0619800,1.1125340,0.5375375,0.5631263,0.1063577,0.3339275,6.1979984,11.2534040




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.248308703442
RMSE: 0.498305833241
LogLoss: 0.689780172192
Null degrees of freedom: 13104
Residual degrees of freedom: 13100
Null deviance: 18169.3129705
Residual deviance: 18079.1383131
AIC: 18089.1383131
AUC: 0.54501397526
Gini: 0.0900279505192
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.381560150234: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,95.0,6486.0,0.9856,(6486.0/6581.0)
NON DETRACTOR,38.0,6486.0,0.0058,(38.0/6524.0)
Total,133.0,12972.0,0.4978,(6524.0/13105.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3815602,0.6653673,349.0
max f2,0.1529387,0.8321216,399.0
max f0point5,0.4375481,0.5576618,296.0
max accuracy,0.5034567,0.5337657,168.0
max precision,0.6610970,0.6666667,4.0
max recall,0.1529387,1.0,399.0
max specificity,0.8044995,0.9998480,0.0
max absolute_mcc,0.4885030,0.0679144,200.0
max min_per_class_accuracy,0.5010706,0.5323421,173.0


Gains/Lift Table: Avg response rate: 49.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100725,0.5893328,1.2630695,1.2630695,0.6287879,0.6287879,0.0127223,0.0127223,26.3069459,26.3069459
,2,0.0200687,0.5779948,1.1807080,1.2220453,0.5877863,0.6083650,0.0118026,0.0245248,18.0707991,22.2045306
,3,0.0300649,0.5700745,1.0733709,1.1726130,0.5343511,0.5837563,0.0107296,0.0352544,7.3370901,17.2612953
,4,0.0400610,0.5652431,1.1500403,1.1669805,0.5725191,0.5809524,0.0114960,0.0467505,15.0040251,16.6980526
,5,0.0500572,0.5614923,1.1040386,1.1544113,0.5496183,0.5746951,0.0110362,0.0577866,10.4038641,15.4411339
,6,0.1000382,0.5475180,1.1071054,1.1307764,0.5511450,0.5629291,0.0553342,0.1131208,10.7105415,13.0776419
,7,0.1500191,0.5378468,1.0611038,1.1075640,0.5282443,0.5513733,0.0530349,0.1661557,6.1103805,10.7564027
,8,0.2,0.5306248,1.0549703,1.0944206,0.5251908,0.5448302,0.0527284,0.2188841,5.4970257,9.4420601
,9,0.3000382,0.5198274,1.0602944,1.0830423,0.5278413,0.5391658,0.1060699,0.3249540,6.0294420,8.3042314







Model metrics on the training dataset, with the confusion matrix at the max-accuracy threshold (only if the leader is a binomial model):

In [101]:
from h2o.model.metrics_base import H2OBinomialModelMetrics#, H2OMultinomialModelMetrics
train_metrics = aml.leader.model_performance()
if isinstance(train_metrics, H2OBinomialModelMetrics):
    #print aml.leader.confusion_matrix(metrics="accuracy")
    print train_metrics.confusion_matrix(metrics="accuracy")
    print 'AUC:', train_metrics.auc()
    print 'Accuracy:', train_metrics.accuracy()
    #train_metrics.plot(type='roc', server=True)

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.50606064703: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,5561.0,1020.0,0.155,(1020.0/6581.0)
NON DETRACTOR,1200.0,5324.0,0.1839,(1200.0/6524.0)
Total,6761.0,6344.0,0.1694,(2220.0/13105.0)



AUC: 0.916394002913
Accuracy: [[0.5060606470296386, 0.8305990080122091]]


In [102]:
#train_metrics.plot(type='roc', server=True)

Model metrics on the test dataset, and the confusion matrix at the max-accuracy threshold (only if the leader is a binomial model):

In [103]:
#test_pred = aml.leader.predict(test)
# Evaluate the leader model on the test frame and print the full metrics report
test_metrics = aml.leader.model_performance(test_data=test)
print test_metrics


ModelMetricsBinomialGLM: stackedensemble
** Reported on test data. **

MSE: 0.247973432803
RMSE: 0.497969309098
LogLoss: 0.689098530378
Null degrees of freedom: 3586
Residual degrees of freedom: 3582
Null deviance: 4972.40126705
Residual deviance: 4943.59285693
AIC: 4953.59285693
AUC: 0.548555405903
Gini: 0.0971108118057
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.415292952739: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,80.0,1731.0,0.9558,(1731.0/1811.0)
NON DETRACTOR,30.0,1746.0,0.0169,(30.0/1776.0)
Total,110.0,3477.0,0.4909,(1761.0/3587.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4152930,0.6647630,349.0
max f2,0.2639327,0.8307606,397.0
max f0point5,0.4491535,0.5586344,305.0
max accuracy,0.4974340,0.5372177,198.0
max precision,0.5878169,0.6956522,21.0
max recall,0.2639327,1.0,397.0
max specificity,0.8105789,0.9994478,0.0
max absolute_mcc,0.4491535,0.0793813,305.0
max min_per_class_accuracy,0.5056392,0.5343468,178.0


Gains/Lift Table: Avg response rate: 49.51 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100362,0.5899129,1.3464715,1.3464715,0.6666667,0.6666667,0.0135135,0.0135135,34.6471471,34.6471471
,2,0.0200725,0.5805125,1.0659566,1.2062140,0.5277778,0.5972222,0.0106982,0.0242117,6.5956582,20.6214027
,3,0.0301087,0.5745599,1.0098536,1.1407606,0.5,0.5648148,0.0101351,0.0343468,0.9853604,14.0760552
,4,0.0401450,0.5683727,0.8976476,1.0799823,0.4444444,0.5347222,0.0090090,0.0433559,-10.2352352,7.9982326
,5,0.0501812,0.5650378,1.0659566,1.0771772,0.5277778,0.5333333,0.0106982,0.0540541,6.5956582,7.7177177
,6,0.1000836,0.5499833,1.1508946,1.1139332,0.5698324,0.5515320,0.0574324,0.1114865,15.0894610,11.3933223
,7,0.1499861,0.5421807,1.0154952,1.0811816,0.5027933,0.5353160,0.0506757,0.1621622,1.5495244,8.1181553
,8,0.2001673,0.5355407,1.0771772,1.0801777,0.5333333,0.5348189,0.0540541,0.2162162,7.7177177,8.0177671
,9,0.2999721,0.5243192,1.1283280,1.0961980,0.5586592,0.5427509,0.1126126,0.3288288,12.8328049,9.6197964






In [104]:
#print test_metrics.__class__
#if str(test_metrics.__class__) == "<class 'h2o.model.metrics_base.H2OBinomialModelMetrics'>":
from h2o.model.metrics_base import H2OBinomialModelMetrics
if isinstance(test_metrics, H2OBinomialModelMetrics):
    print test_metrics.confusion_matrix(metrics="accuracy")
    print 'AUC:', test_metrics.auc()
    print 'Accuracy:', test_metrics.accuracy()

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.497434049034: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,837.0,974.0,0.5378,(974.0/1811.0)
NON DETRACTOR,686.0,1090.0,0.3863,(686.0/1776.0)
Total,1523.0,2064.0,0.4628,(1660.0/3587.0)



AUC: 0.548555405903
Accuracy: [[0.4974340490344474, 0.5372177306941734]]


In [105]:
print 'Accuracy:', test_metrics.accuracy()
print 'Threshold max acc:', test_metrics.find_threshold_by_max_metric('accuracy')
print 'Idx max acc:', test_metrics.find_idx_by_threshold(test_metrics.find_threshold_by_max_metric('accuracy'))
#test_metrics.plot(type='roc', server=True)

Accuracy: [[0.4974340490344474, 0.5372177306941734]]
Threshold max acc: 0.497434049034
Idx max acc: 198


### Oracle Prepaid segment - Predictions

Make predictions

In [106]:
# Show the column names and Spark types of vf_pre
vf_pre.printSchema()

root
 |-- msisdn: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- AC_fecha_ejecucion: string (nullable = true)
 |-- nif: string (nullable = true)
 |-- AC_fx_1llamada: string (nullable = true)
 |-- AC_fx_activacion: string (nullable = true)
 |-- AC_codigo_plan_precios: string (nullable = true)
 |-- AC_fx_ultima_recarga: string (nullable = true)
 |-- AC_estado_servicio: string (nullable = true)
 |-- AC_tipo_documento_cliente: string (nullable = true)
 |-- AC_tipo_documento_comprador: string (nullable = true)
 |-- AC_nacionalidad: string (nullable = true)
 |-- AC_x_fecha_nacimiento: string (nullable = true)
 |-- AC_tac_fac: string (nullable = true)
 |-- AC_num_prepago: string (nullable = true)
 |-- AC_num_pospago: string (nullable = true)
 |-- AC_num_total: string (nullable = true)
 |-- AC_min_llam_ultmes: string (nullable = true)
 |-- AC_num_sms_ultmes: string (nullable = true)
 |-- AC_ult3meses_total: string (nullable = true)
 |-- AC_media_ult3meses: strin

In [107]:
# List the keys of the objects currently held by H2O
h2o.ls()

Unnamed: 0,key
0,AutoML_20180409_095222
1,AutoML_20180409_103218
2,AutoML_Feedback_AutoML_20180409_095222
3,AutoML_Feedback_AutoML_20180409_103218
4,AutoML_Leaderboard_automl_py_48_sid_88f9
5,AutoML_Leaderboard_automl_py_8_sid_88f9
6,DRF_0_AutoML_20180409_095222
7,DRF_0_AutoML_20180409_095222_cv_1
8,DRF_0_AutoML_20180409_095222_cv_2
9,DRF_0_AutoML_20180409_095222_cv_3


In [108]:
# Disabled alternative (never runs because of `if False`): build a pandas
# frame from vf_pre and upload it to H2O directly.
# NOTE(review): generate_pandas_data is defined elsewhere; presumably it
# converts the Spark DataFrame to pandas -- confirm before re-enabling.
if False:
    #h2o.remove_all()
    vf_pre_all_pd = generate_pandas_data(vf_pre)

    # Copy to H2O

    #print balanced_pd.columns.values.tolist()
    #os.environ["PYTHONIOENCODING"] = "UTF-8"
    print 'Copying data to H2O'
    all_data_df_h2o = h2o.H2OFrame(vf_pre_all_pd, destination_frame='all_data_df_h2o.hex', header=1)
    all_data_df_h2o.head()

    #preds = aml.leader.predict()

In [109]:
#vf_pre_all_pd = generate_pandas_prepaid(vf_pre.drop('msisdn', 'partitioned_month'))

In [110]:
#vf_pre_all_pd.dtypes
#print vf_pre_all_pd.columns.values.tolist()

In [111]:
print vf_pre.rdd.getNumPartitions()
vf_pre.coalesce(1).write.mode('overwrite').format('parquet').save('/tmp/bbergua/tmp/vf_pre_all-'+month)

In [117]:
# Ask HDFS for its namenode hosts and build one hdfs:// URI prefix per host.
# Example shell output:
#   $ hdfs getconf -confKey fs.defaultFS
#   hdfs://nameservice1
#   $ hdfs getconf -namenodes
#   vgddp350hr.dc.sedc.internal.vodafone.com vgddp351hr.dc.sedc.internal.vodafone.com
out = subprocess.check_output('hdfs getconf -namenodes', shell=True)
# split() with no argument tolerates repeated spaces, tabs and newlines, so
# no empty or newline-tainted host names end up in the list.
nodes = ['hdfs://'+n for n in out.split()]

In [118]:
# One candidate URI per namenode for the exported parquet directory
_vf_pre_export_suffix = '/tmp/bbergua/tmp/vf_pre_all-' + month
path_vf_pre_all = [n + _vf_pre_export_suffix for n in nodes]
path_vf_pre_all

['hdfs://vgddp350hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pre_all-201712',
 'hdfs://vgddp351hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pre_all-201712']

In [119]:
i=0
from h2o.backend.connection import H2OServerError
while i < len(nodes):
    try:
        vf_pre_all_df_h2o = h2o.import_file(path=path_vf_pre_all[i])
        print vf_pre_all_df_h2o.nrows
        break
    except H2OServerError as e:
        print 'ERROR in', path_vf_pre_all[i]
        i=i+1
        #print e
#subprocess.call('hdfs dfs -rm -r /tmp/bbergua/tmp/vf_pre_all-'+month, shell=True)

ERROR in hdfs://vgddp350hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pre_all-201712
Parse progress: |█████████████████████████████████████████████████████████| 100%
2730651


In [120]:
# List the keys of the objects currently held by H2O
h2o.ls()

Unnamed: 0,key
0,AutoML_20180409_095222
1,AutoML_20180409_103218
2,AutoML_Feedback_AutoML_20180409_095222
3,AutoML_Feedback_AutoML_20180409_103218
4,AutoML_Leaderboard_automl_py_48_sid_88f9
5,AutoML_Leaderboard_automl_py_8_sid_88f9
6,DRF_0_AutoML_20180409_095222
7,DRF_0_AutoML_20180409_095222_cv_1
8,DRF_0_AutoML_20180409_095222_cv_2
9,DRF_0_AutoML_20180409_095222_cv_3


In [124]:
# Columns of the imported frame that must be numeric.
# Each one is converted with asnumeric(), in the order listed.
_numeric_columns = [
    'AC_fx_1llamada',
    'AC_fx_activacion',
    'AC_fx_ultima_recarga',
    'AC_x_fecha_nacimiento',
    'AC_fecha_beneficio_activo',
    'AC_num_prepago',
    'AC_num_pospago',
    'AC_num_total',
    'AC_min_llam_ultmes',
    'AC_num_sms_ultmes',
    'AC_ult3meses_total',
    'AC_media_ult3meses',
    'AC_diasdesdeultrecarga',
    'AC_numrecargasult3meses_total',
]
for _numeric_column in _numeric_columns:
    vf_pre_all_df_h2o[_numeric_column] = vf_pre_all_df_h2o[_numeric_column].asnumeric()

In [125]:
# Summarise the imported frame: row/column counts and per-column type,
# mins, mean, maxs, sigma, zeros and missing counts
vf_pre_all_df_h2o.describe()

Rows:2730651
Cols:103




Unnamed: 0,msisdn,partitioned_month,AC_fecha_ejecucion,nif,AC_fx_1llamada,AC_fx_activacion,AC_codigo_plan_precios,AC_fx_ultima_recarga,AC_estado_servicio,AC_tipo_documento_cliente,AC_tipo_documento_comprador,AC_nacionalidad,AC_x_fecha_nacimiento,AC_tac_fac,AC_num_prepago,AC_num_pospago,AC_num_total,AC_min_llam_ultmes,AC_num_sms_ultmes,AC_ult3meses_total,AC_media_ult3meses,AC_diasdesdeultrecarga,AC_numrecargasult3meses_total,AC_codigo_postal,AC_cod_golden,AC_cobertura_4g,AC_lortad,AC_deuda,AC_flag_huella_ono,AC_flag_4g_aperturas,AC_flag_4g_nodos,AC_flag_huella_vf,AC_flag_huella_neba,AC_flag_huella_euskaltel,AC_flag_cobertura_adsl,AC_flag_beneficio_activo,AC_fecha_beneficio_activo,CCC_Pagar_menos,CCC_Incidencia_Provision_Neba,CCC_Incidencia_Provision_Fibra,CCC_Incidencia_Provision_DSL,CCC_Incidencia_Tecnica,CCC_Incidencia_SGI,CCC_Incidencia_Resto,CCC_Incidencia_Provision_Movil,CCC_Resultado_No_Aplica,CCC_Resultado_Informacion,CCC_Resultado_Solucionado,CCC_Resultado_Retenido,CCC_Resultado_No_Retenido,CCC_Resultado_Escalo,CCC_Resultado_Envio_tecnico,CCC_Resultado_Transferencia,CCC_Resultado_Abono,CCC_Resultado_Bajas,CCC_Resultado_Reclamacion,CCC_Desactivacion_BA_Movil_TV,CCC_Desactivacion_TV,CCC_Desactivacion_Movil,CCC_Desactivacion_Total,CCC_Desactivacion_NET,CCC_Desactivacion_Fijo,CCC_Desactivacion_USB,CCC_Desactivacion_Resto,CCC_Ofrecimiento,CCC_Transferencia,CCC_Cobro,CCC_Precios,CCC_Portabilidad_Inversa,CCC_Portabilidad,CCC_Informacion,CCC_Cierre,CCC_Alta,CCC_Factura,CCC_Averia_DSL,CCC_Averia_Fibra,CCC_Averia_TV,CCC_Averia_Resto,CCC_Averia_Neba,CCC_Averia_Modem_Router,CCC_Baja,CCC_Consulta_Tecnica_TV,CCC_Consulta_Tecnica_Fibra,CCC_Consulta_Tecnica_Neba,CCC_Consulta_Tecnica_DSL,CCC_Consulta_Tecnica_Movil,CCC_Consulta_Tecnica_Modem_Router,CCC_Consulta_Tecnica_Resto,CCC_Consulta_Ficha,CCC_Consulta_Resto,CCC_Productos_Voz,CCC_Productos_Datos,CCC_Productos_Resto,CCC_Provision_Neba,CCC_Provision_Fibra,CCC_Provision_DSL,CCC_Provision_Resto,CCC_Provision_Movil,TNPS01,TNPS,
TNPS2DET,TNPS2PRO,TNPS4
type,string,enum,enum,string,int,int,enum,int,enum,enum,enum,enum,int,enum,int,int,int,int,int,int,int,int,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,enum,enum,enum
mins,,,,,17530101.0,17530101.0,,17530101.0,,,,,0.0,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
mean,,,,,19965682.4616,20141385.6089,,19793635.6881,,,,,41.0638433839,,1.52362568486,0.263266891302,1.78689257617,13220.6039578,85.9490509772,1994.65055549,1689.22380304,77.4486904405,21.6841830025,,,,,,,,,,,,,,4.40123509009,1.83106519288e-06,1.53809476202e-05,1.53809476202e-05,1.72120128131e-05,1.06201781187e-05,0.0,1.94092910445e-05,1.83106519288e-06,0.000507205058427,0.0191166868267,0.0103770126611,0.000408327538012,0.000219361610107,0.000314577000137,3.40578125875e-05,0.000819584780333,2.12403562374e-05,0.0,0.000166260719513,4.35793515905e-05,2.27052083917e-05,0.000631717491543,0.0,9.99761595312e-05,3.69875168962e-05,1.79444388902e-05,1.06201781187e-05,0.000116821959306,0.0112313876801,0.000454104167834,2.19727823145e-06,1.4648521543e-06,0.00261585973455,0.0138142882412,0.00147693718458,0.0024008926809,0.00102136816459,0.000204713088564,0.00011828681146,4.61428428605e-05,0.000195191549561,4.65090558991e-05,5.12698254006e-06,0.000262940961697,1.2085030273e-05,3.69875168962e-05,6.59183469436e-06,5.0903612362e-05,0.00214490976694,4.43117776677e-05,2.34376344689e-05,1.09863911573e-06,1.09863911573e-06,3.66213038576e-06,1.79444388902e-05,0.0193734021667,3.55226647419e-05,4.68752689377e-05,4.68752689377e-05,1.09863911573e-05,3.66213038576e-06,7.75980822197,,,,
maxs,,,,,20180101.0,20171231.0,,20180101.0,,,,,165.0,,355.0,49.0,355.0,63024.0,2366.0,6058.0,6058.0,182.0,123.0,,,,,,,,,,,,,,29.0,2.0,2.0,3.0,3.0,3.0,0.0,7.0,1.0,11.0,96.0,40.0,5.0,3.0,13.0,2.0,34.0,2.0,0.0,3.0,2.0,2.0,5.0,0.0,3.0,2.0,3.0,1.0,8.0,55.0,6.0,1.0,1.0,43.0,51.0,15.0,40.0,15.0,8.0,8.0,2.0,3.0,6.0,2.0,22.0,2.0,7.0,1.0,3.0,19.0,2.0,4.0,1.0,1.0,1.0,2.0,89.0,7.0,4.0,4.0,8.0,1.0,10.0,,,,
sigma,,,,,651905.44065,42713.7468876,,924304.150861,,,,,48.5683181112,,2.44922125841,0.868713741846,2.61501043359,18596.6215797,342.309884557,1983.28721515,2063.42698757,57.4051970884,27.6754878495,,,,,,,,,,,,,,8.13502094409,0.00160108989629,0.00401411796347,0.00436381122485,0.00480324199118,0.00368100184749,0.0,0.0061416268257,0.00135316758409,0.0318745085789,0.238193914914,0.172308882545,0.0239518122522,0.01569755859,0.0212471617681,0.00589822782455,0.0387685383732,0.0046874661821,0.0,0.0133123826922,0.00676570376424,0.00484119071359,0.0289458612784,0.0,0.0110093853109,0.00620088824728,0.0045687885528,0.00325884476766,0.0128794766917,0.199764152016,0.0258128216904,0.00148232054836,0.00121031010281,0.0727856790978,0.159445436105,0.0499209031862,0.0669189793602,0.0411957192976,0.0199690820418,0.0130769858863,0.00760654666205,0.0148095776673,0.00836328601031,0.00242061654895,0.0240225482033,0.00368099732972,0.00728694325234,0.00256744886143,0.0076305506537,0.0604260329936,0.00692617947268,0.00586715339068,0.00104815948741,0.00104815948741,0.00191366619756,0.00440556195615,0.224912052611,0.00915759546417,0.00816385938987,0.0080736452391,0.00716028794641,0.00191366619756,3.33347172622,,,,
zeros,0,,,0,0,0,,0,,,,,1515426,,0,2406875,0,115274,115274,560848,560848,560848,560848,,,,,,,,,,,,,,1924984,2730647,2730610,2730613,2730611,2730625,2730651,2730608,2730646,2729634,2694283,2712021,2729711,2730087,2729882,2730559,2728655,2730594,2730651,2730210,2730535,2730590,2729153,2730651,2730403,2730552,2730605,2730622,2730361,2711362,2729609,2730645,2730647,2724838,2699871,2727388,2725201,2728373,2730229,2730372,2730541,2730146,2730543,2730638,2730071,2730620,2730557,2730633,2730521,2725959,2730535,2730596,2730648,2730648,2730641,2730604,2693195,2730581,2730544,2730540,2730638,2730641,4890,,,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2681636,2681636,2681636,2681636,2681636
0,600006000,201712,20171231,06754428C,20170516.0,20170516.0,PPIB7,20171230.0,AC,NIF,NIF,España,0.0,35173706,2.0,0.0,2.0,7091.0,1.0,2653.0,5822.0,106.0,50.0,83231,342110439,1,0,0,0,0,0,0,0,0,,1,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
1,600009265,201712,20171231,10577906N,20170620.0,20170620.0,PPFCL,20170929.0,AC,,NIF,España,0.0,35236607,1.0,0.0,1.0,1.0,1.0,1.0,1.0,178.0,1.0,,868018853,0,0,1,0,0,0,0,0,0,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
2,600013939,201712,20171231,77470442T,20131206.0,20131206.0,PPIB8,20171223.0,AC,NIF,NIF,España,0.0,35391407,4.0,0.0,4.0,23301.0,1.0,5490.0,2666.0,2.0,27.0,46382,652202778,0,0,1,0,0,0,0,0,0,D,1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,


In [180]:
# Remove the TNPS columns from the scoring frame, then score all rows
# with the trained AutoML object.
_tnps_columns = ['TNPS01', 'TNPS2DET', 'TNPS2PRO', 'TNPS', 'TNPS4']
vf_pre_all_df_h2o_predict = vf_pre_all_df_h2o.drop(_tnps_columns)
vf_pre_all_preds = aml.predict(vf_pre_all_df_h2o_predict)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [181]:
#vf_pre_all_preds.head()

Append MSISDN, NIF and actual labels to predictions

In [182]:
# Put the identifier columns and the TNPS label columns next to the
# predictions, then display the combined frame.
_id_and_label_columns = ['msisdn', 'nif', 'TNPS01', label_preds, label_model]
_ids_and_labels = vf_pre_all_df_h2o[_id_and_label_columns]
vf_pre_all_preds_msisdn = vf_pre_all_preds.cbind(_ids_and_labels)
vf_pre_all_preds_msisdn

predict,DETRACTOR,NON DETRACTOR,msisdn,nif,TNPS01,TNPS,TNPS2DET
NON DETRACTOR,0.429564,0.570436,600006000,06754428C,,,
NON DETRACTOR,0.59915,0.40085,600009265,10577906N,,,
NON DETRACTOR,0.326921,0.673079,600013939,77470442T,,,
NON DETRACTOR,0.570642,0.429358,600014045,84144071V,,,
NON DETRACTOR,0.438054,0.561946,600017215,L7403214K,,,
NON DETRACTOR,0.418074,0.581926,600036363,11272337N,,,
NON DETRACTOR,0.538727,0.461273,600037750,0709620,,,
NON DETRACTOR,0.544664,0.455336,600055070,M8898863I,,,
NON DETRACTOR,0.529528,0.470472,600063018,867234945,,,
NON DETRACTOR,0.483511,0.516489,600069074,K9892729A,,,




Count number of rows and proportion of predictions

In [183]:
# Number of rows per predicted class, plus each class's share (in %) of all rows.
vf_pre_all_preds_nrows = vf_pre_all_preds_msisdn.nrows
vf_pre_all_preds_groupby = (
    vf_pre_all_preds_msisdn
    .group_by(by='predict')
    .count()
    .get_frame()
)
_rows_per_group = vf_pre_all_preds_groupby['nrow']
proportion = 100*_rows_per_group/vf_pre_all_preds_nrows
# The return value is discarded: the rename is applied to `proportion` itself
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

predict,nrow,proportion
DETRACTOR,2353.0,0.0861699
NON DETRACTOR,2728300.0,99.9138




Because the predicted classes are highly unbalanced (almost every row is predicted as NON DETRACTOR), let's create a new prediction column (**predict2**) by applying a fixed threshold to the NON DETRACTOR score column

In [184]:
# Threshold applied to the "NON DETRACTOR" score; currently fixed at 0.5
# (the commented line below is an alternative that reads it from test_metrics).
#th_max_acc = test_metrics.accuracy()[0][0]
th_max_acc = 0.5
# Rows at or above the threshold are labelled NON DETRACTOR, rows below it DETRACTOR
vf_pre_all_preds_msisdn[vf_pre_all_preds_msisdn["NON DETRACTOR"] >= th_max_acc, "predict2"] = "NON DETRACTOR"
vf_pre_all_preds_msisdn[vf_pre_all_preds_msisdn["NON DETRACTOR"] < th_max_acc, "predict2"] = "DETRACTOR"
# Convert the new column to a categorical (enum) column
vf_pre_all_preds_msisdn['predict2'] = vf_pre_all_preds_msisdn['predict2'].asfactor()
vf_pre_all_preds_msisdn

predict,DETRACTOR,NON DETRACTOR,msisdn,nif,TNPS01,TNPS,TNPS2DET,predict2
NON DETRACTOR,0.429564,0.570436,600006000,06754428C,,,,NON DETRACTOR
NON DETRACTOR,0.59915,0.40085,600009265,10577906N,,,,DETRACTOR
NON DETRACTOR,0.326921,0.673079,600013939,77470442T,,,,NON DETRACTOR
NON DETRACTOR,0.570642,0.429358,600014045,84144071V,,,,DETRACTOR
NON DETRACTOR,0.438054,0.561946,600017215,L7403214K,,,,NON DETRACTOR
NON DETRACTOR,0.418074,0.581926,600036363,11272337N,,,,NON DETRACTOR
NON DETRACTOR,0.538727,0.461273,600037750,0709620,,,,DETRACTOR
NON DETRACTOR,0.544664,0.455336,600055070,M8898863I,,,,DETRACTOR
NON DETRACTOR,0.529528,0.470472,600063018,867234945,,,,DETRACTOR
NON DETRACTOR,0.483511,0.516489,600069074,K9892729A,,,,NON DETRACTOR




Compare the real label with the newly created **predict2** column

In [185]:
# Cross-tabulate the actual label (TNPS2DET) against the thresholded prediction (predict2).
# Percentages are relative to ALL scored rows, including those without an actual label.
vf_pre_all_preds_nrows = vf_pre_all_preds_msisdn.nrows
vf_pre_all_preds_groupby = vf_pre_all_preds_msisdn.group_by(by=['TNPS2DET', 'predict2']).count().get_frame()
proportion = 100*vf_pre_all_preds_groupby['nrow']/vf_pre_all_preds_nrows
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

TNPS2DET,predict2,nrow,proportion
,NON DETRACTOR,1538290.0,56.3342
,DETRACTOR,1143350.0,41.8708
DETRACTOR,DETRACTOR,3472.0,0.127149
DETRACTOR,NON DETRACTOR,8327.0,0.304946
NON DETRACTOR,DETRACTOR,7284.0,0.26675
NON DETRACTOR,NON DETRACTOR,29932.0,1.09615




Count number of rows and proportion of predictions using the new **predict2** column

In [186]:
# Row count and percentage per value of the thresholded prediction (predict2), at MSISDN level
vf_pre_all_preds_nrows = vf_pre_all_preds_msisdn.nrows
vf_pre_all_preds_groupby = vf_pre_all_preds_msisdn.group_by(by='predict2').count().get_frame()
proportion = 100*vf_pre_all_preds_groupby['nrow']/vf_pre_all_preds_nrows
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

predict2,nrow,proportion
DETRACTOR,1154100.0,42.2647
NON DETRACTOR,1576550.0,57.7353




Since Prepaid data is keyed by MSISDN (rather than by NIF), we need to group by NIF and aggregate the scores

In [415]:
# Collapse the MSISDN-level scores to one row per NIF, keeping the minimum
# "NON DETRACTOR" score, the maximum "DETRACTOR" score and the minimum TNPS01.
vf_pre_all_preds_msisdn['nif'] = vf_pre_all_preds_msisdn['nif'].asfactor()
vf_pre_all_preds_id = vf_pre_all_preds_msisdn.group_by(by='nif').min(['NON DETRACTOR', 'TNPS01']).max(['DETRACTOR']).get_frame()
# Discard the group of rows whose NIF is the empty string
vf_pre_all_preds_id = vf_pre_all_preds_id[vf_pre_all_preds_id['nif'] != '']

# Rename the aggregated columns BY NAME rather than by position: the order in which
# group_by() returns its aggregate columns is not guaranteed, so a positional
# set_names([...]) could silently swap the DETRACTOR / NON DETRACTOR / TNPS01 columns.
_agg_rename = {'min_TNPS01': 'TNPS01',
               'max_DETRACTOR': 'DETRACTOR',
               'min_NON DETRACTOR': 'NON DETRACTOR'}
_missing_aggs = [_col for _col in _agg_rename if _col not in vf_pre_all_preds_id.names]
if _missing_aggs:
    # Fail loudly instead of mislabelling score columns
    raise ValueError('Unexpected group_by output columns %s (missing %s)'
                     % (vf_pre_all_preds_id.names, _missing_aggs))
vf_pre_all_preds_id.set_names([_agg_rename.get(_col, _col) for _col in vf_pre_all_preds_id.names])
# Fix the column order that the rest of the notebook displays
vf_pre_all_preds_id = vf_pre_all_preds_id[['nif', 'TNPS01', 'DETRACTOR', 'NON DETRACTOR']]

vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] != None)].show()

nif,TNPS01,DETRACTOR,NON DETRACTOR
O62278511,1,0.325979,0.674021
00000398D,10,0.38746,0.61254
00001717R,10,0.437993,0.562007
00005606Y,8,0.485624,0.514376
00006283X,10,0.511998,0.488002
00008950P,8,0.555096,0.444904
00011821K,10,0.446366,0.553634
00014405D,10,0.506025,0.493975
00016419M,9,0.375972,0.624028
00017562E,8,0.346366,0.653634


In [391]:
vf_pre_all_preds_id.types

{u'DETRACTOR': u'real',
 u'NON DETRACTOR': u'real',
 u'TNPS01': u'int',
 u'nif': u'enum'}

In [406]:
#h2o.download_all_logs(dirname='/var/SP/data/home/bbergua/h2ologs', filename = 'autoh2o_log.zip')

Writing H2O logs to /var/SP/data/home/bbergua/h2ologs/autoh2o_log.zip


'/var/SP/data/home/bbergua/h2ologs/autoh2o_log.zip'

In [416]:
# Derive the categorical TNPS labels (TNPS4, TNPS, TNPS2DET, TNPS2PRO) from the numeric
# TNPS01 score of each NIF. The statements below are order-dependent: each group first
# assigns a default and later statements overwrite it for specific rows.

# Use -1 as a sentinel for NIFs without a TNPS01 score
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] == None), 'TNPS01'] = -1

# TNPS4: four-level label derived from the TNPS01 score (0-3, 4-6, 7-8, 9-10)
# H2O does not allow empty strings '' here, so we need to use 'NA' instead
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] <= -1), 'TNPS4'] = 'NA'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] >= 0) & (vf_pre_all_preds_id['TNPS01'] <= 3), 'TNPS4'] = 'HARD DETRACTOR'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] >= 4) & (vf_pre_all_preds_id['TNPS01'] <= 6), 'TNPS4'] = 'SOFT DETRACTOR'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] >= 7) & (vf_pre_all_preds_id['TNPS01'] <= 8), 'TNPS4'] = 'NEUTRAL'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] >= 9) & (vf_pre_all_preds_id['TNPS01'] <= 10), 'TNPS4'] = 'PROMOTER'

# TNPS: three-level label; hard and soft detractors are merged into DETRACTOR
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS4'] == 'NA'), 'TNPS'] = 'NA'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS4'] == 'HARD DETRACTOR') | (vf_pre_all_preds_id['TNPS4'] == 'SOFT DETRACTOR'), 'TNPS'] = 'DETRACTOR'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS4'] == 'NEUTRAL'), 'TNPS'] = 'NEUTRAL'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS4'] == 'PROMOTER'), 'TNPS'] = 'PROMOTER'

# TNPS2DET: binary DETRACTOR / NON DETRACTOR label; the 'NA' assignment comes last
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] != 'DETRACTOR'), 'TNPS2DET'] = 'NON DETRACTOR'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] == 'DETRACTOR'), 'TNPS2DET'] = 'DETRACTOR'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] == 'NA'), 'TNPS2DET'] = 'NA'

# TNPS2PRO: binary PROMOTER / NON PROMOTER label; the 'NA' assignment comes last
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] != 'PROMOTER'), 'TNPS2PRO'] = 'NON PROMOTER'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] == 'PROMOTER'), 'TNPS2PRO'] = 'PROMOTER'
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS'] == 'NA'), 'TNPS2PRO'] = 'NA'

# Convert the four derived label columns to categorical (enum) columns
vf_pre_all_preds_id['TNPS2DET'] = vf_pre_all_preds_id['TNPS2DET'].asfactor()
vf_pre_all_preds_id['TNPS2PRO'] = vf_pre_all_preds_id['TNPS2PRO'].asfactor()
vf_pre_all_preds_id['TNPS'] = vf_pre_all_preds_id['TNPS'].asfactor()
vf_pre_all_preds_id['TNPS4'] = vf_pre_all_preds_id['TNPS4'].asfactor()

# Show the NIFs that do have a TNPS01 score
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] != -1)].show()

nif,TNPS01,DETRACTOR,NON DETRACTOR,TNPS4,TNPS,TNPS2DET,TNPS2PRO
O62278511,1,0.325979,0.674021,HARD DETRACTOR,DETRACTOR,DETRACTOR,NON PROMOTER
00000398D,10,0.38746,0.61254,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00001717R,10,0.437993,0.562007,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00005606Y,8,0.485624,0.514376,NEUTRAL,NEUTRAL,NON DETRACTOR,NON PROMOTER
00006283X,10,0.511998,0.488002,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00008950P,8,0.555096,0.444904,NEUTRAL,NEUTRAL,NON DETRACTOR,NON PROMOTER
00011821K,10,0.446366,0.553634,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00014405D,10,0.506025,0.493975,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00016419M,9,0.375972,0.624028,PROMOTER,PROMOTER,NON DETRACTOR,PROMOTER
00017562E,8,0.346366,0.653634,NEUTRAL,NEUTRAL,NON DETRACTOR,NON PROMOTER


In [417]:
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] == -1)].show()

nif,TNPS01,DETRACTOR,NON DETRACTOR,TNPS4,TNPS,TNPS2DET,TNPS2PRO
G0475215P,-1,0.416053,0.583947,,,,
166867538,-1,0.521175,0.478825,,,,
1855209326,-1,0.444169,0.555831,,,,
20497392A,-1,0.51382,0.48618,,,,
30296569O,-1,0.396315,0.603685,,,,
375636270,-1,0.427803,0.572197,,,,
588673,-1,0.552913,0.447087,,,,
60846858O,-1,0.414228,0.585772,,,,
65562000E,-1,0.472199,0.527801,,,,
B1577102-T,-1,0.448444,0.551556,,,,


In [418]:
vf_pre_all_preds_id.types

{u'DETRACTOR': u'real',
 u'NON DETRACTOR': u'real',
 u'TNPS': u'enum',
 u'TNPS01': u'int',
 u'TNPS2DET': u'enum',
 u'TNPS2PRO': u'enum',
 u'TNPS4': u'enum',
 u'nif': u'enum'}

In [419]:
# Alternative aggregation (mean of the scores per NIF), kept for reference but disabled:
#vf_pre_all_preds_msisdn['nif'] = vf_pre_all_preds_msisdn['nif'].asfactor()
#vf_pre_all_preds_id = vf_pre_all_preds_msisdn.group_by(by='nif').mean(['DETRACTOR', 'NON DETRACTOR', 'TNPS01']).min(['TNPS01']).max(['TNPS01']).get_frame()
#vf_pre_all_preds_id.set_names(['nif', 'TNPS01', 'DETRACTOR', 'NON DETRACTOR'])
# NIF-level prediction: the class whose aggregated score is larger (ties go to NON DETRACTOR)
vf_pre_all_preds_id[vf_pre_all_preds_id["NON DETRACTOR"] >= vf_pre_all_preds_id["DETRACTOR"], "predict"] = "NON DETRACTOR"
vf_pre_all_preds_id[vf_pre_all_preds_id["NON DETRACTOR"] < vf_pre_all_preds_id["DETRACTOR"], "predict"] = "DETRACTOR"
vf_pre_all_preds_id['predict'] = vf_pre_all_preds_id['predict'].asfactor()
#vf_pre_all_preds_id['suma'] = vf_pre_all_preds_id.sum(False, 1, ['DETRACTOR', 'NON DETRACTOR'])
#vf_pre_all_preds_id
# NOTE(review): the cell numbered In [416] replaces missing TNPS01 values with -1, so this
# `!= None` filter presumably no longer excludes anything (the rows displayed below all
# have TNPS01 == -1) — confirm whether `!= -1` was intended.
vf_pre_all_preds_id[(vf_pre_all_preds_id['TNPS01'] != None)]

nif,TNPS01,DETRACTOR,NON DETRACTOR,TNPS4,TNPS,TNPS2DET,TNPS2PRO,predict
G0475215P,-1,0.416053,0.583947,,,,,NON DETRACTOR
166867538,-1,0.521175,0.478825,,,,,DETRACTOR
1855209326,-1,0.444169,0.555831,,,,,NON DETRACTOR
20497392A,-1,0.51382,0.48618,,,,,DETRACTOR
30296569O,-1,0.396315,0.603685,,,,,NON DETRACTOR
375636270,-1,0.427803,0.572197,,,,,NON DETRACTOR
588673,-1,0.552913,0.447087,,,,,DETRACTOR
60846858O,-1,0.414228,0.585772,,,,,NON DETRACTOR
65562000E,-1,0.472199,0.527801,,,,,NON DETRACTOR
B1577102-T,-1,0.448444,0.551556,,,,,NON DETRACTOR




Count number of rows and proportion of predictions

In [420]:
# Row count and percentage per value of `predict`, now at NIF level.
# vf_pre_all_preds_nrows is reassigned to the number of NIF rows and is reused by later cells.
vf_pre_all_preds_nrows = vf_pre_all_preds_id.nrows
vf_pre_all_preds_groupby = vf_pre_all_preds_id.group_by(by='predict').count().get_frame()
proportion = 100*vf_pre_all_preds_groupby['nrow']/vf_pre_all_preds_nrows
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

predict,nrow,proportion
DETRACTOR,1022630.0,45.1091
NON DETRACTOR,1244390.0,54.8909




As was done at MSISDN level, let's create a new prediction column (**predict2**) by applying a threshold to the `NON DETRACTOR` score column. (Note that at NIF level the proportions shown above, 45.1% / 54.9%, are not highly unbalanced.)

In [421]:
# Threshold applied to the NIF-level "NON DETRACTOR" score; currently fixed at 0.5
# (the commented line below is an alternative that reads it from test_metrics).
#th_max_acc = test_metrics.accuracy()[0][0]
th_max_acc = 0.5
# Rows at or above the threshold are labelled NON DETRACTOR, rows below it DETRACTOR
vf_pre_all_preds_id[vf_pre_all_preds_id["NON DETRACTOR"] >= th_max_acc, "predict2"] = "NON DETRACTOR"
vf_pre_all_preds_id[vf_pre_all_preds_id["NON DETRACTOR"] < th_max_acc, "predict2"] = "DETRACTOR"
# Convert the new column to a categorical (enum) column
vf_pre_all_preds_id['predict2'] = vf_pre_all_preds_id['predict2'].asfactor()
vf_pre_all_preds_id

nif,TNPS01,DETRACTOR,NON DETRACTOR,TNPS4,TNPS,TNPS2DET,TNPS2PRO,predict,predict2
G0475215P,-1,0.416053,0.583947,,,,,NON DETRACTOR,NON DETRACTOR
166867538,-1,0.521175,0.478825,,,,,DETRACTOR,DETRACTOR
1855209326,-1,0.444169,0.555831,,,,,NON DETRACTOR,NON DETRACTOR
20497392A,-1,0.51382,0.48618,,,,,DETRACTOR,DETRACTOR
30296569O,-1,0.396315,0.603685,,,,,NON DETRACTOR,NON DETRACTOR
375636270,-1,0.427803,0.572197,,,,,NON DETRACTOR,NON DETRACTOR
588673,-1,0.552913,0.447087,,,,,DETRACTOR,DETRACTOR
60846858O,-1,0.414228,0.585772,,,,,NON DETRACTOR,NON DETRACTOR
65562000E,-1,0.472199,0.527801,,,,,NON DETRACTOR,NON DETRACTOR
B1577102-T,-1,0.448444,0.551556,,,,,NON DETRACTOR,NON DETRACTOR




In [435]:
vf_pre_all_preds_id[vf_pre_all_preds_id['predict'] != vf_pre_all_preds_id['predict2']].show()

Count number of rows and proportion of predictions using the new **predict2** column

In [422]:
# Row count and percentage per value of the thresholded prediction (predict2), at NIF level
vf_pre_all_preds_nrows = vf_pre_all_preds_id.nrows
vf_pre_all_preds_groupby = vf_pre_all_preds_id.group_by(by='predict2').count().get_frame()
proportion = 100*vf_pre_all_preds_groupby['nrow']/vf_pre_all_preds_nrows
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

predict2,nrow,proportion
DETRACTOR,1022630.0,45.1091
NON DETRACTOR,1244390.0,54.8909




What we really need to provide is a prediction of the type ['PROMOTER', 'NEUTRAL', 'DETRACTOR'], so we calculate cut points in the predictions that preserve the original proportions

In [423]:
print 'nrows:', vf_pre_all_preds_nrows
#print 'proportions', vf_pre_proportions

if 'HARD DETRACTOR' in vf_pre_proportions.keys() and 'SOFT DETRACTOR' in vf_pre_proportions.keys():
    vf_pre_proportions['DETRACTOR'] = vf_pre_proportions['HARD DETRACTOR'] + vf_pre_proportions['SOFT DETRACTOR']
    del vf_pre_proportions['HARD DETRACTOR']
    del vf_pre_proportions['SOFT DETRACTOR']

vf_pre_counts = {}
for k in vf_pre_proportions.keys():
    vf_pre_counts[k] = int(vf_pre_all_preds_nrows*vf_pre_proportions[k]/100)
diff = vf_pre_all_preds_nrows - sum(vf_pre_counts.values())
min_key = min(vf_pre_counts, key=vf_pre_counts.get) # Get the key with minimum value
vf_pre_counts[min_key] = vf_pre_counts[min_key] + diff # Assign rest to key with minimum num rows
print 'counts:', vf_pre_counts

ini = 0
end = 0
vf_pre_ranges = {}
for k in ['DETRACTOR', 'NEUTRAL', 'PROMOTER']: # It is necessary to force the order
    if end > 0:
        ini = end + 1
    end = ini + vf_pre_counts[k] - 1
    vf_pre_ranges[k] = [ini, end]
    #print k, ini, end
print 'ranges:', vf_pre_ranges

nrows: 2267021
counts: {u'NEUTRAL': 328896, u'PROMOTER': 1392403, 'DETRACTOR': 545722}
ranges: {'NEUTRAL': [545722, 874617], 'PROMOTER': [874618, 2267020], 'DETRACTOR': [0, 545721]}


Using the cut points calculated above, create a new **predict3** column with final predictions

In [424]:
for k in vf_pre_ranges.keys():
    r = range(vf_pre_ranges[k][0], vf_pre_ranges[k][1]+1, 1)
    print k, r[0], r[-1]
    vf_pre_all_preds_id[r, 'predict3'] = k
    #tmp = vf_pre_all_preds_id[r, :]
    #print tmp.head()
vf_pre_all_preds_id['predict3'] = vf_pre_all_preds_id['predict3'].asfactor()
vf_pre_all_preds_id

NEUTRAL 545722 874617
PROMOTER 874618 2267020
DETRACTOR 0 545721


nif,TNPS01,DETRACTOR,NON DETRACTOR,TNPS4,TNPS,TNPS2DET,TNPS2PRO,predict,predict2,predict3
G0475215P,-1,0.416053,0.583947,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
166867538,-1,0.521175,0.478825,,,,,DETRACTOR,DETRACTOR,DETRACTOR
1855209326,-1,0.444169,0.555831,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
20497392A,-1,0.51382,0.48618,,,,,DETRACTOR,DETRACTOR,DETRACTOR
30296569O,-1,0.396315,0.603685,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
375636270,-1,0.427803,0.572197,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
588673,-1,0.552913,0.447087,,,,,DETRACTOR,DETRACTOR,DETRACTOR
60846858O,-1,0.414228,0.585772,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
65562000E,-1,0.472199,0.527801,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR
B1577102-T,-1,0.448444,0.551556,,,,,NON DETRACTOR,NON DETRACTOR,DETRACTOR




Count number of rows and proportion of predictions using the final **predict3** column, that must match original proportions

In [425]:
# Row count and percentage per value of the final prediction (predict3), at NIF level
vf_pre_all_preds_nrows = vf_pre_all_preds_id.nrows
vf_pre_all_preds_groupby = vf_pre_all_preds_id.group_by(by='predict3').count().get_frame()
proportion = 100*vf_pre_all_preds_groupby['nrow']/vf_pre_all_preds_nrows
proportion.set_names(['proportion'])
vf_pre_all_preds_groupby.cbind(proportion)

predict3,nrow,proportion
DETRACTOR,545722.0,24.0722
NEUTRAL,328896.0,14.5078
PROMOTER,1392400.0,61.4199




Convert predictions to Pandas, to bring predictions back to Spark again

In [426]:
# Pull the NIF, the actual-label column (`label_preds`, defined outside this cell) and the
# final prediction out of H2O into a pandas DataFrame, timing the conversion.
print time.ctime()
start = time.time()
print 'Converting predictions to Pandas'
vf_pre_all_preds_id_pd = vf_pre_all_preds_id[['nif', label_preds, 'predict3']].as_data_frame()
#vf_pre_all_preds_id_pandas
end = time.time()
print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

Tue Apr 10 10:31:44 2018
Converting predictions to Pandas
Process took: 0.11 minutes


Create a Spark DataFrame from the Pandas object created above

In [430]:
# Create the PySpark DataFrame schema: three nullable string columns.
# The pandas frame was built with the columns ['nif', <label_preds>, 'predict3'], so the
# schema below names the actual-label column 'label_preds' and the final prediction 'predict'.
# NOTE(review): presumably the schema is matched to the pandas columns by position — confirm.
pd_schema = StructType([StructField('nif',        StringType(),True),
                        StructField('label_preds',StringType(),True),
                        StructField('predict',    StringType(),True)])

# Create the Spark DataFrame from the pandas object
df_vf_pre_preds = sqlContext.createDataFrame(vf_pre_all_preds_id_pd, pd_schema)
# Important to order columns in the same order as the target database
df_vf_pre_preds = df_vf_pre_preds.select('nif', 'label_preds', 'predict')

Count number of rows of actual label and final predictions

In [431]:
print df_vf_pre_preds.count()
#df_vf_pre_preds.groupby('label_preds').count().show()
#print df_vf_pre_preds.filter('label_preds == "NaN"').count()
df_vf_pre_preds.groupby('predict').count().show()

2267021
+---------+-------+
|  predict|  count|
+---------+-------+
|  NEUTRAL| 328896|
| PROMOTER|1392403|
|DETRACTOR| 545722|
+---------+-------+



For clients that have an actual label, use the actual label. Otherwise, use the prediction.

In [432]:
# Final label per NIF: keep the actual label when there is one, otherwise the model prediction.
# Rows without an actual label carry the string "NaN" in 'label_preds'.
# NOTE(review): "NaN" is presumably how missing values of the pandas frame end up in this
# string column — confirm, since the override depends on that exact spelling.
vf_pre_preds = df_vf_pre_preds.withColumn('predict', 
                                          when(df_vf_pre_preds['label_preds'] != "NaN",
                                               df_vf_pre_preds['label_preds'])\
                                         .otherwise(df_vf_pre_preds['predict']))\
                              .drop('label_preds')
#print vf_pre_preds.count()
#vf_pre_preds.groupby('predict').count().show()

# Row count and percentage (rounded to 2 decimals for display) per final label
vf_pre_preds_count = vf_pre_preds.count()
print vf_pre_preds_count
vf_pre_preds_groupby = vf_pre_preds.groupby('predict').count()
vf_pre_preds_proportions = vf_pre_preds_groupby.withColumn('proportion', 100*vf_pre_preds_groupby['count']/vf_pre_preds_count).cache()
vf_pre_preds_proportions.withColumn('proportion', bround(vf_pre_preds_proportions['proportion'], 2)).show()

2267021
+---------+-------+----------+
|  predict|  count|proportion|
+---------+-------+----------+
|  NEUTRAL| 328991|     14.51|
| PROMOTER|1392814|     61.44|
|DETRACTOR| 545216|     24.05|
+---------+-------+----------+



In [434]:
print vf_pre_preds.select('nif').distinct().count()

2267021


Write predictions to HDFS

In [433]:
vf_pre_preds.write.mode('overwrite').format('parquet').save('/tmp/bbergua/tmp/vf_pre_preds-'+month)

In [75]:
#vf_pre_preds = spark.read.parquet('/tmp/bbergua/tmp/vf_pre_preds-'+month)

## Oracle Non-Prepaid segments

### Load Oracle Postpaid

In [436]:
##print 'ACC CAR rows:', vf_pos.filter('partitioned_month == "%s"' % month).count()
#print 'Accenture CAR distinct MSISDN:', spark.table('udf_es.par_explic_lin_6m_' + month).select('msisdn').distinct().count()
#print 'Accenture CAR distinct NIF:   ', spark.table('udf_es.par_explic_cli_6m_' + month).select('nif').distinct().count()
##print 'AC_FINAL_POS rows:', spark.table('raw_es.vf_pos_ac_final').filter('partitioned_month == "%s"' % month).count()
#print 'AC_FINAL_POSP distinct MSISDN:', spark.table('raw_es.vf_pos_ac_final').filter('partitioned_month == "%s"' % month).select('x_id_red').distinct().count()
#print 'AC_FINAL_POSP distinct NIF:   ', spark.table('raw_es.vf_pos_ac_final').filter('partitioned_month == "%s"' % month).select('x_num_ident').distinct().count()

In [437]:
#spark.table('raw_es.vf_pos_ac_final').printSchema()print time.ctime()

In [16]:
if '/var/SP/data/home/bbergua/fy17.capsule/convergence/src/main/python' not in sys.path:
    print 'Adding [...]/convergence/src/main/python to sys.path'
    sys.path.append('/var/SP/data/home/bbergua/fy17.capsule/convergence/src/main/python')

from configuration import Configuration
from DP_prepare_input_cvm_pospago import DPPrepareInputCvmPospago

#del ac_postpaid
if not 'ac_postpaid' in globals():
    print time.ctime()
    start = time.time()
    
    conf = Configuration(sc)
    ac_postpaid = DPPrepareInputCvmPospago(conf, month)
    ac_postpaid.calculate_mobile_only_and_convergent_by_id()
    
    end = time.time()
    print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

Tue Apr 10 14:19:55 2018
SparkContext already loaded. master is yarn
Removing spurious header in raw_es.vf_pos_ac_final ...
Removing corrupt rows from raw_es.vf_pos_ac_final ...
Finished generate_features()
Making 194490 mobile-only clients convergent
Making 194490 non-convergent clients convergent
Finished set_convergent_ids()
Finished calculate_mobile_only_and_convergent_by_id()
Process took: 9.18 minutes


In [440]:
#print ac_postpaid.dataById.filter('partitioned_month == "%s"' % month).count()
#print ac_postpaid.dataById.filter('partitioned_month == "%s"' % month).select('x_num_ident').distinct().count()

3290364
3285656


In [443]:
#print ac_postpaid.dataById.filter('partitioned_month == "%s"' % month).distinct().count()

3285656


In [441]:
#ac_postpaid.dataById.filter('partitioned_month == "%s"' % month).groupby('x_num_ident').count().sort('count', ascending=False).show()

+-----------+-----+
|x_num_ident|count|
+-----------+-----+
|  14788947G|    4|
|  94841368R|    3|
|  12325272Y|    3|
|  16552274I|    3|
|  02438499Q|    3|
|  19294348U|    3|
|  N9131075Z|    3|
|  34702948D|    3|
|  43970668J|    3|
|  55612284L|    3|
|  13160056H|    3|
|  05123249T|    3|
|  14723519U|    3|
|  50035787D|    3|
|  72627840X|    3|
|  Z8363564W|    3|
|  I3160786X|    3|
|  21963355X|    3|
|  50300247O|    3|
|  41109451F|    3|
+-----------+-----+
only showing top 20 rows



In [442]:
#ac_postpaid.dataById.filter('x_num_ident == "14788947G"').show()

+-----------+-----------------+------+--------+------+------------+-------+------+-----------------+----------------+---------------+--------+------------+-----------+----------------+------+---------+------------+----------+--------------+------------+--------------+-------------------+-------------+-------------+---------+---------+--------+--------------+--------------+-------------+----------+-----------+------------+-----------+--------------+------------+------------+------------------+----------------------+---------------------+------------------+--------------+-----------------+-------------+----------------+------------------+-----------------+----------+-----------------+----------------+----------------+---------------+-----------------+-------------------------+---------------+-----------+------------------------+---------------+------------------+----------------------+---------------------+-----------------------+---------------+---------------+--------------------+-----

### Oracle Non-Prepaid segment - Data Preparation

In [17]:
def prepare_dataset_postpaid():
    # Load Oracle AC Final Postpaid
    #del vf_pos
    #if not 'vf_pos' in globals():
    #print 'Load Oracle Postpaid CAR from Hive for month =', month
    #vf_pos = spark.table('udf_es.par_explic_cli_6m_' + month)
    print 'Load Oracle AC Final Postpaid from Hive for month =', month
    # Load AC Final Postpaid by Id
    vf_pos = ac_postpaid.dataById
    if 'partitioned_month' in vf_pos.columns:
        if month is not None:
            vf_pos = vf_pos.filter('partitioned_month == "%s"' % month)
        vf_pos.groupby('partitioned_month').count().show()
    #vf_pos = vf_pos.drop('x_sfid_cuenta', 'x_sfid_servicio', 'sfid_adsl', 'sfid_adslenprov', 
    #                     'sfid_ftth', 'sfid_hz', 'sfid_lpd', 'sfid_tv', 'sfid_futbol', 
    #                     'sfid_canje', 'sfid_cambio_pprecios', 'sfid_simo')
    vf_pos = vf_pos.withColumnRenamed('x_id_red', 'msisdn')
    vf_pos = vf_pos.withColumnRenamed('x_num_ident', 'nif')
    #vf_pos = vf_pos.withColumn('tipo_documento_cliente', when(vf_pos['tipo_documento_cliente'] == 'Tarj.Residente',  'Tarj. Residente')
    #                                                    .when(vf_pos['tipo_documento_cliente'] == 'NIE',             'Tarj. Residente')
    #                                                    .when(vf_pos['tipo_documento_cliente'] == 'T_RES',           'Tarj. Residente')
    #                                                    .when(vf_pos['tipo_documento_cliente'] == 'TARJ. RESIDENTE', 'Tarj. Residente')
    #                                                    .when(vf_pos['tipo_documento_cliente'] == 'PASAPORTE',       'Pasaporte')
    #                                                    .when(vf_pos['tipo_documento_cliente'] == 'GMT+1',           '')
    #                                                    .otherwise(vf_pos['tipo_documento_cliente']))
    vf_pos = add_prefix_columns(vf_pos, 'AC')
    #vf_pos = vf_pos.withColumnRenamed('nif', 'AC_nif')
    #vf_pos.printSchema()

    # Join Postpaid data with Call Centre Calls
    print 'Join Postpaid data with Call Centre Calls'
    vf_pos_ccc = vf_pos.join(ccc_id, ['nif', 'partitioned_month'], 'left_outer')
    ccc_columns = [x for x in ccc_id.columns if x not in ['nif', 'msisdn', 'partitioned_month']]
    vf_pos_ccc = vf_pos_ccc.fillna(0, ccc_columns)
    #vf_pos_ccc.groupby('partitioned_month').count().show()
    #vf_pos_ccc.printSchema()

    # Join Postpaid data with TNPS (to get the label)
    #tnps_nif.show()
    tnps_id_pospaid = tnps_id.filter('SEGMENTACION != "Prepaid"').select(['nif', 'partitioned_month', 'SEGMENTACION', 'TNPS01', 'TNPS', 'TNPS2DET', 'TNPS2PRO', 'TNPS4'])
    #tnps_id_pospaid.groupby('partitioned_month').count().show()
    print 'TNPS count:', tnps_id_pospaid.count()
    tnps_id_pospaid.groupby('TNPS4').count().show()
    print 'Join TNPS with Postpaid data'
    vf_pos_ccc_tnps = vf_pos_ccc.join(tnps_id_pospaid, ['nif', 'partitioned_month'], 'left_outer')
    #vf_pos_ccc_tnps.groupby('partitioned_month').count().show()
    
    # Now, generate dataset for model training
    vf_pos = vf_pos_ccc_tnps.drop('year', 'month', 'day')
    vf_pos = fix_column_names(vf_pos)

    # Calculate proportions of TNPS4 levels
    vf_pos_count = vf_pos.na.drop(subset='TNPS4').count()
    print 'vf_pos count:', vf_pos_count
    vf_pos_groupby = vf_pos.na.drop(subset='TNPS4').groupby('TNPS4').count()
    vf_pos_proportions = vf_pos_groupby.withColumn('proportion', 100*vf_pos_groupby['count']/vf_pos_count).cache()
    vf_pos_proportions_dict = dict(vf_pos_proportions.select('TNPS4', 'proportion').collect())
    vf_pos_proportions.withColumn('proportion', bround(vf_pos_proportions['proportion'], 2)).show()
    
    vf_pos.printSchema()
    
    return vf_pos, vf_pos_proportions_dict

In [18]:
vf_pos, vf_pos_proportions = prepare_dataset_postpaid()

Load Oracle AC Final Postpaid from Hive for month = 201712
+-----------------+-------+
|partitioned_month|  count|
+-----------------+-------+
|           201712|3285656|
+-----------------+-------+

Appending AC prefix to columns ...
Join Postpaid data with Call Centre Calls
TNPS count: 284737
+--------------+------+
|         TNPS4| count|
+--------------+------+
|HARD DETRACTOR| 37155|
|       NEUTRAL| 60803|
|      PROMOTER|154759|
|SOFT DETRACTOR| 32020|
+--------------+------+

Join TNPS with Postpaid data
vf_pos count: 284737
+--------------+------+----------+
|         TNPS4| count|proportion|
+--------------+------+----------+
|HARD DETRACTOR| 37155|     13.05|
|       NEUTRAL| 60803|     21.35|
|      PROMOTER|154759|     54.35|
|SOFT DETRACTOR| 32020|     11.25|
+--------------+------+----------+

root
 |-- nif: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- AC_prodhz: string (nullable = true)
 |-- AC_prodadsl: string (nullable = true)
 |-- AC

In [19]:
vf_pos_proportions

{u'HARD DETRACTOR': 13.048883706718831,
 u'NEUTRAL': 21.354091670559146,
 u'PROMOTER': 54.351559509301566,
 u'SOFT DETRACTOR': 11.245465113420455}

### Oracle Non-Prepaid segment - Training

In [20]:
vf_pos_training, vf_pos_training_proportions = prepare_training_data(vf_pos)

Class count before cleaning:
+-------------+------+----------+
|     TNPS2DET| count|proportion|
+-------------+------+----------+
|NON DETRACTOR|215562|     75.71|
|    DETRACTOR| 69175|     24.29|
+-------------+------+----------+

fractions: {u'NON DETRACTOR': 0.3209053543760032, u'DETRACTOR': 1.0}
Class count after balancing:
+-------------+-----+----------+
|     TNPS2DET|count|proportion|
+-------------+-----+----------+
|    DETRACTOR|69175|      50.0|
|NON DETRACTOR|69181|      50.0|
+-------------+-----+----------+

root
 |-- partitioned_month: string (nullable = true)
 |-- AC_prodhz: string (nullable = true)
 |-- AC_prodadsl: string (nullable = true)
 |-- AC_x_plan: string (nullable = true)
 |-- AC_promocion_vf: string (nullable = true)
 |-- AC_prodlpd: string (nullable = true)
 |-- AC_puntos: integer (nullable = true)
 |-- AC_sistema_operativo: string (nullable = true)
 |-- AC_promocion_tarifa: string (nullable = true)
 |-- AC_flag_huella_von: string (nullable = true)
 |-- A

In [21]:
vf_pos_training_proportions

{u'DETRACTOR': 24.294348820139287, u'NON DETRACTOR': 75.70565117986071}

In [22]:
#vf_pos_training.show(1)

In [23]:
# Persist the balanced training set as Parquet (overwriting any previous copy for this
# month), printing the row count and timing the whole step.
print time.ctime()
start = time.time()
print vf_pos_training.count()
vf_pos_training.write.mode('overwrite').format('parquet').save('/tmp/bbergua/tmp/vf_pos_training-'+month)
end = time.time()
print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

Tue Apr 10 16:09:49 2018
138356
Process took: 25.32 minutes


In [24]:
# Build one 'hdfs://<host>' prefix per HDFS namenode reported by the cluster.
# Example shell output:
# $ hdfs getconf -confKey fs.defaultFS
# hdfs://nameservice1
# $ hdfs getconf -namenodes
# vgddp350hr.dc.sedc.internal.vodafone.com vgddp351hr.dc.sedc.internal.vodafone.com
out = subprocess.check_output('hdfs getconf -namenodes', shell=True)
# split() with no argument splits on any run of whitespace (spaces, tabs, newlines) and
# never yields empty items, so stray whitespace cannot produce a bare 'hdfs://' entry
nodes = ['hdfs://'+n for n in out.split()]
if not nodes:
    raise RuntimeError('hdfs getconf -namenodes returned no namenodes')

In [25]:
# Full URI of the training set under each namenode; only the first one is used afterwards.
# NOTE(review): this assumes the first listed namenode can serve the path — confirm.
path_vf_pos_training = [n+'/tmp/bbergua/tmp/vf_pos_training-'+month for n in nodes]
path_vf_pos_training[0]

'hdfs://vgddp350hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pos_training-201712'

In [26]:
vf_pos_training_df_h2o = h2o.import_file(path=path_vf_pos_training[0])
vf_pos_training_df_h2o.nrows
#subprocess.call('hdfs dfs -rm -r /tmp/bbergua/tmp/vf_pos_training-'+month, shell=True)

Parse progress: |█████████████████████████████████████████████████████████| 100%


138356

In [27]:
# Disabled alternative (the `if False:` guard means this never runs): copy the training
# data into H2O through a pandas DataFrame instead of importing the Parquet files from HDFS.
if False:
    vf_pos_training_pd = generate_pandas_data(vf_pos_training)
    # Copy to H2O

    #h2o.remove_all()
    #print balanced_pd.columns.values.tolist()
    #os.environ["PYTHONIOENCODING"] = "UTF-8"
    print 'Copying data to H2O'
    # Remove any previous frame stored under the same key before uploading the new one
    h2o.remove('vf_pos_training_df_h2o.hex')
    vf_pos_training_df_h2o = h2o.H2OFrame(vf_pos_training_pd, destination_frame='vf_pos_training_df_h2o.hex', header=1)
    vf_pos_training_df_h2o.head()

In [28]:
h2o.ls()

Unnamed: 0,key
0,part_00000_ee124499_36a1_40c4_a2ee_04852471cb6...


In [29]:
vf_pos_training_df_h2o.describe()

Rows:138356
Cols:281




Unnamed: 0,partitioned_month,AC_prodhz,AC_prodadsl,AC_x_plan,AC_promocion_vf,AC_prodlpd,AC_puntos,AC_sistema_operativo,AC_promocion_tarifa,AC_flag_huella_von,AC_prodtivo,AC_x_tipo_ident,AC_part_status,AC_pprecios_destino,AC_x_sexo,AC_plandatos,AC_ppid_destino,AC_cod_segfid,AC_modelo,AC_tarifa_canje,AC_x_nacionalidad,AC_flag_cobertura_adsl,AC_codigo_postal,AC_x_dia_emision,AC_x_subtipo,AC_prodvfbox,AC_prodftth,AC_prodadslenprov,AC_x_tipo_cliente,AC_sum_flagvfbox,AC_sum_flaghz,AC_sum_flagvoz,AC_sum_flagtivo,AC_sum_flaglpd,AC_sum_flagfutbol,AC_sum_flagftth,AC_sum_flagadsl,AC_sum_arpu,AC_sum_cantidad_pendiente,AC_sum_cuotas_pendientes,AC_is_mobile_only_new,AC_is_mobile_only,AC_is_convergent_new,AC_is_convergent,AC_max_cobertura_4g,AC_max_flag_huella_vf,AC_max_flag_ebilling,AC_max_lortad,AC_max_flag_financia,AC_max_vfsmartphone,AC_max_flag_nh_real,AC_max_seg_cliente,AC_max_flag_4g_nodos,AC_max_flag_existe_fbb_hogar,AC_max_flag_siebel,AC_max_flag_4g,AC_max_flag_huella_movistar,AC_max_terminal_4g,AC_max_flag_desc_conv,AC_max_flag_financia_simo,AC_max_flag_4g_aperturas,AC_max_flag_huella_jazztel,AC_max_terminalmms,AC_max_num_pospago,AC_max_flag_huella_neba,AC_max_flag_nh_prevista,AC_max_deuda,AC_max_cobertura_4g_plus,AC_max_flag_huella_ono,AC_max_num_prepago,AC_max_flag_cuenta_superintegral,AC_max_flag_huella_euskaltel,AC_max_num_total,AC_min_days_since_install_date,AC_min_days_since_fecha_fin_cp_tarifa,AC_min_days_since_x_fecha_nacimiento,AC_min_days_since_x_fecha_activacion,AC_min_days_since_fecha_fin_cp_vf,AC_min_days_since_fecha_alta_ss_ftth,AC_min_days_since_fecha_alta_ss_adsl,AC_min_days_since_fecha_alta_tivo,AC_min_days_since_fecha_alta_ss_hz,AC_min_days_since_fecha_alta_ss_adslenprov,AC_min_days_since_fecha_alta_vfbox,AC_min_days_since_fecha_ultima_financiacion,AC_min_days_since_fecha_alta_futbol,AC_min_days_since_fecha_alta_servicio,AC_min_days_since_fecha_transferencia,AC_min_days_since_fecha_antiguedad_cliente,AC_min_days_since_fecha_canje,AC_min_days_since_x_fec
ha_creacion_cuenta,AC_min_days_since_last_upd,AC_min_days_since_fecha_cambio,AC_min_days_since_x_fecha_recepcion,AC_min_days_since_x_fecha_ini_prov,AC_min_days_since_x_fecha_creacion_servicio,AC_min_x_dia_emision,AC_min_years_since_install_date,AC_min_years_since_fecha_fin_cp_tarifa,AC_min_years_since_x_fecha_nacimiento,AC_min_years_since_x_fecha_activacion,AC_min_years_since_fecha_fin_cp_vf,AC_min_years_since_fecha_alta_ss_ftth,AC_min_years_since_fecha_alta_ss_adsl,AC_min_years_since_fecha_alta_tivo,AC_min_years_since_fecha_alta_ss_hz,AC_min_years_since_fecha_alta_ss_adslenprov,AC_min_years_since_fecha_alta_vfbox,AC_min_years_since_fecha_ultima_financiacion,AC_min_years_since_fecha_alta_futbol,AC_min_years_since_fecha_alta_servicio,AC_min_years_since_fecha_transferencia,AC_min_years_since_fecha_antiguedad_cliente,AC_min_years_since_fecha_canje,AC_min_years_since_x_fecha_creacion_cuenta,AC_min_years_since_last_upd,AC_min_years_since_fecha_cambio,AC_min_years_since_x_fecha_recepcion,AC_min_years_since_x_fecha_ini_prov,AC_min_years_since_x_fecha_creacion_servicio,AC_avg_days_since_install_date,AC_avg_days_since_fecha_fin_cp_tarifa,AC_avg_days_since_x_fecha_nacimiento,AC_avg_days_since_x_fecha_activacion,AC_avg_days_since_fecha_fin_cp_vf,AC_avg_days_since_fecha_alta_ss_ftth,AC_avg_days_since_fecha_alta_ss_adsl,AC_avg_days_since_fecha_alta_tivo,AC_avg_days_since_fecha_alta_ss_hz,AC_avg_days_since_fecha_alta_ss_adslenprov,AC_avg_days_since_fecha_alta_vfbox,AC_avg_days_since_fecha_ultima_financiacion,AC_avg_days_since_fecha_alta_futbol,AC_avg_days_since_fecha_alta_servicio,AC_avg_days_since_fecha_transferencia,AC_avg_days_since_fecha_antiguedad_cliente,AC_avg_days_since_fecha_canje,AC_avg_days_since_x_fecha_creacion_cuenta,AC_avg_days_since_last_upd,AC_avg_days_since_fecha_cambio,AC_avg_days_since_x_fecha_recepcion,AC_avg_days_since_x_fecha_ini_prov,AC_avg_days_since_x_fecha_creacion_servicio,AC_avg_x_dia_emision,AC_avg_years_since_install_date,AC_avg_years_since_fecha_fi
n_cp_tarifa,AC_avg_years_since_x_fecha_nacimiento,AC_avg_years_since_x_fecha_activacion,AC_avg_years_since_fecha_fin_cp_vf,AC_avg_years_since_fecha_alta_ss_ftth,AC_avg_years_since_fecha_alta_ss_adsl,AC_avg_years_since_fecha_alta_tivo,AC_avg_years_since_fecha_alta_ss_hz,AC_avg_years_since_fecha_alta_ss_adslenprov,AC_avg_years_since_fecha_alta_vfbox,AC_avg_years_since_fecha_ultima_financiacion,AC_avg_years_since_fecha_alta_futbol,AC_avg_years_since_fecha_alta_servicio,AC_avg_years_since_fecha_transferencia,AC_avg_years_since_fecha_antiguedad_cliente,AC_avg_years_since_fecha_canje,AC_avg_years_since_x_fecha_creacion_cuenta,AC_avg_years_since_last_upd,AC_avg_years_since_fecha_cambio,AC_avg_years_since_x_fecha_recepcion,AC_avg_years_since_x_fecha_ini_prov,AC_avg_years_since_x_fecha_creacion_servicio,AC_max_days_since_install_date,AC_max_days_since_fecha_fin_cp_tarifa,AC_max_days_since_x_fecha_nacimiento,AC_max_days_since_x_fecha_activacion,AC_max_days_since_fecha_fin_cp_vf,AC_max_days_since_fecha_alta_ss_ftth,AC_max_days_since_fecha_alta_ss_adsl,AC_max_days_since_fecha_alta_tivo,AC_max_days_since_fecha_alta_ss_hz,AC_max_days_since_fecha_alta_ss_adslenprov,AC_max_days_since_fecha_alta_vfbox,AC_max_days_since_fecha_ultima_financiacion,AC_max_days_since_fecha_alta_futbol,AC_max_days_since_fecha_alta_servicio,AC_max_days_since_fecha_transferencia,AC_max_days_since_fecha_antiguedad_cliente,AC_max_days_since_fecha_canje,AC_max_days_since_x_fecha_creacion_cuenta,AC_max_days_since_last_upd,AC_max_days_since_fecha_cambio,AC_max_days_since_x_fecha_recepcion,AC_max_days_since_x_fecha_ini_prov,AC_max_days_since_x_fecha_creacion_servicio,AC_max_x_dia_emision,AC_max_years_since_install_date,AC_max_years_since_fecha_fin_cp_tarifa,AC_max_years_since_x_fecha_nacimiento,AC_max_years_since_x_fecha_activacion,AC_max_years_since_fecha_fin_cp_vf,AC_max_years_since_fecha_alta_ss_ftth,AC_max_years_since_fecha_alta_ss_adsl,AC_max_years_since_fecha_alta_tivo,AC_max_years_since_fecha_alta_ss_hz,AC
_max_years_since_fecha_alta_ss_adslenprov,AC_max_years_since_fecha_alta_vfbox,AC_max_years_since_fecha_ultima_financiacion,AC_max_years_since_fecha_alta_futbol,AC_max_years_since_fecha_alta_servicio,AC_max_years_since_fecha_transferencia,AC_max_years_since_fecha_antiguedad_cliente,AC_max_years_since_fecha_canje,AC_max_years_since_x_fecha_creacion_cuenta,AC_max_years_since_last_upd,AC_max_years_since_fecha_cambio,AC_max_years_since_x_fecha_recepcion,AC_max_years_since_x_fecha_ini_prov,AC_max_years_since_x_fecha_creacion_servicio,CCC_Pagar_menos,CCC_Incidencia_Provision_Neba,CCC_Incidencia_Provision_Fibra,CCC_Incidencia_Provision_DSL,CCC_Incidencia_Tecnica,CCC_Incidencia_SGI,CCC_Incidencia_Resto,CCC_Incidencia_Provision_Movil,CCC_Resultado_No_Aplica,CCC_Resultado_Informacion,CCC_Resultado_Solucionado,CCC_Resultado_Retenido,CCC_Resultado_No_Retenido,CCC_Resultado_Escalo,CCC_Resultado_Envio_tecnico,CCC_Resultado_Transferencia,CCC_Resultado_Abono,CCC_Resultado_Bajas,CCC_Resultado_Reclamacion,CCC_Desactivacion_BA_Movil_TV,CCC_Desactivacion_TV,CCC_Desactivacion_Movil,CCC_Desactivacion_Total,CCC_Desactivacion_NET,CCC_Desactivacion_Fijo,CCC_Desactivacion_USB,CCC_Desactivacion_Resto,CCC_Ofrecimiento,CCC_Transferencia,CCC_Cobro,CCC_Precios,CCC_Portabilidad_Inversa,CCC_Portabilidad,CCC_Informacion,CCC_Cierre,CCC_Alta,CCC_Factura,CCC_Averia_DSL,CCC_Averia_Fibra,CCC_Averia_TV,CCC_Averia_Resto,CCC_Averia_Neba,CCC_Averia_Modem_Router,CCC_Baja,CCC_Consulta_Tecnica_TV,CCC_Consulta_Tecnica_Fibra,CCC_Consulta_Tecnica_Neba,CCC_Consulta_Tecnica_DSL,CCC_Consulta_Tecnica_Movil,CCC_Consulta_Tecnica_Modem_Router,CCC_Consulta_Tecnica_Resto,CCC_Consulta_Ficha,CCC_Consulta_Resto,CCC_Productos_Voz,CCC_Productos_Datos,CCC_Productos_Resto,CCC_Provision_Neba,CCC_Provision_Fibra,CCC_Provision_DSL,CCC_Provision_Resto,CCC_Provision_Movil,SEGMENTACION,TNPS01,TNPS,TNPS2DET,TNPS2PRO,TNPS4
type,enum,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,int,enum,string,enum,enum,enum,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,real,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,int,enum,enum,enum,enum
mins,,,,,,,-2598.0,,,,,,,,,,,0.0,,,,,,1.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-24.9155991077,-429.480010986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-636884.0,100.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-745.0,-1.0,100.0,-1.0,1.0,-1.0,-1.0,-1743.72912598,0.273790687323,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-2.03974080086,-1.0,0.273790687323,-1.0,-1.0,-1.0,-636884.0,100.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-156.5,25.4,100.0,25.4,1.0,-1.0,-1.0,-1743.72912598,0.273790687323,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-1.20740056038,-0.728266835213,0.273790687323,-0.728266835213,-1.0,-1.0,-636884.0,100.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-1.0,100.0,100.0,100.0,1.0,-1.0,-1.0,-1743.72912598,0.273790687323,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-1.0,0.273790687323,0.273790687323,0.273790687323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,,,
mean,,,,,,,-2.44358755674,,,,,,,,,,,30.87214866,,,,,,10.4539159849,,,,,,0.0,0.0954277371419,1.83427534765,0.28311746509,0.194613894591,0.0729784035387,0.469303824915,0.193377952528,48.5127022327,399884.366545,7.90147879384,0.966354910521,0.207334701784,0.0,0.759020208737,0.300883228772,0.157362167163,0.846179421203,0.117667466536,0.393203041429,0.947750730001,0.00106970424123,3.7677007141,0.0471248084651,0.624208563416,0.99973257394,0.014347046749,0.679413975541,0.871967966695,0.626131139958,0.295946688253,0.00239960681141,0.187978837203,0.0,2.78960796785,0.162240885831,5.05941195178e-05,0.0419208418861,0.855257451791,0.330972274423,0.116561623638,0.0294457775593,0.0288025094683,2.90616959149,-1.0,-1.0,18502.848355,1916.22527393,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,154.245692272,-1.0,-1.0,1077.68901963,189.567723843,94.3711584608,596.306130562,808.000693862,596.367183209,10.2829078609,-1.0,-1.0,50.6422257851,5.24644660545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.508386382347,-1.0,-1.0,2.95061226963,-0.118466057292,-0.56795348838,0.995628815102,2.21223076193,0.995795971585,-1.0,-1.0,18503.0788361,1953.43923962,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,450.270654209,-1.0,-1.0,1089.45802395,444.043371155,223.825216934,1257.19545979,1324.47462961,1257.03098143,10.4556817707,-1.0,-1.0,50.6428590634,5.34833498289,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.407884894381,-1.0,-1.0,2.98283470793,0.950193098734,0.0141900128929,3.17691625979,3.62628836982,3.17646593338,-1.0,-1.0,18503.823918,1985.45625054,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1022.32071612,-1.0,-1.0,1099.95266559,810.962748273,412.7898248,2069.9779771,1993.41086039,2069.36487033,10.6267960912,-1.0,-1.0,50.6449112017,5.43599458147,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.15811042797,-1.0,-1.0,3.01156805984,2.22013153117,0.823114595366,5.66740720862,5.45777356357,5.66572857938,0.0,0.0100031802018,0.00647604729827,0.00499436236954,0.000159010089913,0
.0,0.000209604209431,0.0,0.0877591141692,0.425756743473,0.653271271213,0.0968010061002,0.0413209401833,0.13044609558,0.0132122929255,0.0730723640464,0.00808060366012,0.0,0.00771921709214,0.00972129867877,0.01993408309,0.0313611263697,0.0,0.0278629043916,0.00449564890572,0.00312237994738,0.00156841770505,0.00166960594409,0.410824250484,0.048729364827,0.0,0.0,0.101802596201,0.8632802336,0.0194209141635,0.0255355748938,0.200070831767,0.0955650640377,0.0652519587152,0.0440241117118,0.0150698198849,0.03162855243,0.00104079331579,0.0233528000231,0.00915753563272,0.0188860620428,0.00488594639914,0.0222180461997,0.0189655670878,0.00420653965133,0.00518228338489,2.1683194079e-05,0.000477030269739,7.95050449565e-05,0.00143109080922,0.160658012663,0.0170574460088,0.0169345745757,0.0123955592819,7.22773135968e-06,0.0,,5.94815548296,,,,
maxs,,,,,,,1500.0,,,,,,,,,,,49.0,,,,,,22.0,,,,,,0.0,6.0,14.0,5.0,10.0,3.0,9.0,5.0,5074.68169403,55308824612.1,163.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,17.0,1.0,1.0,1.0,1.0,1.0,27.0,1.0,1.0,33.0,-1.0,-1.0,96888.0,8208.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7083.0,-1.0,-1.0,7193.0,13979.0,3563.0,8145.0,8145.0,8145.0,22.0,-1.0,-1.0,265.270324707,22.4727401733,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.3925952911,-1.0,-1.0,19.6937656403,38.2732009888,9.75516223907,22.3002529144,22.3002529144,22.3002529144,-1.0,-1.0,96888.0,8208.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7083.0,-1.0,-1.0,7193.0,13979.0,3563.0,34827.0,8145.0,8145.0,22.0,-1.0,-1.0,265.270324707,22.4727401733,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.3925952911,-1.0,-1.0,19.6937656403,38.2732009888,9.75516223907,95.3530842463,22.3002529144,22.3002529144,-1.0,-1.0,96888.0,8208.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7088.0,-1.0,-1.0,7193.0,13979.0,5853.0,96888.0,8208.0,8219.0,22.0,-1.0,-1.0,265.270324707,22.4727401733,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.4062843323,-1.0,-1.0,19.6937656403,38.2732009888,16.024969101,265.270324707,22.4727401733,22.5028572083,0.0,12.0,4.0,4.0,1.0,0.0,4.0,0.0,21.0,104.0,50.0,29.0,5.0,13.0,3.0,16.0,3.0,0.0,4.0,4.0,7.0,26.0,0.0,5.0,5.0,2.0,2.0,2.0,69.0,59.0,0.0,0.0,19.0,75.0,7.0,12.0,75.0,44.0,20.0,18.0,8.0,12.0,2.0,8.0,4.0,5.0,2.0,11.0,5.0,5.0,5.0,1.0,3.0,1.0,2.0,14.0,11.0,6.0,19.0,1.0,0.0,,10.0,,,,
sigma,,,,,,,22.806925495,,,,,,,,,,,7.36696347888,,,,,,8.14227199967,,,,,,0.0,0.309896836376,1.04109863857,0.460715055892,0.477742289183,0.263086005693,0.522312207287,0.404263112989,37.5665000575,148694679.582,13.2317030845,0.180314537492,0.405398829654,0.0,0.427679615471,0.458643687189,0.36414320522,0.360777977574,0.322215121086,0.48846303253,0.222530091669,0.0326889537518,3.42504566565,0.211906548858,0.484328326873,0.0163510389881,0.118917245209,0.466703545807,0.334126680559,0.483831197311,0.45646878531,0.0489271499352,0.390696681885,0.0,1.43184002794,0.368672975938,0.0071128001109,0.200409019744,0.351842347778,0.470564797272,0.435157701762,0.169053039914,0.167251687881,1.5063401686,0.0,0.0,10833.5272888,1777.97653694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.41781216,0.0,0.0,611.294376064,1015.38706168,261.682681778,1240.89013906,1117.16035621,1241.04020782,8.12392204296,0.0,0.0,29.6902377529,4.86793441905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1903969763,0.0,0.0,1.67366710864,2.93652501057,1.04216471752,3.72230841028,3.05868116778,3.72271203196,0.0,0.0,10833.3744368,1779.75393845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,903.39210476,0.0,0.0,605.134085437,1258.14716483,285.303073767,1291.62763611,1221.05212442,1288.61266648,8.07602855791,0.0,0.0,29.6898044782,4.87280077917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.70631404294,0.0,0.0,1.65680080594,3.49904713913,1.06901347078,3.63609307989,3.34312716735,3.62803239187,0.0,0.0,10835.3525693,1803.29524612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1716.28503881,0.0,0.0,607.20203544,2229.02217934,431.360341181,1890.31901909,1792.73951162,1873.11221538,8.15350566498,0.0,0.0,29.6952069984,4.93725469045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.08926615368,0.0,0.0,1.66246266024,6.10294849095,1.51740609296,5.17551768389,4.90835407009,5.12840705885,0.0,0.135284414138,0.0923591460444,0.0912312787503,0.0126089632727,0.0,0.0172132272792,0.0,0.507074228614,1.13172846503,1.47517182148,0.406326181552,0.215223580457,0.44801054265
5,0.120226736048,0.352333515909,0.0918399525606,0.0,0.0919496728531,0.105156862658,0.154182806493,0.212522636279,0.0,0.188414168355,0.0728964559615,0.0579516853135,0.040653443797,0.0413545294869,1.32952872507,0.33782073339,0.0,0.0,0.565324425114,1.67073895182,0.167566731187,0.204336330798,0.643899207325,0.539920493368,0.365777005664,0.303317271237,0.152546462992,0.266571997656,0.0344132714497,0.170955793814,0.101714456544,0.152216128255,0.0736605470368,0.170196359566,0.153694797084,0.0741911825407,0.0780709706238,0.00465648801525,0.0243403100985,0.00891623791203,0.0394861142926,0.483614599826,0.17125565283,0.157253226203,0.158772294978,0.00268844404065,0.0,,3.74282310521,,,,
zeros,,,,,,,130598,,,,,,,,,,,33,,,,,,0,,0,,,,138356,125775,4655,99798,115484,128363,74947,112093,8169,84012,84008,4655,109670,138356,33341,96727,116584,21282,122076,83954,7229,138208,54421,131836,51993,37,136371,44355,17714,51727,97410,138024,112348,103220,0,115909,138349,132556,20026,92564,125210,134282,134371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,138356,137289,137579,137843,138334,138356,138330,138356,131118,102135,94358,128150,133049,124575,136621,130259,137265,138356,137337,137102,135840,134568,138356,134993,137781,137941,138145,138128,111319,133366,138356,138356,130468,83485,136126,135467,117892,130663,132067,134136,136657,135533,138222,135439,137169,136006,137719,135643,136029,137845,137689,138353,138297,138345,138167,120726,136538,136428,137078,138355,138356,,26301,,,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,201712,,,TARRM,,,-44.0,Android 4.3,,N,,NIF,AC,TARRM,Varon,IPRM1,,39.0,Samsung Galaxy Note 3,OTRO,España,D,75842,8.0,sin IAE,,,,Particulares,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,51.148399353,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,-1.0,-1.0,15440.0,5604.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,-1.0,-1.0,-1.0,130.0,-1.0,8.0,-1.0,-1.0,42.2732849121,15.3432312012,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,-1.0,-1.0,-1.0,0.355927914381,-1.0,-1.0,-1.0,15440.0,5604.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,302.0,302.0,1867.33333333,2140.0,1867.33333333,8.0,-1.0,-1.0,42.2732849121,15.3432312012,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,0.16200653712,0.16200653712,4.44774373372,5.85912111402,4.44774373372,-1.0,-1.0,15440.0,5604.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,908.0,908.0,5604.0,5604.0,5604.0,8.0,-1.0,-1.0,42.2732849121,15.3432312012,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,2.48601961136,2.48601961136,15.3432312012,15.3432312012,15.3432312012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Convergent,0.0,DETRACTOR,DETRACTOR,NON PROMOTER,HARD DETRACTOR
1,201712,,,TREDL,CPP24000,,0.0,Android 5.1,TAR18050,,,NIF,AC,TREDL,Varon,IREDL,,39.0,Samsung Galaxy J7 2016,OTRO,España,I,23499,22.0,sin IAE,,,,Particulares,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,84.9208011627,327.080001831,31.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,-1.0,-1.0,15074.0,3065.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,-1.0,-1.0,-1.0,362.0,-1.0,22.0,-1.0,-1.0,41.2712097168,8.39168453217,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,-1.0,-1.0,-1.0,0.991122364998,-1.0,-1.0,-1.0,15074.0,3065.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,162.5,90.5,856.25,1095.25,856.25,22.0,-1.0,-1.0,41.2712097168,8.39168453217,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,-0.0537211596966,-0.500165984035,1.84570172429,2.99869254231,1.84570172429,-1.0,-1.0,15074.0,3065.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1662.0,326.0,365.0,3065.0,3065.0,3065.0,22.0,-1.0,-1.0,41.2712097168,8.39168453217,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.55040121078,0.892557680607,0.999336063862,8.39168453217,8.39168453217,8.39168453217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Convergent,10.0,PROMOTER,NON DETRACTOR,PROMOTER,PROMOTER
2,201712,,,TREDM,CPP24000,,-64.0,iOS 10,TAR18050,N,,NIF,AC,TREDM,Varon,IREDM,NVTOD,39.0,Apple iPhone 7,OTRO,España,D,77717,8.0,sin IAE,,,,Particulares,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,91.3816995621,377.0,13.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,3.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0,-1.0,-1.0,22745.0,4648.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1627.0,123.0,350.0,3226.0,3226.0,3226.0,8.0,-1.0,-1.0,62.2736930847,12.7257919312,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.45457458496,0.336762547493,0.958267450333,8.83248806,8.83248806,8.83248806,-1.0,-1.0,22745.0,4648.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1074.66666667,-1.0,-1.0,1627.0,557.0,672.0,3654.66666667,3654.66666667,3654.66666667,8.0,-1.0,-1.0,62.2736930847,12.7257919312,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.27749602,-1.0,-1.0,4.45457458496,1.5250142018,1.83987351259,10.00613753,10.00613753,10.00613753,-1.0,-1.0,22745.0,4648.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3226.0,-1.0,-1.0,1627.0,1198.0,1198.0,4337.0,4337.0,4337.0,8.0,-1.0,-1.0,62.2736930847,12.7257919312,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,8.83248806,-1.0,-1.0,4.45457458496,3.28001260757,3.28001260757,11.8743028641,11.8743028641,11.8743028641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mobile-Only,2.0,DETRACTOR,DETRACTOR,NON PROMOTER,HARD DETRACTOR


In [30]:
#data_df_h2o.types

In [31]:
# Split the data into train / test / validation frames: roughly 70% for train
# and 15% each for test and validation (the last frame receives the remainder).
# split_frame assigns rows at random, so a fixed seed -- the same value this
# notebook passes to H2OAutoML -- keeps the partition, and therefore every
# metric computed from it, reproducible across kernel restarts.
train, test, valid = vf_pos_training_df_h2o.split_frame(ratios=[.7, .15], seed=1234)

In [32]:
# Response column: whichever label this notebook selected in `label_model`.
y = label_model
# Predictors: every column of the training frame except the msisdn / nif
# columns and the TNPS columns listed here.
non_predictors = ('msisdn', 'nif', 'TNPS01', 'TNPS2DET', 'TNPS2PRO', 'TNPS', 'TNPS4')
x = [col for col in train.columns if col not in non_predictors]

In [33]:
# Show the H2O column type of the response in the training frame (checked here before the factor conversion).
train.types[y]

u'enum'

In [34]:
# Binary classification expects a factor (categorical) response, so convert the
# response column in each of the three splits.
for frame in (train, test, valid):
    frame[y] = frame[y].asfactor()

In [35]:
# Re-check the response column type after the asfactor() conversion.
train.types[y]

u'enum'

In [37]:
from h2o.automl import H2OAutoML
# AutoML search limited to 60 seconds of runtime, with a fixed seed.
# Models are fitted on `train`; the leaderboard is scored on `test`.
# NOTE(review): the `valid` split is not passed to train() (no validation_frame
# argument) -- confirm whether that is intended.
aml = H2OAutoML(seed=1234, max_runtime_secs=60)
aml.train(training_frame=train,
          leaderboard_frame=test,
          x=x,
          y=y)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [38]:
# View the AutoML leaderboard (models are scored on the leaderboard frame, `test`, passed to aml.train)
lb = aml.leaderboard
lb

model_id,auc,logloss
DRF_0_AutoML_20180410_164942,0.592199,0.679856
StackedEnsemble_AllModels_0_AutoML_20180410_164942,0.592184,0.679868
StackedEnsemble_BestOfFamily_0_AutoML_20180410_164942,0.592184,0.679868




In [39]:
# The leader model (top entry of the leaderboard) is stored here; evaluating it displays its full metrics report
aml.leader

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_0_AutoML_20180410_164942


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.246620204321
RMSE: 0.496608703428
LogLoss: 0.686637591279
Mean Per-Class Error: 0.447131467861
AUC: 0.575128795932
Gini: 0.150257591865
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.257488788052: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,604.0,38149.0,0.9844,(38149.0/38753.0)
NON DETRACTOR,258.0,38488.0,0.0067,(258.0/38746.0)
Total,862.0,76637.0,0.4956,(38407.0/77499.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2574888,0.6671347,367.0
max f2,0.0405477,0.8333082,399.0
max f0point5,0.4260851,0.5667787,270.0
max accuracy,0.5100273,0.5528716,189.0
max precision,0.9340097,1.0,0.0
max recall,0.0405477,1.0,399.0
max specificity,0.9340097,1.0,0.0
max absolute_mcc,0.4696407,0.1074353,229.0
max min_per_class_accuracy,0.5026248,0.5501213,197.0


Gains/Lift Table: Avg response rate: 50.00 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100001,0.7444515,1.3291523,1.3291523,0.6645161,0.6645161,0.0132917,0.0132917,32.9152312,32.9152312
,2,0.0200003,0.7129444,1.2362407,1.2826965,0.6180645,0.6412903,0.0123626,0.0256543,23.6240694,28.2696503
,3,0.0300004,0.6935590,1.3136670,1.2930200,0.6567742,0.6464516,0.0131368,0.0387911,31.3667042,29.3020016
,4,0.0400005,0.6788100,1.2826965,1.2904391,0.6412903,0.6451613,0.0128271,0.0516182,28.2696503,29.0439138
,5,0.0500006,0.6675325,1.1872040,1.2697921,0.5935484,0.6348387,0.0118722,0.0634904,18.7204007,26.9792112
,6,0.1000013,0.6286797,1.1856555,1.2277238,0.5927742,0.6138065,0.0592835,0.1227740,18.5655480,22.7723796
,7,0.1500019,0.6023544,1.1464261,1.2006246,0.5731613,0.6002581,0.0573220,0.1800960,14.6426130,20.0624574
,8,0.2000026,0.5823875,1.1417805,1.1859136,0.5708387,0.5929032,0.0570898,0.2371858,14.1780549,18.5913568
,9,0.3000039,0.5510419,1.0842270,1.1520180,0.5420645,0.5759570,0.1084241,0.3456099,8.4226964,15.2018033




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.244198409466
RMSE: 0.49416435471
LogLoss: 0.681426721527
Mean Per-Class Error: 0.438122502178
AUC: 0.586597054065
Gini: 0.173194108129
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.280672404692: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,89.0,9676.0,0.9909,(9676.0/9765.0)
NON DETRACTOR,29.0,9673.0,0.003,(29.0/9702.0)
Total,118.0,19349.0,0.4985,(9705.0/19467.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2806724,0.6659323,377.0
max f2,0.2023476,0.8324896,395.0
max f0point5,0.4503305,0.5708872,259.0
max accuracy,0.5140738,0.5620794,185.0
max precision,0.7928407,0.8461538,5.0
max recall,0.2023476,1.0,395.0
max specificity,0.8729208,0.9998976,0.0
max absolute_mcc,0.5280720,0.1256470,168.0
max min_per_class_accuracy,0.5015709,0.5592422,200.0


Gains/Lift Table: Avg response rate: 49.84 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100170,0.6999642,1.4199800,1.4199800,0.7076923,0.7076923,0.0142239,0.0142239,41.9980020,41.9980020
,2,0.0200339,0.6763394,1.3067932,1.3633866,0.6512821,0.6794872,0.0130901,0.0273140,30.6793207,36.3386613
,3,0.0299995,0.6624886,1.2721583,1.3330813,0.6340206,0.6643836,0.0126778,0.0399918,27.2158254,33.3081302
,4,0.0400164,0.6486568,1.2450549,1.3110465,0.6205128,0.6534018,0.0124717,0.0524634,24.5054945,31.1046463
,5,0.0500334,0.6396368,1.2553447,1.2998947,0.6256410,0.6478439,0.0125747,0.0650381,25.5344655,29.9894664
,6,0.1000154,0.6079445,1.2785467,1.2892262,0.6372045,0.6425270,0.0639043,0.1289425,27.8546736,28.9226182
,7,0.1499974,0.5863764,1.1610029,1.2464997,0.5786228,0.6212329,0.0580293,0.1869718,16.1002923,24.6499733
,8,0.2000308,0.5688856,1.1227299,1.2155414,0.5595483,0.6058038,0.0561740,0.2431457,12.2729939,21.5541392
,9,0.2999949,0.5433969,1.1187284,1.1832814,0.5575540,0.5897260,0.1118326,0.3549784,11.8728394,18.3281445




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.244069054842
RMSE: 0.494033455185
LogLoss: 0.681172908158
Mean Per-Class Error: 0.437890461776
AUC: 0.587148209641
Gini: 0.174296419282
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.340189022413: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,1723.0,37030.0,0.9555,(37030.0/38753.0)
NON DETRACTOR,742.0,38004.0,0.0192,(742.0/38746.0)
Total,2465.0,75034.0,0.4874,(37772.0/77499.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3401890,0.6680260,345.0
max f2,0.1884785,0.8333405,393.0
max f0point5,0.4577257,0.5710712,251.0
max accuracy,0.5044375,0.5621105,197.0
max precision,0.8795566,1.0,0.0
max recall,0.1298842,1.0,398.0
max specificity,0.8795566,1.0,0.0
max absolute_mcc,0.5106646,0.1244180,190.0
max min_per_class_accuracy,0.5021179,0.5596212,200.0


Gains/Lift Table: Avg response rate: 50.00 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100001,0.7054781,1.3394758,1.3394758,0.6696774,0.6696774,0.0133949,0.0133949,33.9475825,33.9475825
,2,0.0200003,0.6803259,1.2826965,1.3110862,0.6412903,0.6554839,0.0128271,0.0262221,28.2696503,31.1086164
,3,0.0300004,0.6637327,1.3781890,1.3334538,0.6890323,0.6666667,0.0137821,0.0400041,37.8188999,33.3453776
,4,0.0400005,0.6514355,1.3033435,1.3259262,0.6516129,0.6629032,0.0130336,0.0530377,30.3343529,32.5926214
,5,0.0500006,0.6417333,1.2414025,1.3090215,0.6206452,0.6544516,0.0124142,0.0654519,24.1402451,30.9021462
,6,0.1000013,0.6086772,1.2119804,1.2605009,0.6059355,0.6301935,0.0605998,0.1260517,21.1980438,26.0500950
,7,0.1500019,0.5875692,1.1804937,1.2338319,0.5901935,0.6168602,0.0590254,0.1850772,18.0493723,23.3831874
,8,0.2000026,0.5709854,1.1510717,1.2131418,0.5754839,0.6065161,0.0575543,0.2426315,15.1071711,21.3141834
,9,0.3000039,0.5437729,1.1115843,1.1792893,0.5557419,0.5895914,0.1111599,0.3537914,11.1584273,17.9289314



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.5135551,0.0031132,0.5092258,0.5112258,0.5101935,0.5161936,0.5209369
auc,0.5871711,0.0011969,0.5851927,0.5880604,0.5850992,0.5883446,0.5891587
err,0.4864449,0.0031132,0.4907742,0.4887742,0.4898064,0.4838065,0.4790631
err_count,7539.8,48.36817,7607.0,7576.0,7592.0,7499.0,7425.0
f0point5,0.5612220,0.0021856,0.5611295,0.5591162,0.5570284,0.5627880,0.5660476
f1,0.6684521,0.0016144,0.6698781,0.6675735,0.6646051,0.6688744,0.6713293
f2,0.8263587,0.0019637,0.830911,0.8282342,0.8236788,0.8242464,0.8247232
lift_top_group,1.3424709,0.0518920,1.2820513,1.4524705,1.4071661,1.3005408,1.2701259
logloss,0.6811729,0.0005823,0.6823981,0.6801784,0.6817157,0.6811777,0.6803946


Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2018-04-10 16:53:07,3 min 20.523 sec,0.0,,,,,,,,,,
,2018-04-10 16:53:08,3 min 21.445 sec,1.0,0.5710796,5.3017650,0.5330851,1.0217926,0.4981983,0.5739113,5.3760769,0.5274343,0.9829753,0.5016181
,2018-04-10 16:53:10,3 min 23.249 sec,2.0,0.5633280,4.5207792,0.5320447,1.0214381,0.4980371,0.5336069,1.7084646,0.5363241,1.0295098,0.5016181
,2018-04-10 16:53:15,3 min 28.261 sec,6.0,0.5371090,2.2636675,0.5376243,1.0208506,0.4996769,0.5060093,0.7194868,0.5548797,1.2403778,0.5016181
,2018-04-10 16:53:20,3 min 33.201 sec,13.0,0.5126108,0.9234059,0.5482026,1.0400939,0.5000517,0.4989278,0.6917752,0.5658210,1.2862138,0.4976627
,2018-04-10 16:53:27,3 min 40.199 sec,23.0,0.5025203,0.7089151,0.5602217,1.2517260,0.4964128,0.4960639,0.6854595,0.5771540,1.3376623,0.4966353
,2018-04-10 16:53:35,3 min 48.149 sec,34.0,0.4991695,0.6937076,0.5671836,1.2646304,0.4935160,0.4948015,0.6827607,0.5832114,1.3582418,0.4979196
,2018-04-10 16:53:44,3 min 56.849 sec,46.0,0.4970847,0.6877000,0.5733551,1.3007627,0.4935289,0.4943275,0.6817647,0.5857964,1.3685315,0.4949402
,2018-04-10 16:53:47,4 min 0.420 sec,50.0,0.4966087,0.6866376,0.5751288,1.3291523,0.4955806,0.4941644,0.6814267,0.5865971,1.4199800,0.4985360


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
AC_sistema_operativo,18618.1855469,1.0,0.0855072
AC_ppid_destino,15517.4912109,0.8334588,0.0712667
AC_plandatos,9306.8466797,0.4998794,0.0427433
AC_x_plan,8838.8134766,0.4747409,0.0405938
AC_pprecios_destino,6839.9912109,0.3673823,0.0314138
---,---,---,---
CCC_Consulta_Ficha,0.5662745,0.0000304,0.0000026
AC_max_flag_siebel,0.4444586,0.0000239,0.0000020
CCC_Productos_Voz,0.4102564,0.0000220,0.0000019



See the whole table with table.as_data_frame()




Model metrics on the training dataset, with the confusion matrix at the max-accuracy threshold (binomial models only):

In [40]:
from h2o.model.metrics_base import H2OBinomialModelMetrics
# Called without a frame, model_performance() yields the leader's training
# metrics (the AUC printed below matches the "Reported on train data" report).
train_metrics = aml.leader.model_performance()
# The threshold-based reports are only requested for binomial metrics.
if isinstance(train_metrics, H2OBinomialModelMetrics):
    # Confusion matrix at the threshold that maximises accuracy.
    print(train_metrics.confusion_matrix(metrics="accuracy"))
    # Single-argument print calls give the same text under the Python 2 print statement.
    print('AUC: %s' % (train_metrics.auc(),))
    # accuracy() comes back as [[threshold, value]] and is printed as returned.
    print('Accuracy: %s' % (train_metrics.accuracy(),))

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.510027268324: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,22763.0,15990.0,0.4126,(15990.0/38753.0)
NON DETRACTOR,18662.0,20084.0,0.4816,(18662.0/38746.0)
Total,41425.0,36074.0,0.4471,(34652.0/77499.0)



AUC: 0.575128795932
Accuracy: [[0.5100272683239445, 0.5528716499567736]]


Model metrics of test dataset, and confusion matrix for max accuracy (if BinomialModel):

In [41]:
# Score the AutoML leader on the held-out `test` frame and print the full
# metrics report.
test_metrics = aml.leader.model_performance(test_data=test)
print(test_metrics)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.243427142234
RMSE: 0.49338336234
LogLoss: 0.679856088284
Mean Per-Class Error: 0.433031974694
AUC: 0.592198721966
Gini: 0.184397443932
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.328618249443: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,329.0,9935.0,0.9679,(9935.0/10264.0)
NON DETRACTOR,145.0,10266.0,0.0139,(145.0/10411.0)
Total,474.0,20201.0,0.4875,(10080.0/20675.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3286182,0.6707174,357.0
max f2,0.1896887,0.8353392,396.0
max f0point5,0.4431816,0.5790450,269.0
max accuracy,0.4976830,0.5671100,206.0
max precision,0.8941962,1.0,0.0
max recall,0.1896887,1.0,396.0
max specificity,0.8941962,1.0,0.0
max absolute_mcc,0.4976830,0.1340421,206.0
max min_per_class_accuracy,0.5014615,0.5640101,201.0


Gains/Lift Table: Avg response rate: 50.36 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100121,0.7028346,1.3910756,1.3910756,0.7004831,0.7004831,0.0139276,0.0139276,39.1075586,39.1075586
,2,0.0200242,0.6770226,1.3335138,1.3622947,0.6714976,0.6859903,0.0133513,0.0272788,33.3513837,36.2294712
,3,0.0300363,0.6608922,1.3239202,1.3495032,0.6666667,0.6795491,0.0132552,0.0405341,32.3920213,34.9503212
,4,0.04,0.6474268,1.2532254,1.3255211,0.6310680,0.6674728,0.0124868,0.0530208,25.3225444,32.5521083
,5,0.0500121,0.6380550,1.2759521,1.3155977,0.6425121,0.6624758,0.0127749,0.0657958,27.5952089,31.5597697
,6,0.1000242,0.6066923,1.2022834,1.2589406,0.6054159,0.6339458,0.0601287,0.1259245,20.2283443,25.8940570
,7,0.1500363,0.5853330,1.1734747,1.2304520,0.5909091,0.6196003,0.0586879,0.1846124,17.3474734,23.0451958
,8,0.2,0.5697029,1.1996024,1.2227452,0.6040658,0.6157195,0.0599366,0.2445490,19.9602439,22.2745173
,9,0.3000242,0.5424444,1.1072147,1.1842288,0.5575435,0.5963244,0.1107482,0.3552973,10.7214704,18.4228809






In [42]:
# Test-set confusion matrix, AUC and accuracy; only available when
# `test_metrics` is a binomial metrics object.
from h2o.model.metrics_base import H2OBinomialModelMetrics
if isinstance(test_metrics, H2OBinomialModelMetrics):
    # Confusion matrix evaluated at the threshold that maximises accuracy.
    print(test_metrics.confusion_matrix(metrics="accuracy"))
    print('AUC: %s' % (test_metrics.auc(),))
    print('Accuracy: %s' % (test_metrics.accuracy(),))

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.497682978678: 


0,1,2,3,4
,DETRACTOR,NON DETRACTOR,Error,Rate
DETRACTOR,5605.0,4659.0,0.4539,(4659.0/10264.0)
NON DETRACTOR,4291.0,6120.0,0.4122,(4291.0/10411.0)
Total,9896.0,10779.0,0.4329,(8950.0/20675.0)



AUC: 0.592198721966
Accuracy: [[0.49768297867849487, 0.5671100362756953]]


### Oracle Non-Prepaid segment - Predictions

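Make predictions on the full `vf_pos` dataset: write it to HDFS as Parquet, import it into H2O, and score it with the trained AutoML model.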

In [43]:
# Print the column names and types of `vf_pos`, the frame that is written to
# HDFS and scored in the cells below.
vf_pos.printSchema()

root
 |-- nif: string (nullable = true)
 |-- partitioned_month: string (nullable = true)
 |-- AC_prodhz: string (nullable = true)
 |-- AC_prodadsl: string (nullable = true)
 |-- AC_x_plan: string (nullable = true)
 |-- AC_promocion_vf: string (nullable = true)
 |-- AC_prodlpd: string (nullable = true)
 |-- AC_puntos: integer (nullable = true)
 |-- AC_sistema_operativo: string (nullable = true)
 |-- AC_promocion_tarifa: string (nullable = true)
 |-- AC_flag_huella_von: string (nullable = true)
 |-- AC_prodtivo: string (nullable = true)
 |-- AC_x_tipo_ident: string (nullable = true)
 |-- AC_part_status: string (nullable = true)
 |-- AC_pprecios_destino: string (nullable = true)
 |-- AC_x_sexo: string (nullable = true)
 |-- AC_plandatos: string (nullable = true)
 |-- AC_ppid_destino: string (nullable = true)
 |-- AC_cod_segfid: integer (nullable = true)
 |-- AC_modelo: string (nullable = true)
 |-- AC_tarifa_canje: string (nullable = true)
 |-- AC_x_nacionalidad: string (nullable = true)


In [44]:
# List the keys of the objects currently held by the H2O cluster.
h2o.ls()

Unnamed: 0,key
0,AutoML_20180410_163527
1,AutoML_20180410_164942
2,AutoML_Feedback_AutoML_20180410_163527
3,AutoML_Feedback_AutoML_20180410_164942
4,AutoML_Leaderboard_automl_py_7_sid_af3d
5,DRF_0_AutoML_20180410_164942
6,DRF_0_AutoML_20180410_164942_cv_1
7,DRF_0_AutoML_20180410_164942_cv_2
8,DRF_0_AutoML_20180410_164942_cv_3
9,DRF_0_AutoML_20180410_164942_cv_4


In [45]:
# Persist `vf_pos` as Parquet under a month-specific temporary HDFS directory,
# replacing any previous copy, so that it can be imported into H2O afterwards.
(vf_pos.write
    .mode('overwrite')
    .format('parquet')
    .save('/tmp/bbergua/tmp/vf_pos_all-'+month))

In [46]:
# Discover the HDFS namenode hosts and turn each one into an hdfs://<host> URI
# prefix (stored in `nodes`).
#
# Reference shell sessions:
#   $ hdfs getconf -confKey fs.defaultFS
#   hdfs://nameservice1
#   $ hdfs getconf -namenodes
#   vgddp350hr.dc.sedc.internal.vodafone.com vgddp351hr.dc.sedc.internal.vodafone.com
#
# The command is fixed, so it is passed as an argument list and no shell is
# spawned.
out = subprocess.check_output(['hdfs', 'getconf', '-namenodes'])
# split() without a separator handles any run of spaces, tabs or newlines
# between hosts and never yields empty entries; split(' ') would turn repeated
# whitespace into bogus 'hdfs://' URIs.
nodes = ['hdfs://'+n for n in out.split()]

In [47]:
# For every namenode URI prefix in `nodes`, build the full URI of the
# month-specific temporary directory for `vf_pos`.
path_vf_pos_all = [n+'/tmp/bbergua/tmp/vf_pos_all-'+month for n in nodes]
# Display the first candidate URI.
path_vf_pos_all[0]

'hdfs://vgddp350hr.dc.sedc.internal.vodafone.com/tmp/bbergua/tmp/vf_pos_all-201712'

In [48]:
# Import the dataset into H2O from the first URI in `path_vf_pos_all`.
# NOTE(review): only path_vf_pos_all[0] is tried; presumably that namenode has
# to be the active one for the import to succeed -- confirm, and fall back to
# the other entries if it is not.
vf_pos_all_df_h2o = h2o.import_file(path=path_vf_pos_all[0])
# Display the number of imported rows as a sanity check.
vf_pos_all_df_h2o.nrows
# Cleanup of the temporary HDFS copy, left disabled:
#subprocess.call('hdfs dfs -rm -r /tmp/bbergua/tmp/vf_pos_all-'+month, shell=True)

Parse progress: |█████████████████████████████████████████████████████████| 100%


3285656

In [51]:
# Disabled alternative (`if False`): upload the data to H2O from a pandas
# DataFrame instead of importing it from HDFS. Nothing in this cell executes.
# NOTE(review): within this cell `vf_pos_all_pd` is only assigned in the
# commented-out line below, so that line must be restored before enabling
# the block.
if False:
    #h2o.remove_all()
    #vf_pos_all_pd = generate_pandas_data(vf_pos)

    # Copy to H2O

    #print balanced_pd.columns.values.tolist()
    #os.environ["PYTHONIOENCODING"] = "UTF-8"
    print 'Copying data to H2O'
    # Drop any frame previously stored under the destination key.
    h2o.remove('vf_pos_all_df_h2o.hex')
    vf_pos_all_df_h2o = h2o.H2OFrame(vf_pos_all_pd, destination_frame='vf_pos_all_df_h2o.hex', header=1)
    vf_pos_all_df_h2o.head()

    #preds = aml.leader.predict()

In [54]:
#vf_pos_all_pd.dtypes
#print vf_pos_all_pd.columns.values.tolist()

In [55]:
# List the keys of the objects currently held by the H2O cluster.
h2o.ls()

Unnamed: 0,key
0,AutoML_20180410_163527
1,AutoML_20180410_164942
2,AutoML_Feedback_AutoML_20180410_163527
3,AutoML_Feedback_AutoML_20180410_164942
4,AutoML_Leaderboard_automl_py_7_sid_af3d
5,DRF_0_AutoML_20180410_164942
6,DRF_0_AutoML_20180410_164942_cv_1
7,DRF_0_AutoML_20180410_164942_cv_2
8,DRF_0_AutoML_20180410_164942_cv_3
9,DRF_0_AutoML_20180410_164942_cv_4


In [56]:
#vf_pos_all_df_h2o[''] = vf_pos_all_df_h2o[''].asnumeric()
#vf_pos_all_df_h2o[''] = vf_pos_all_df_h2o[''].asnumeric()
#vf_pos_all_df_h2o[''] = vf_pos_all_df_h2o[''].asnumeric()

In [57]:
# Summarise the imported frame: dimensions, per-column type, min/mean/max/sigma,
# zero and missing counts, followed by the first rows.
# NOTE(review): the displayed rows include customer identifiers (nif); clear
# this output before sharing the notebook.
vf_pos_all_df_h2o.describe()

Rows:3285656
Cols:282




Unnamed: 0,nif,partitioned_month,AC_prodhz,AC_prodadsl,AC_x_plan,AC_promocion_vf,AC_prodlpd,AC_puntos,AC_sistema_operativo,AC_promocion_tarifa,AC_flag_huella_von,AC_prodtivo,AC_x_tipo_ident,AC_part_status,AC_pprecios_destino,AC_x_sexo,AC_plandatos,AC_ppid_destino,AC_cod_segfid,AC_modelo,AC_tarifa_canje,AC_x_nacionalidad,AC_flag_cobertura_adsl,AC_codigo_postal,AC_x_dia_emision,AC_x_subtipo,AC_prodvfbox,AC_prodftth,AC_prodadslenprov,AC_x_tipo_cliente,AC_sum_flagvfbox,AC_sum_flaghz,AC_sum_flagvoz,AC_sum_flagtivo,AC_sum_flaglpd,AC_sum_flagfutbol,AC_sum_flagftth,AC_sum_flagadsl,AC_sum_arpu,AC_sum_cantidad_pendiente,AC_sum_cuotas_pendientes,AC_is_mobile_only_new,AC_is_mobile_only,AC_is_convergent_new,AC_is_convergent,AC_max_cobertura_4g,AC_max_flag_huella_vf,AC_max_flag_ebilling,AC_max_lortad,AC_max_flag_financia,AC_max_vfsmartphone,AC_max_flag_nh_real,AC_max_seg_cliente,AC_max_flag_4g_nodos,AC_max_flag_existe_fbb_hogar,AC_max_flag_siebel,AC_max_flag_4g,AC_max_flag_huella_movistar,AC_max_terminal_4g,AC_max_flag_desc_conv,AC_max_flag_financia_simo,AC_max_flag_4g_aperturas,AC_max_flag_huella_jazztel,AC_max_terminalmms,AC_max_num_pospago,AC_max_flag_huella_neba,AC_max_flag_nh_prevista,AC_max_deuda,AC_max_cobertura_4g_plus,AC_max_flag_huella_ono,AC_max_num_prepago,AC_max_flag_cuenta_superintegral,AC_max_flag_huella_euskaltel,AC_max_num_total,AC_min_days_since_install_date,AC_min_days_since_fecha_fin_cp_tarifa,AC_min_days_since_x_fecha_nacimiento,AC_min_days_since_x_fecha_activacion,AC_min_days_since_fecha_fin_cp_vf,AC_min_days_since_fecha_alta_ss_ftth,AC_min_days_since_fecha_alta_ss_adsl,AC_min_days_since_fecha_alta_tivo,AC_min_days_since_fecha_alta_ss_hz,AC_min_days_since_fecha_alta_ss_adslenprov,AC_min_days_since_fecha_alta_vfbox,AC_min_days_since_fecha_ultima_financiacion,AC_min_days_since_fecha_alta_futbol,AC_min_days_since_fecha_alta_servicio,AC_min_days_since_fecha_transferencia,AC_min_days_since_fecha_antiguedad_cliente,AC_min_days_since_fecha_canje,AC_min_days_since_x
_fecha_creacion_cuenta,AC_min_days_since_last_upd,AC_min_days_since_fecha_cambio,AC_min_days_since_x_fecha_recepcion,AC_min_days_since_x_fecha_ini_prov,AC_min_days_since_x_fecha_creacion_servicio,AC_min_x_dia_emision,AC_min_years_since_install_date,AC_min_years_since_fecha_fin_cp_tarifa,AC_min_years_since_x_fecha_nacimiento,AC_min_years_since_x_fecha_activacion,AC_min_years_since_fecha_fin_cp_vf,AC_min_years_since_fecha_alta_ss_ftth,AC_min_years_since_fecha_alta_ss_adsl,AC_min_years_since_fecha_alta_tivo,AC_min_years_since_fecha_alta_ss_hz,AC_min_years_since_fecha_alta_ss_adslenprov,AC_min_years_since_fecha_alta_vfbox,AC_min_years_since_fecha_ultima_financiacion,AC_min_years_since_fecha_alta_futbol,AC_min_years_since_fecha_alta_servicio,AC_min_years_since_fecha_transferencia,AC_min_years_since_fecha_antiguedad_cliente,AC_min_years_since_fecha_canje,AC_min_years_since_x_fecha_creacion_cuenta,AC_min_years_since_last_upd,AC_min_years_since_fecha_cambio,AC_min_years_since_x_fecha_recepcion,AC_min_years_since_x_fecha_ini_prov,AC_min_years_since_x_fecha_creacion_servicio,AC_avg_days_since_install_date,AC_avg_days_since_fecha_fin_cp_tarifa,AC_avg_days_since_x_fecha_nacimiento,AC_avg_days_since_x_fecha_activacion,AC_avg_days_since_fecha_fin_cp_vf,AC_avg_days_since_fecha_alta_ss_ftth,AC_avg_days_since_fecha_alta_ss_adsl,AC_avg_days_since_fecha_alta_tivo,AC_avg_days_since_fecha_alta_ss_hz,AC_avg_days_since_fecha_alta_ss_adslenprov,AC_avg_days_since_fecha_alta_vfbox,AC_avg_days_since_fecha_ultima_financiacion,AC_avg_days_since_fecha_alta_futbol,AC_avg_days_since_fecha_alta_servicio,AC_avg_days_since_fecha_transferencia,AC_avg_days_since_fecha_antiguedad_cliente,AC_avg_days_since_fecha_canje,AC_avg_days_since_x_fecha_creacion_cuenta,AC_avg_days_since_last_upd,AC_avg_days_since_fecha_cambio,AC_avg_days_since_x_fecha_recepcion,AC_avg_days_since_x_fecha_ini_prov,AC_avg_days_since_x_fecha_creacion_servicio,AC_avg_x_dia_emision,AC_avg_years_since_install_date,AC_avg_years_since_fech
a_fin_cp_tarifa,AC_avg_years_since_x_fecha_nacimiento,AC_avg_years_since_x_fecha_activacion,AC_avg_years_since_fecha_fin_cp_vf,AC_avg_years_since_fecha_alta_ss_ftth,AC_avg_years_since_fecha_alta_ss_adsl,AC_avg_years_since_fecha_alta_tivo,AC_avg_years_since_fecha_alta_ss_hz,AC_avg_years_since_fecha_alta_ss_adslenprov,AC_avg_years_since_fecha_alta_vfbox,AC_avg_years_since_fecha_ultima_financiacion,AC_avg_years_since_fecha_alta_futbol,AC_avg_years_since_fecha_alta_servicio,AC_avg_years_since_fecha_transferencia,AC_avg_years_since_fecha_antiguedad_cliente,AC_avg_years_since_fecha_canje,AC_avg_years_since_x_fecha_creacion_cuenta,AC_avg_years_since_last_upd,AC_avg_years_since_fecha_cambio,AC_avg_years_since_x_fecha_recepcion,AC_avg_years_since_x_fecha_ini_prov,AC_avg_years_since_x_fecha_creacion_servicio,AC_max_days_since_install_date,AC_max_days_since_fecha_fin_cp_tarifa,AC_max_days_since_x_fecha_nacimiento,AC_max_days_since_x_fecha_activacion,AC_max_days_since_fecha_fin_cp_vf,AC_max_days_since_fecha_alta_ss_ftth,AC_max_days_since_fecha_alta_ss_adsl,AC_max_days_since_fecha_alta_tivo,AC_max_days_since_fecha_alta_ss_hz,AC_max_days_since_fecha_alta_ss_adslenprov,AC_max_days_since_fecha_alta_vfbox,AC_max_days_since_fecha_ultima_financiacion,AC_max_days_since_fecha_alta_futbol,AC_max_days_since_fecha_alta_servicio,AC_max_days_since_fecha_transferencia,AC_max_days_since_fecha_antiguedad_cliente,AC_max_days_since_fecha_canje,AC_max_days_since_x_fecha_creacion_cuenta,AC_max_days_since_last_upd,AC_max_days_since_fecha_cambio,AC_max_days_since_x_fecha_recepcion,AC_max_days_since_x_fecha_ini_prov,AC_max_days_since_x_fecha_creacion_servicio,AC_max_x_dia_emision,AC_max_years_since_install_date,AC_max_years_since_fecha_fin_cp_tarifa,AC_max_years_since_x_fecha_nacimiento,AC_max_years_since_x_fecha_activacion,AC_max_years_since_fecha_fin_cp_vf,AC_max_years_since_fecha_alta_ss_ftth,AC_max_years_since_fecha_alta_ss_adsl,AC_max_years_since_fecha_alta_tivo,AC_max_years_since_fecha_alta_ss_h
z,AC_max_years_since_fecha_alta_ss_adslenprov,AC_max_years_since_fecha_alta_vfbox,AC_max_years_since_fecha_ultima_financiacion,AC_max_years_since_fecha_alta_futbol,AC_max_years_since_fecha_alta_servicio,AC_max_years_since_fecha_transferencia,AC_max_years_since_fecha_antiguedad_cliente,AC_max_years_since_fecha_canje,AC_max_years_since_x_fecha_creacion_cuenta,AC_max_years_since_last_upd,AC_max_years_since_fecha_cambio,AC_max_years_since_x_fecha_recepcion,AC_max_years_since_x_fecha_ini_prov,AC_max_years_since_x_fecha_creacion_servicio,CCC_Pagar_menos,CCC_Incidencia_Provision_Neba,CCC_Incidencia_Provision_Fibra,CCC_Incidencia_Provision_DSL,CCC_Incidencia_Tecnica,CCC_Incidencia_SGI,CCC_Incidencia_Resto,CCC_Incidencia_Provision_Movil,CCC_Resultado_No_Aplica,CCC_Resultado_Informacion,CCC_Resultado_Solucionado,CCC_Resultado_Retenido,CCC_Resultado_No_Retenido,CCC_Resultado_Escalo,CCC_Resultado_Envio_tecnico,CCC_Resultado_Transferencia,CCC_Resultado_Abono,CCC_Resultado_Bajas,CCC_Resultado_Reclamacion,CCC_Desactivacion_BA_Movil_TV,CCC_Desactivacion_TV,CCC_Desactivacion_Movil,CCC_Desactivacion_Total,CCC_Desactivacion_NET,CCC_Desactivacion_Fijo,CCC_Desactivacion_USB,CCC_Desactivacion_Resto,CCC_Ofrecimiento,CCC_Transferencia,CCC_Cobro,CCC_Precios,CCC_Portabilidad_Inversa,CCC_Portabilidad,CCC_Informacion,CCC_Cierre,CCC_Alta,CCC_Factura,CCC_Averia_DSL,CCC_Averia_Fibra,CCC_Averia_TV,CCC_Averia_Resto,CCC_Averia_Neba,CCC_Averia_Modem_Router,CCC_Baja,CCC_Consulta_Tecnica_TV,CCC_Consulta_Tecnica_Fibra,CCC_Consulta_Tecnica_Neba,CCC_Consulta_Tecnica_DSL,CCC_Consulta_Tecnica_Movil,CCC_Consulta_Tecnica_Modem_Router,CCC_Consulta_Tecnica_Resto,CCC_Consulta_Ficha,CCC_Consulta_Resto,CCC_Productos_Voz,CCC_Productos_Datos,CCC_Productos_Resto,CCC_Provision_Neba,CCC_Provision_Fibra,CCC_Provision_DSL,CCC_Provision_Resto,CCC_Provision_Movil,SEGMENTACION,TNPS01,TNPS,TNPS2DET,TNPS2PRO,TNPS4
type,string,enum,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,int,enum,string,enum,enum,enum,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,real,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,int,int,int,int,int,int,int,int,int,int,real,int,int,real,real,real,real,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,int,enum,enum,enum,enum
mins,,,,,,,,-5256.0,,,,,,,,,,,0.0,,,,,,1.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-41.3698997498,-868.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-650763.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-745.0,-1.0,100.0,-1.0,1.0,-1.0,-1.0,-1781.7286377,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-2.03974080086,-1.0,0.273790687323,-1.0,-1.0,-1.0,-650763.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-745.0,-1.0,100.0,-1.0,1.0,-1.0,-1.0,-1781.7286377,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-2.03974080086,-1.0,0.273790687323,-1.0,-1.0,-1.0,-650763.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,100.0,-1.0,-745.0,-1.0,100.0,-1.0,1.0,-1.0,-1.0,-1781.7286377,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.273790687323,-1.0,-2.03974080086,-1.0,0.273790687323,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,,,
mean,,,,,,,,-2.46408753686,,,,,,,,,,,28.1123172968,,,,,,10.4388201321,,,,,,9.13059675146e-06,0.14361576501,1.33782994933,0.150477104116,0.1475595741,0.0411686433394,0.301201647403,0.142745010433,40.8464686021,16916.151972,5.24267026128,0.855375304049,0.313780870548,0.0,0.541594433501,0.303783171458,0.162810105501,0.840382864183,0.134133944637,0.31883465585,0.815424682316,0.000791318385126,3.74701520792,0.0462379506558,0.519569912371,0.999332857731,0.0153664899795,0.663495204611,0.739966995936,0.396495250872,0.180183500646,0.00325657950802,0.191868534016,0.0,2.07510828888,0.13972856562,6.23924111349e-05,0.0543772689533,0.843805924905,0.339180060238,0.0985681398174,0.0270223054392,0.0284208085083,2.17367642869,-1.0,-1.0,18711.6594467,2154.20064334,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,293.683998568,-1.0,-1.0,1213.44654736,695.56696197,191.661886089,1133.66924687,1303.17968375,1133.7307664,10.3234039108,-1.0,-1.0,51.2117814613,5.8980007328,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.0784384220292,-1.0,-1.0,3.32230374881,1.47260439955,-0.190870489764,2.6730538536,3.56798478787,2.67322228834,-1.0,-1.0,18712.1134175,2178.16418075,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,513.867934544,-1.0,-1.0,1219.85014062,925.905540611,300.319639798,1632.07398604,1686.72449771,1632.05444382,10.4400261638,-1.0,-1.0,51.2130314728,5.96361066978,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.600991257316,-1.0,-1.0,3.33983619103,2.33565090053,0.273228645331,4.26991356556,4.61809482161,4.26986006078,-1.0,-1.0,18712.6868823,2199.49563436,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,882.502310041,-1.0,-1.0,1225.75052592,1225.59781943,444.372996747,2215.40950483,2159.12648585,2215.27481301,10.556366826,-1.0,-1.0,51.2146084463,6.02201420605,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.72920918743,-1.0,-1.0,3.35599089672,3.33361313401,0.870898726378,6.04432636727,5.91148753512,6.04395759359,1.82611935029e-06,0.00153485331392,0.00101775718456,0.0009282
77336398,4.68703966575e-05,0.0,7.00012417612e-05,0.0,0.0187286800566,0.11196729055,0.135870279786,0.0208171518869,0.00933938306384,0.022862101206,0.00235447654898,0.0166764262601,0.00212499421729,0.0,0.00184072830509,0.00190738166138,0.00475369302203,0.00693042728758,0.0,0.00595101860937,0.000924929450922,0.000635793887126,0.000329310189502,0.000422746629592,0.0853193395778,0.014655825199,0.0,0.0,0.0224232238554,0.229429069872,0.00444538320506,0.0056244475989,0.0540248279187,0.0176056166562,0.0118277750318,0.00772265873238,0.00302679282311,0.0051469173888,0.000223395267186,0.00567253540845,0.00184377183734,0.00426551044905,0.00104727944739,0.00468977884477,0.00457808121118,0.000841232314034,0.00101653977166,7.60883062621e-06,0.000101958330391,1.8869899953e-05,0.000321092652426,0.041542693453,0.00268256932558,0.00287644232993,0.00229451896364,4.26094515068e-06,9.13059675146e-07,,7.60165696766,,,,
maxs,,,,,,,,5995.0,,,,,,,,,,,49.0,,,,,,22.0,,,,,,6.0,14.0,49.0,6.0,14.0,3.0,32.0,15.0,13771.4887385,55308824612.1,217.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,49.0,1.0,1.0,1.0,1.0,1.0,40.0,1.0,1.0,56.0,-1.0,-1.0,96888.0,96888.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7091.0,-1.0,-1.0,7937.0,13979.0,6028.0,8222.0,96888.0,8222.0,22.0,-1.0,-1.0,265.270324707,265.270324707,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.4144992828,-1.0,-1.0,21.7307682037,38.2732009888,16.5041027069,22.5110721588,265.270324707,22.5110721588,-1.0,-1.0,96888.0,96888.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7091.0,-1.0,-1.0,7937.0,13979.0,6028.0,52206.5,96888.0,8222.0,22.0,-1.0,-1.0,265.270324707,265.270324707,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.4144992828,-1.0,-1.0,21.7307682037,38.2732009888,16.5041027069,142.936537743,265.270324707,22.5110721588,-1.0,-1.0,96888.0,96888.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7098.0,-1.0,-1.0,7995.0,13979.0,6035.0,96888.0,96888.0,8226.0,22.0,-1.0,-1.0,265.270324707,265.270324707,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.4336643219,-1.0,-1.0,21.8895664215,38.2732009888,16.5232696533,265.270324707,265.270324707,22.5220222473,4.0,12.0,5.0,12.0,2.0,0.0,6.0,0.0,48.0,118.0,117.0,30.0,8.0,14.0,4.0,32.0,3.0,0.0,5.0,6.0,7.0,26.0,0.0,8.0,5.0,3.0,5.0,3.0,170.0,59.0,0.0,0.0,41.0,165.0,18.0,69.0,75.0,44.0,24.0,18.0,9.0,17.0,3.0,22.0,4.0,12.0,5.0,11.0,16.0,7.0,7.0,1.0,4.0,2.0,3.0,28.0,38.0,33.0,19.0,1.0,3.0,,10.0,,,,
sigma,,,,,,,,23.7435417635,,,,,,,,,,,8.80394470514,,,,,,7.98326123849,,,,,,0.00567990918116,0.36576980846,0.954236839006,0.361783162464,0.410323558092,0.199939943816,0.468976097662,0.355955219557,33.369590158,30512892.7559,10.1453059699,0.351722093311,0.464028556621,0.0,0.498266975289,0.459890226649,0.369192384174,0.366250660882,0.340796221929,0.466024875064,0.3879527234,0.0281192539195,3.28257161144,0.210000038088,0.499616947773,0.0258204839887,0.123005550973,0.472513900348,0.438652367398,0.489169540891,0.384340281131,0.056973460364,0.393770297079,0.0,1.27617031214,0.346705249678,0.0078986414851,0.226760660663,0.363039290041,0.473431109235,0.376801255828,0.162148414886,0.166171822389,1.34085623017,0.0,0.0,10743.976736,1757.87354131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,984.566811316,0.0,0.0,554.543568261,2428.26353343,377.585006066,1556.19645547,1393.45335152,1556.30157053,7.97002574312,0.0,0.0,29.4494008131,4.8128946829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.96510835221,0.0,0.0,1.51828867551,6.78909621637,1.42311042482,4.59070939468,3.81514569403,4.59099231019,0.0,0.0,10744.2060757,1758.92013993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1078.43448814,0.0,0.0,550.101305338,2517.94394357,374.160468462,1465.85752507,1378.80201095,1464.64339181,7.93666677796,0.0,0.0,29.4500071253,4.81576017636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.23281931919,0.0,0.0,1.50612617291,6.95375541536,1.35826269553,4.13345139283,3.77503168569,4.13022119773,0.0,0.0,10746.1700346,1775.6898698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1607.55391754,0.0,0.0,550.962889985,3022.78036099,463.626253492,1796.92981416,1714.81565254,1791.67965745,7.99295059648,0.0,0.0,29.4553763539,4.86167413678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.78628438414,0.0,0.0,1.50848511134,8.286289838,1.63699923135,4.94807587068,4.69500578908,4.93378214566,0.00234058889673,0.0506204467846,0.0359424131563,0.0390533716313,0.00710777418747,0.0,0.00952325514166,0.0,0.232202213179,0.562229675792,0.666815084243,0.188329932
323,0.103032347673,0.186797746189,0.050975517894,0.166029093807,0.0469129890127,0.0,0.0449844952625,0.0463053264426,0.0740421559157,0.0976760996175,0.0,0.0865396581905,0.0325828472861,0.0260033303869,0.0187218208712,0.020923329309,0.583925490664,0.161574411161,0.0,0.0,0.260381736429,0.86791273925,0.080517399037,0.10626159424,0.30874951607,0.216326697316,0.149164874765,0.121350091458,0.0662039368153,0.105839577581,0.0159304978417,0.0864792085182,0.045635930403,0.0719754432731,0.0341927143103,0.0767428202308,0.0768743774892,0.0324493992823,0.0343667707596,0.00275840081345,0.0115057940919,0.00454924933687,0.0187138336031,0.25426067072,0.069187985082,0.0672550564934,0.0644068700334,0.00206420161124,0.00165504653271,,3.15446812872,,,,
zeros,0,,,,,,,3068511,,,,,,,,,,,45720,,,,,,0,,0,,,,3285646,2830540,475187,2796087,2858663,3151199,2310209,2823184,158587,2239173,2239133,475187,2254680,3285656,1506163,2287529,2750718,524447,2844938,2238075,606451,3283056,1186291,3133734,1578528,2192,3235167,1105639,854379,1982909,2693635,3274956,2655242,1849766,43026,2826556,3285451,3106991,513200,2171227,3015192,3196870,3192275,41320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3285653,3281667,3282693,3283251,3285508,3285656,3285446,3285656,3247502,3040997,3036418,3231532,3256903,3225537,3278301,3240066,3278799,3285656,3279866,3279723,3271113,3265536,3285656,3268200,3282811,3283632,3284603,3284291,3133882,3248483,3285656,3285656,3241936,2895082,3273254,3270424,3147563,3249002,3256680,3267055,3277247,3274050,3284969,3268831,3279956,3272870,3282402,3271816,3272261,3283179,3282526,3285631,3285364,3285597,3284648,3174304,3278700,3277777,3279817,3285642,3285655,,26301,,,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1435890,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3000919,3000919,3000919,3000919,3000919,3000919
0,00018573Q,201712,,,TREDM,,,0.0,Android 6,,,,NIF,AC,TREDM,Varon,IREDM,SYUTA,29.0,Samsung Galaxy S7,,España,,88177,8.0,sin IAE,,,,Particulares,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,31.6910991669,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,9596.0,920.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,920.0,-1.0,-1.0,920.0,283.0,283.0,920.0,920.0,920.0,8.0,-1.0,-1.0,26.2729549408,2.51887440681,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.51887440681,-1.0,-1.0,2.51887440681,0.77482765913,0.77482765913,2.51887440681,2.51887440681,2.51887440681,-1.0,-1.0,9596.0,920.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,920.0,-1.0,-1.0,920.0,283.0,283.0,920.0,920.0,920.0,8.0,-1.0,-1.0,26.2729549408,2.51887440681,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.51887440681,-1.0,-1.0,2.51887440681,0.77482765913,0.77482765913,2.51887440681,2.51887440681,2.51887440681,-1.0,-1.0,9596.0,920.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,920.0,-1.0,-1.0,920.0,283.0,283.0,920.0,920.0,920.0,8.0,-1.0,-1.0,26.2729549408,2.51887440681,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.51887440681,-1.0,-1.0,2.51887440681,0.77482765913,0.77482765913,2.51887440681,2.51887440681,2.51887440681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
1,00027620Y,201712,,,TSMAS,CPP24000,,0.0,Android 7,,N,,NIF,AC,TSMAS,Mujer,ISMAS,UIXS2,29.0,LG G6,S,España,D,89257,1.0,sin IAE,,,,Particulares,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,27.6774997711,388.5,21.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,7.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,-1.0,-1.0,9596.0,1616.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1616.0,-1.0,-1.0,1621.0,209.0,209.0,1616.0,1616.0,1616.0,1.0,-1.0,-1.0,26.2729549408,4.42445755005,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.42445755005,-1.0,-1.0,4.43814706802,0.572222590446,0.572222590446,4.42445755005,4.42445755005,4.42445755005,-1.0,-1.0,9596.0,1616.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1616.0,-1.0,-1.0,1621.0,209.0,209.0,1616.0,1616.0,1616.0,1.0,-1.0,-1.0,26.2729549408,4.42445755005,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.42445755005,-1.0,-1.0,4.43814706802,0.572222590446,0.572222590446,4.42445755005,4.42445755005,4.42445755005,-1.0,-1.0,9596.0,1616.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1616.0,-1.0,-1.0,1621.0,209.0,209.0,1616.0,1616.0,1616.0,1.0,-1.0,-1.0,26.2729549408,4.42445755005,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.42445755005,-1.0,-1.0,4.43814706802,0.572222590446,0.572222590446,4.42445755005,4.42445755005,4.42445755005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
2,00028706G,201712,,,SYUTA,,,0.0,iOS 10,,N,,NIF,AC,SYUTA,Varon,IPDSY,NVMIN,29.0,Apple iPhone 6S,ESTANDAR,España,D,64154,1.0,sin IAE,,,,Particulares,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,45.3783998489,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,6.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,-1.0,-1.0,22014.0,4119.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1627.0,-1.0,-1.0,-1.0,241.0,-1.0,1.0,-1.0,-1.0,60.2722854614,11.2774391174,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.45457458496,-1.0,-1.0,-1.0,0.659835577011,-1.0,-1.0,-1.0,22014.0,4119.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2545.25,-1.0,-1.0,1627.0,399.25,957.5,2545.25,2605.75,2545.25,1.0,-1.0,-1.0,60.2722854614,11.2774391174,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,6.71934235096,-1.0,-1.0,4.45457458496,0.843793816864,2.37223035097,6.71934235096,7.13430124521,6.71934235096,-1.0,-1.0,22014.0,4119.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4119.0,-1.0,-1.0,1627.0,1277.0,1277.0,4119.0,4119.0,4119.0,1.0,-1.0,-1.0,60.2722854614,11.2774391174,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,11.2774391174,-1.0,-1.0,4.45457458496,3.49630713463,3.49630713463,11.2774391174,11.2774391174,11.2774391174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,


In [58]:
# Remove every TNPS-derived label column before scoring, then run the trained
# AutoML model over the remaining columns.
vf_pos_all_df_h2o_predict = vf_pos_all_df_h2o.drop([
    'TNPS01',
    'TNPS2DET',
    'TNPS2PRO',
    'TNPS',
    'TNPS4',
])
vf_pos_all_preds = aml.predict(vf_pos_all_df_h2o_predict)

Parse progress: |█████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%




In [59]:
#vf_pos_all_preds.head()

Append NIF and actual labels to predictions

In [60]:
# Put the customer id and both label columns next to the model output so that
# every scored row can be traced back to its NIF and compared with the real label.
nif_and_labels = vf_pos_all_df_h2o[['nif', label_preds, label_model]]
vf_pos_all_preds_id = vf_pos_all_preds.cbind(nif_and_labels)
vf_pos_all_preds_id

predict,DETRACTOR,NON DETRACTOR,nif,TNPS,TNPS2DET
NON DETRACTOR,0.49559,0.50441,00018573Q,,
NON DETRACTOR,0.439892,0.560108,00027620Y,,
NON DETRACTOR,0.479954,0.520046,00028706G,,
NON DETRACTOR,0.514411,0.485589,00045283V,PROMOTER,NON DETRACTOR
NON DETRACTOR,0.498984,0.501016,00049003C,,
NON DETRACTOR,0.468719,0.531281,00049353D,,
NON DETRACTOR,0.579059,0.420941,00066859U,,
NON DETRACTOR,0.427448,0.572552,00080906R,,
NON DETRACTOR,0.604798,0.395202,00088614V,,
NON DETRACTOR,0.490356,0.509644,00100487X,,




Count number of rows and proportion of predictions

In [61]:
# Percentage of rows that fall in each class of the model's own `predict` column.
vf_pos_all_preds_groupby = vf_pos_all_preds_id.group_by('predict').count().get_frame()
vf_pos_all_preds_nrows = vf_pos_all_preds_id.nrows
proportion = vf_pos_all_preds_groupby['nrow'] * 100 / vf_pos_all_preds_nrows
proportion.set_names(['proportion'])  # give the computed column a readable header
vf_pos_all_preds_groupby.cbind(proportion)

predict,nrow,proportion
DETRACTOR,11418.0,0.347511
NON DETRACTOR,3274240.0,99.6525




As the proportions of predictions are highly unbalanced, let's create a new prediction column (**predict2**) by applying a threshold to the scoring column

In [62]:
# Threshold taken from the test-set metrics; accuracy() is indexed here as
# [[threshold, value]], so [0][0] is presumably the threshold that maximises
# accuracy -- TODO confirm against the H2O version in use.
th_max_acc = test_metrics.accuracy()[0][0]
# Rows whose "NON DETRACTOR" score reaches the threshold are labelled NON DETRACTOR ...
vf_pos_all_preds_id[vf_pos_all_preds_id["NON DETRACTOR"] >= th_max_acc, "predict2"] = "NON DETRACTOR"
# ... and rows strictly below it are labelled DETRACTOR.
vf_pos_all_preds_id[vf_pos_all_preds_id["NON DETRACTOR"] < th_max_acc, "predict2"] = "DETRACTOR"
# Turn the freshly created column into a factor (categorical) column.
vf_pos_all_preds_id['predict2'] = vf_pos_all_preds_id['predict2'].asfactor()
vf_pos_all_preds_id

predict,DETRACTOR,NON DETRACTOR,nif,TNPS,TNPS2DET,predict2
NON DETRACTOR,0.49559,0.50441,00018573Q,,,NON DETRACTOR
NON DETRACTOR,0.439892,0.560108,00027620Y,,,NON DETRACTOR
NON DETRACTOR,0.479954,0.520046,00028706G,,,NON DETRACTOR
NON DETRACTOR,0.514411,0.485589,00045283V,PROMOTER,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.498984,0.501016,00049003C,,,NON DETRACTOR
NON DETRACTOR,0.468719,0.531281,00049353D,,,NON DETRACTOR
NON DETRACTOR,0.579059,0.420941,00066859U,,,DETRACTOR
NON DETRACTOR,0.427448,0.572552,00080906R,,,NON DETRACTOR
NON DETRACTOR,0.604798,0.395202,00088614V,,,DETRACTOR
NON DETRACTOR,0.490356,0.509644,00100487X,,,NON DETRACTOR




Compare the actual label with the newly created **predict2** column

In [63]:
# Cross-tabulate the actual binary label against the thresholded prediction.
# The label column is referenced through `label_model` (the same variable used
# when the column was appended to the predictions) instead of a hard-coded
# name, so this cell keeps working if the modelling label is changed.
# Proportions are relative to ALL rows, including those without a label.
vf_pos_all_preds_nrows = vf_pos_all_preds_id.nrows
vf_pos_all_preds_groupby = vf_pos_all_preds_id.group_by(by=[label_model, 'predict2']).count().get_frame()
proportion = 100*vf_pos_all_preds_groupby['nrow']/vf_pos_all_preds_nrows
proportion.set_names(['proportion'])
vf_pos_all_preds_groupby.cbind(proportion)

TNPS2DET,predict2,nrow,proportion
,DETRACTOR,1552400.0,47.2479
,NON DETRACTOR,1448520.0,44.0861
DETRACTOR,DETRACTOR,51657.0,1.5722
DETRACTOR,NON DETRACTOR,17518.0,0.533166
NON DETRACTOR,DETRACTOR,76410.0,2.32556
NON DETRACTOR,NON DETRACTOR,139152.0,4.23514




Count number of rows and proportion of predictions using the new **predict2** column

In [64]:
# Same row-count / percentage summary as before, now for the thresholded `predict2` column.
vf_pos_all_preds_nrows = vf_pos_all_preds_id.nrows
vf_pos_all_preds_groupby = (vf_pos_all_preds_id
                            .group_by('predict2')
                            .count()
                            .get_frame())
proportion = (100 * vf_pos_all_preds_groupby['nrow']) / vf_pos_all_preds_nrows
proportion.set_names(['proportion'])
vf_pos_all_preds_groupby.cbind(proportion)

predict2,nrow,proportion
DETRACTOR,1680470.0,51.1456
NON DETRACTOR,1605190.0,48.8544




Since what we really need is a prediction among ['PROMOTER', 'NEUTRAL', 'DETRACTOR'], we compute cut points over the predictions so that the original class proportions are preserved

In [65]:
print 'nrows:', vf_pos_all_preds_nrows
#print 'proportions', vf_pos_proportions


if 'HARD DETRACTOR' in vf_pos_proportions.keys() and 'SOFT DETRACTOR' in vf_pos_proportions.keys():
    vf_pos_proportions['DETRACTOR'] = vf_pos_proportions['HARD DETRACTOR'] + vf_pos_proportions['SOFT DETRACTOR']
    del vf_pos_proportions['HARD DETRACTOR']
    del vf_pos_proportions['SOFT DETRACTOR']

vf_pos_counts = {}
for k in vf_pos_proportions.keys():
    vf_pos_counts[k] = int(vf_pos_all_preds_nrows*vf_pos_proportions[k]/100)
diff = vf_pos_all_preds_nrows - sum(vf_pos_counts.values())
min_key = min(vf_pos_counts, key=vf_pos_counts.get) # Get the key with minimum value
vf_pos_counts[min_key] = vf_pos_counts[min_key] + diff # Assign rest to key with minimum num rows
print 'counts:', vf_pos_counts

ini = 0
end = 0
vf_pos_ranges = {}
for k in ['DETRACTOR', 'NEUTRAL', 'PROMOTER']: # It is necessary to force the order
    if end > 0:
        ini = end + 1
    end = ini + vf_pos_counts[k] - 1
    vf_pos_ranges[k] = [ini, end]
    #print k, ini, end
print 'ranges:', vf_pos_ranges

nrows: 3285656
counts: {u'NEUTRAL': 701623, u'PROMOTER': 1785805, 'DETRACTOR': 798228}
ranges: {'NEUTRAL': [798228, 1499850], 'PROMOTER': [1499851, 3285655], 'DETRACTOR': [0, 798227]}


Using the cut points calculated above, create a new **predict3** column with final predictions

In [66]:
for k in vf_pos_ranges.keys():
    r = range(vf_pos_ranges[k][0], vf_pos_ranges[k][1]+1, 1)
    print k, r[0], r[-1]
    vf_pos_all_preds_id[r, 'predict3'] = k
    #tmp = vf_pos_all_preds_id[r, :]
    #print tmp.head()
vf_pos_all_preds_id['predict3'] = vf_pos_all_preds_id['predict3'].asfactor()
vf_pos_all_preds_id

NEUTRAL 798228 1499850
PROMOTER 1499851 3285655
DETRACTOR 0 798227


predict,DETRACTOR,NON DETRACTOR,nif,TNPS,TNPS2DET,predict2,predict3
NON DETRACTOR,0.49559,0.50441,00018573Q,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.439892,0.560108,00027620Y,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.479954,0.520046,00028706G,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.514411,0.485589,00045283V,PROMOTER,NON DETRACTOR,DETRACTOR,DETRACTOR
NON DETRACTOR,0.498984,0.501016,00049003C,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.468719,0.531281,00049353D,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.579059,0.420941,00066859U,,,DETRACTOR,DETRACTOR
NON DETRACTOR,0.427448,0.572552,00080906R,,,NON DETRACTOR,DETRACTOR
NON DETRACTOR,0.604798,0.395202,00088614V,,,DETRACTOR,DETRACTOR
NON DETRACTOR,0.490356,0.509644,00100487X,,,NON DETRACTOR,DETRACTOR




Count the number of rows and the proportion of predictions using the final **predict3** column, which must match the original proportions

In [67]:
# Row counts and percentages per class of the final `predict3` column.
vf_pos_all_preds_nrows = vf_pos_all_preds_id.nrows
vf_pos_all_preds_groupby = vf_pos_all_preds_id.group_by('predict3').count().get_frame()
proportion = vf_pos_all_preds_groupby['nrow'] * 100 / vf_pos_all_preds_nrows
proportion.set_names(['proportion'])  # rename the computed column for display
vf_pos_all_preds_groupby.cbind(proportion)

predict3,nrow,proportion
DETRACTOR,798228.0,24.2943
NEUTRAL,701623.0,21.3541
PROMOTER,1785800.0,54.3516




Convert predictions to Pandas, to bring predictions back to Spark again

In [68]:
start = time.time()
print 'Converting predictions to Pandas'
vf_pos_all_preds_id_pd = vf_pos_all_preds_id[['nif', label_preds, 'predict3']].as_data_frame()
#vf_pos_all_preds_id_pd
end = time.time()
print 'Process took:', "{0:.2f}".format((end - start)/60), 'minutes'

Converting predictions to Pandas
Process took: 0.14 minutes


Create a Spark DataFrame from the Pandas object created above

In [69]:
# Spark schema for the exported predictions: three nullable string columns.
# The schema is applied positionally, so the pandas columns
# (nif, <actual label>, predict3) end up named nif / label_preds / predict.
pd_schema = (StructType()
             .add('nif', StringType(), True)
             .add('label_preds', StringType(), True)
             .add('predict', StringType(), True))

# Build the Spark DataFrame from pandas, keeping the column order expected by
# the target database.
df_vf_pos_preds = (sqlContext
                   .createDataFrame(vf_pos_all_preds_id_pd, schema=pd_schema)
                   .select('nif', 'label_preds', 'predict'))

Count number of rows of actual label and final predictions

In [70]:
print df_vf_pos_preds.count()
df_vf_pos_preds.groupby('label_preds').count().show()
#print df_vf_pos_preds.filter('label_preds == "NaN"').count()
df_vf_pos_preds.groupby('predict').count().show()

3285656
+-----------+-------+
|label_preds|  count|
+-----------+-------+
|    NEUTRAL|  60803|
|   PROMOTER| 154759|
|        NaN|3000919|
|  DETRACTOR|  69175|
+-----------+-------+

+---------+-------+
|  predict|  count|
+---------+-------+
|  NEUTRAL| 701623|
| PROMOTER|1785805|
|DETRACTOR| 798228|
+---------+-------+



For those clients for whom we have an actual label, use the actual label. Otherwise, use the prediction.

In [71]:
vf_pos_preds = df_vf_pos_preds.withColumn('predict', 
                                          when(df_vf_pos_preds['label_preds'] != "NaN",
                                               df_vf_pos_preds['label_preds'])\
                                         .otherwise(df_vf_pos_preds['predict']))\
                              .drop('label_preds')
#print vf_pos_preds.count()
#vf_pos_preds.groupby('predict').count().show()

vf_pos_preds_count = vf_pos_preds.count()
print vf_pos_preds_count
vf_pos_preds_groupby = vf_pos_preds.groupby('predict').count()
vf_pos_preds_proportions = vf_pos_preds_groupby.withColumn('proportion', 100*vf_pos_preds_groupby['count']/vf_pos_preds_count).cache()
vf_pos_preds_proportions.withColumn('proportion', bround(vf_pos_preds_proportions['proportion'], 2)).show()

3285656
+---------+-------+----------+
|  predict|  count|proportion|
+---------+-------+----------+
|  NEUTRAL| 701574|     21.35|
| PROMOTER|1785774|     54.35|
|DETRACTOR| 798308|      24.3|
+---------+-------+----------+



Write Postpaid predictions to HDFS

In [72]:
# Persist the final Postpaid predictions as parquet, replacing any previous
# output for the same month.
vf_pos_preds_path = '/tmp/bbergua/tmp/vf_pos_preds-' + month
vf_pos_preds.write.format('parquet').mode('overwrite').save(vf_pos_preds_path)

## Write predictions to HDFS

In [99]:
if not 'vf_pre_preds' in globals():
    print 'Reading Vodafone Prepaid predictions from HDFS for month =', month
    vf_pre_preds = spark.read.parquet('/tmp/bbergua/tmp/vf_pre_preds-'+month)

In [76]:
# Print the column names and types of the Prepaid predictions as a sanity check
vf_pre_preds.printSchema()

root
 |-- nif: string (nullable = true)
 |-- predict: string (nullable = true)



In [77]:
print vf_pre_preds.select('nif').count()
print vf_pre_preds.select('nif').distinct().count()

2267021
2267021


In [98]:
if not 'vf_pos_preds' in globals():
    print 'Reading Vodafone Postpaid predictions from HDFS for month =', month
    vf_pos_preds = spark.read.parquet('/tmp/bbergua/tmp/vf_pos_preds-'+month)

In [78]:
# Print the column names and types of the Postpaid predictions as a sanity check
vf_pos_preds.printSchema()

root
 |-- nif: string (nullable = true)
 |-- predict: string (nullable = true)



In [79]:
print vf_pos_preds.select('nif').count()
print vf_pos_preds.select('nif').distinct().count()

3285656
3285656


Merge Prepaid and Postpaid predictions and write the result to HDFS

In [95]:
# For those NIFs shared between Prepaid and Postpaid, drop them from Prepaid DataFrame (and thus, take them from Postpaid)
shared = vf_pre_preds.join(vf_pos_preds.withColumnRenamed('predict', 'predict_pos'), 'nif', 'left_outer')
vf_pre_preds = shared.filter('predict_pos is NULL').drop('predict_pos')

oracle_preds = vf_pre_preds.select('nif', 'predict').union(vf_pos_preds.select('nif', 'predict'))

In [96]:
print oracle_preds.select('nif').count()
print oracle_preds.select('nif').distinct().count()

5281435
5281435


In [97]:
# Persist the merged Prepaid + Postpaid predictions as parquet, replacing any
# previous output for the same month.
oracle_preds_path = '/tmp/bbergua/tmp/oracle_preds-' + month
oracle_preds.write.format('parquet').mode('overwrite').save(oracle_preds_path)