# MODELO DE PORTABILIDAD A OPERADORES

Modelo para ver a qué operador de la competencia es más probable que un cliente realice portabilidad. Pruebo 2 modelos:

- Individual para cada modelo (one versus all)
- Junto las predicciones y clasifico al cliente por majority vote: escojo el que tenga mayor score

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from common.src.main.python.utils.hdfs_generic import *
import os

MAX_N_EXECUTORS=15
MIN_N_EXECUTORS=1
N_CORES_EXECUTOR=4
EXECUTOR_IDLE_MAX_TIME=120
EXECUTOR_MEMORY='32g'
DRIVER_MEMORY='16g'
N_CORES_DRIVER=1
MEMORY_OVERHEAD=N_CORES_EXECUTOR*2048
#QUEUE="root.datascience.normal"
QUEUE="root.BDPtenants.es.medium"

BDA_CORE_VERSION="1.0.0"

SPARK_COMMON_OPTS=os.environ.get('SPARK_COMMON_OPTS', '')
SPARK_COMMON_OPTS+=" --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY)
SPARK_COMMON_OPTS+=" --conf spark.shuffle.manager=tungsten-sort"
SPARK_COMMON_OPTS+="  --queue %s" % QUEUE
APP_NAME='Portability'

# Dynamic allocation configuration
SPARK_COMMON_OPTS+=" --conf spark.driver.allowMultipleContexts=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.shuffle.service.enabled=true"
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS)
SPARK_COMMON_OPTS+=" --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME)
SPARK_COMMON_OPTS+=" --conf spark.ui.port=58201"
SPARK_COMMON_OPTS+=" --conf spark.port.maxRetries=200"
SPARK_COMMON_OPTS+=" --executor-cores=%s" % (N_CORES_EXECUTOR)
SPARK_COMMON_OPTS+=" --conf spark.app.name=%s" % (APP_NAME)

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase
SPARK_COMMON_OPTS+=" --files \
{}/scripts/properties/red_agent/nodes.properties,\
{}/scripts/properties/red_agent/nodes-de.properties,\
{}/scripts/properties/red_agent/nodes-es.properties,\
{}/scripts/properties/red_agent/nodes-ie.properties,\
{}/scripts/properties/red_agent/nodes-it.properties,\
{}/scripts/properties/red_agent/nodes-pt.properties,\
{}/scripts/properties/red_agent/nodes-uk.properties".format(*[BDA_ENV]*7)

os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

#print os.environ.get('SPARK_COMMON_OPTS', '')
#print os.environ.get('PYSPARK_SUBMIT_ARGS', '')

sc, sparkSession, sqlContext = run_sc()
print sc.defaultParallelism

2


In [3]:
# This literal_eval is needed since 
# we have to read from a textfile
# which is formatted as python objects.
# It is totally safe.
from ast import literal_eval

# Standard Library stuff:
from functools import partial
from datetime import date, timedelta, datetime

# Numpy stuff
from numpy import (nan as np_nan, round as np_round, int64 as np_int64)
import numpy as np


# Spark stuff
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.sql.functions import (udf, col, decode, when, lit, lower, upper, concat,
                                   translate, count, sum as sql_sum, max as sql_max, min as sql_min,
                                   round, 
                                   mean, stddev, datediff,
                                   length,
                                   countDistinct,
                                   hour, date_format, collect_set, collect_list,
                                   year, month, dayofmonth,
                                   rank, expr, lag, coalesce, row_number,
                                   isnull, isnan,
                                   unix_timestamp,
                                   regexp_replace
                                  )
from pyspark.sql.types import DoubleType, StringType, IntegerType, ArrayType, FloatType

from pyspark.ml import Pipeline

from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.window import Window

import json
from collections import OrderedDict

from subprocess import Popen, PIPE
import datetime, calendar
from pyspark.sql import functions as F

from pyspark.ml.feature import StandardScaler

In [4]:
spark = (SparkSession.builder
         .master("yarn")
         .config("spark.submit.deployMode", "client")
         .config("spark.ui.showConsoleProgress", "true")
         .enableHiveSupport()
         .getOrCreate()
         )

# sc = spark.sparkContext

In [5]:
%load_ext autoreload
%autoreload 2

#import re
import subprocess
#import sys
import time

from IPython.display import HTML, display
import tabulate

def printHTML(df, sample=7):
    display(HTML(tabulate.tabulate([df.columns]+df.take(sample), tablefmt='html', headers='firstrow')))
    
# Spark utils
from pyspark.sql.functions import (array_contains, bround, col, collect_set, concat, count, decode, desc, 
                                   isnull, length, lit, lower, lpad, max as sql_max, 
                                   size, struct, substring, sum as sql_sum, 
                                   translate, trim, udf, upper, when
                                  )
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType

import matplotlib.pyplot as plt
%matplotlib inline

# Escojo las variables numéricas  

In [7]:
import sys

In [8]:
sys.path.append("/var/SP/data/home/carnaum2/ids/amdocs_inf_dataset")

In [9]:
from src.main.python.configuration.constants import ENVIRONMENT
from src.main.python.utils.spark_creator import SparkCreator
from src.main.python.pipelines.billing import Billing
from src.main.python.pipelines.breakdowns import Breakdowns
from src.main.python.pipelines.call_centre_calls import CallCentreCalls
from src.main.python.pipelines.campaigns import Campaigns
from src.main.python.pipelines.claims import Claims
from src.main.python.pipelines.competitors_web import CompWeb
from src.main.python.pipelines.customer import Customer
from src.main.python.pipelines.customer_aggregations import Customer_Aggregations
from src.main.python.pipelines.penalties import PenaltiesCustomer, PenaltiesServices
from src.main.python.pipelines.device_catalogue import Device_Catalogue
from src.main.python.pipelines.geneva_traffic import GenevaVoiceTypeUsage
from src.main.python.pipelines.geneva_traffic import GenevaVoiceUsage
from src.main.python.pipelines.geneva_traffic import GenevaRoamVoiceUsage
from src.main.python.pipelines.geneva_traffic import GenevaDataUsage
from src.main.python.pipelines.geneva_traffic import GenevaRoamDataUsage
from src.main.python.pipelines.mobile_spinners_extractor import Mobile_spinners_extractor
from src.main.python.pipelines.netscout import Netscout
from src.main.python.pipelines.orders import Orders
from src.main.python.pipelines.orders_aggregations import OrdersAgg
from src.main.python.pipelines.permsandprefs import Perms_and_prefs
from src.main.python.pipelines.services import Services
from src.main.python.pipelines.services_problems import ServiceProblems
from src.main.python.pipelines.tech_suprt import TechSupport
from src.main.python.pipelines.tgs import Tgs
from src.main.python.pipelines.tnps import Tnps
from src.main.python.pipelines.orders_sla import Orders_sla
from src.main.python.pipelines.tickets import Tickets
from src.main.python.pipelines.refund import Refund
sc = SparkCreator()
date = "20191014"
module_constructors = (Customer(sc, date, ENVIRONMENT),
                       Services(sc, date, ENVIRONMENT),
                       Customer_Aggregations(sc, date, ENVIRONMENT),
                       Billing(sc, date, ENVIRONMENT),
                       Campaigns(sc, date, date, ENVIRONMENT),
                       GenevaVoiceTypeUsage(sc, date, date, ENVIRONMENT),
                       GenevaVoiceUsage(sc, date, date, ENVIRONMENT),
                       GenevaDataUsage(sc, date, date, ENVIRONMENT),
                       #GenevaRoamVoiceUsage(sc, date, date, ENVIRONMENT),
                       #GenevaRoamDataUsage(sc, date, date, ENVIRONMENT),
                       Orders(sc, date, date, ENVIRONMENT),
                       OrdersAgg(sc, date, date, ENVIRONMENT),
                       PenaltiesCustomer(sc, date, ENVIRONMENT),
                       PenaltiesServices(sc, date, ENVIRONMENT),
                       Device_Catalogue(sc, date, date, ENVIRONMENT),
                       Perms_and_prefs(sc, date, ENVIRONMENT),
                       CallCentreCalls(sc, date, date, ENVIRONMENT),
                       Tnps(sc, date, date, ENVIRONMENT),
                       Tgs(sc, date, ENVIRONMENT),
                       Claims(sc, date, ENVIRONMENT),
                       Breakdowns(sc, date, ENVIRONMENT),
                       TechSupport(sc, date, ENVIRONMENT),
                       Netscout(sc, date, date, ENVIRONMENT),
                       CompWeb(sc, date, date, ENVIRONMENT),
                       ServiceProblems(sc, date, ENVIRONMENT),
                       Mobile_spinners_extractor(sc, date, ENVIRONMENT),
                       Orders_sla(sc, date, ENVIRONMENT),
                       Refund(sc, date, ENVIRONMENT),
                       #Tickets(sc, date, ENVIRONMENT)
                       )
na_map = {}
for module in module_constructors:
    metadata = module.set_module_metadata()
    na_map.update(metadata)
final_map = {colmn: na_map[colmn][0] for colmn in na_map.keys()
             if colmn in na_map.keys() and na_map[colmn][1] != "id"}
categ_map = {colmn: na_map[colmn][0] for colmn in na_map.keys()
             if colmn in na_map.keys() and na_map[colmn][1] == "categorical"}
numeric_map = {colmn: na_map[colmn][0] for colmn in na_map.keys()
               if colmn in na_map.keys() and na_map[colmn][1] == "numerical"}
date_map = {colmn: na_map[colmn][0] for colmn in na_map.keys()
            if colmn in na_map.keys() and na_map[colmn][1] == "date"}

In [10]:
numeric_variables=numeric_map.keys()

In [11]:
numeric_variables.append('msisdn') #añado el msisdn para hacer el join después

### Extraigo el ids con las variables numéricas seleccionadas  

In [12]:
def ids_numeric_selection(year_, month_, day_):


    ids_completo = (spark.read.load(
            '/data/udf/vf_es/amdocs_inf_dataset/amdocs_ids_service_level/year=' + year_ + '/month=' + month_ + '/day=' + day_))

    ids_numeric=ids_completo.select(numeric_variables) #aqui cojo las variables que se han seleccionado

    return ids_numeric

#Guardo este ids para train y test y luego hago inner join on msisdn con las columnas de target que habiamos creado

In [13]:
ids_julio_numeric=ids_numeric_selection('2019','7','31')
ids_sept_numeric=ids_numeric_selection('2019','9','30') 

In [14]:
categoricas=[item[0] for item in ids_julio_numeric.dtypes if item[1].startswith('string')] #MIRAR SI HAY elimino estas variables que no son numericas: no deberian aparecer (carlos lo va a corregir)

In [18]:
ids_julio_numeric=ids_julio_numeric.drop('tgs_ind_riesgo_o2',
 'tgs_ind_riesgo_mm',
 'tgs_ind_riesgo_mv',
 'tgs_meses_fin_dto_ok',
 'CCC_L2_bucket_1st_interaction',
 'CCC_L2_bucket_latest_interaction',
 'CCC_L2_first_interaction',
 'Cust_Agg_flag_prepaid_nc',
 'tgs_ind_riesgo_max',
 'tgs_sol_24m',
 'CCC_L2_latest_interaction',
 'tgs_tg_marta',
 'tgs_blinda_bi_pos_n12')

ids_sept_numeric=ids_sept_numeric.drop('tgs_ind_riesgo_o2',
 'tgs_ind_riesgo_mm',
 'tgs_ind_riesgo_mv',
 'tgs_meses_fin_dto_ok',
 'CCC_L2_bucket_1st_interaction',
 'CCC_L2_bucket_latest_interaction',
 'CCC_L2_first_interaction',
 'Cust_Agg_flag_prepaid_nc',
 'tgs_ind_riesgo_max',
 'tgs_sol_24m',
 'CCC_L2_latest_interaction',
 'tgs_tg_marta',
 'tgs_blinda_bi_pos_n12')

### Etiqueto los tablones haciendo inner join del tablón básico guardado

In [19]:
train_basic= spark.read.load('/data/udf/vf_es/churn/portabPropension_model/train_final_via2')
test_basic= spark.read.load('/data/udf/vf_es/churn/portabPropension_model/test_final_via2')

msisdn_target_train=train_basic.select('msisdn','Operador_target','masmovil','movistar','orange','otros')
msisdn_target_test=test_basic.select('msisdn','Operador_target','masmovil','movistar','orange','otros')

In [20]:
train_numeric_final2=ids_julio_numeric.join(msisdn_target_train,on='msisdn',how='inner')
test_numeric_final2=ids_sept_numeric.join(msisdn_target_test,on='msisdn',how='inner')

In [21]:
train_numeric_final2=train_numeric_final2.cache()

In [22]:
test_numeric_final2=test_numeric_final2.cache()

In [23]:
train_numeric_final2.groupby('Operador_target').count().show()

+---------------+-------+
|Operador_target|  count|
+---------------+-------+
|              1|  37039|
|              3|  41050|
|              4|  33022|
|              2|  58318|
|              0|5775144|
+---------------+-------+



In [24]:
test_numeric_final2.groupby('Operador_target').count().show()

+---------------+-------+
|Operador_target|  count|
+---------------+-------+
|              1|  36835|
|              3|  39660|
|              4|  37253|
|              2|  48145|
|              0|5717624|
+---------------+-------+



# Procesamiento de los datos

In [25]:
variables_elim=['msisdn','Operador_target','masmovil','movistar','orange','otros']

In [27]:
variables = [i for i in train_numeric_final2.columns if i not in variables_elim] #cojo solo las variables predictoras para el assemble

In [None]:
len(variables)

In [None]:
def indexer_assembler(df_no_transformed): 
    
    assembler = VectorAssembler(inputCols=variables, outputCol="features")

    stages = [assembler]

    pipeline = Pipeline(stages = stages)
    
    pipeline_fit = pipeline.fit(df_no_transformed)
    
    df_transformed=pipeline_fit.transform(df_no_transformed)
    
    return df_transformed

In [None]:
train_via2=indexer_assembler(train_numeric_final2)
test_via2=indexer_assembler(test_numeric_final2)

## Aplico transformaciones sobre el dataframe de train y test para posteriormente aplicar modelos

## Funciones para aplicar modelos y evaluarlos

In [21]:
from pyspark.ml.classification import RandomForestClassifier

model = RandomForestClassifier(featuresCol = 'features', labelCol = 'target', maxDepth=8, numTrees=3000)

In [22]:
#AUC

from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pyspark.sql.functions as F

evaluator = BinaryClassificationEvaluator(labelCol= 'target' , metricName='areaUnderROC') #auc


#LIFT

import utils_model
from utils_model import get_lift
getScore = udf(lambda prob: float(prob[1]), DoubleType())

#FEATURE IMPORTANCE

def ExtractFeatureImp(featureImp, dataset, featuresCol):
   list_extract = []
   for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
       list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
   varlist = pd.DataFrame(list_extract)
   varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
   return(varlist.sort_values('score', ascending = False))

import matplotlib.pyplot as plt

## MODELO DE PORTABILIDAD A MÁSMÓVIL

### Escojo datos de entrenamiento y asigno target

Del tablón de entrenamiento completo transformado con el pipeline (training_df_trans), escojo:

- Clientes que solicitan portabilidad a MásMóvil (Operador_target=1)
- Mismo número de clientes que no solicitan portabilidad a ninguna compañía (Operador_target=0). Estos se eligen aleatoriamente del tablón training_df_noPortab_trans creado antes.

Creo variable masmovil_target:

- 1: el cliente solicita portabilidad a MásMóvil
- 0: resto de clientes 

In [23]:
n=float(37439)/float(5999083) #proporcion que hay que coger de los que no solicitan porta (misma que los que sí: nº clientes que van a masmovil)

train_masmovil=training_df_transf.filter(training_df_transf['Operador_target']==1).union(training_df_noPortab_trans.sample(False, n,5))
train_masmovil=train_masmovil.withColumnRenamed('Operador_target','target')

prediction_df_masmovil=prediction_df_transf.withColumn('target',when(prediction_df_transf['Operador_target']==1,1).otherwise(0))

In [24]:
train_masmovil=train_masmovil.cache()
train_masmovil.count()

74720

In [25]:
prediction_df_masmovil=prediction_df_masmovil.cache()
prediction_df_masmovil.count()

6226600

### Aplico modelo y saco tablón de predicciones

In [None]:
modelMasMovil = model.fit(train_masmovil)

In [None]:
predsMasMovilTest=modelMasMovil.transform(prediction_df_masmovil)

In [31]:
predsMasMovilTest

DataFrame[msisdn: string, score: double, target: int, prediction: double]

In [None]:
aucTrainMasMovil = evaluator.evaluate(predsMasMovilTrain)
aucTestMasMovil = evaluator.evaluate(predsMasMovilTest)

In [None]:
aucTrainMasMovil

In [None]:
aucTestMasMovil

In [None]:
feat_impMasMovil = ExtractFeatureImp(modelMasMovil.featureImportances ,predsMasMovilTest, "features")[0:30]
feat_impMasMovil = feat_impMasMovil.sort_values(by = ['score'], ascending = True)

In [None]:
feat_impMasMovil

In [32]:
predsMasMovilTest=predsMasMovilTest.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsMasMovilTest=predsMasMovilTest.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confusion etc

AnalysisException: u"cannot resolve '`probability`' given input columns: [msisdn, score, target, prediction];;\n'Project [msisdn#42097, cast(<lambda>('probability) as double) AS score#367903, target#289757, prediction#357272]\n+- Sort [score#359927 DESC NULLS LAST], true\n   +- Project [msisdn#42097, score#359927, target#289757, prediction#357272]\n      +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1304 more fields]\n         +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1303 more fields]\n            +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1302 more fields]\n               +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1301 more fields]\n                  +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1300 more fields]\n                     +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1299 more fields]\n                        +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1298 more fields]\n                           +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1297 more fields]\n                              +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1296 more fields]\n                                 +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1295 more fields]\n                                    +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1294 more fields]\n                                       +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1293 more fields]\n                                          +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1292 more fields]\n                                             +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1291 more fields]\n                                                +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1290 more fields]\n                                                   +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1289 more fields]\n                                                      +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1288 more fields]\n                                                         +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1287 more fields]\n                                                            +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1286 more fields]\n                                                               +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1285 more fields]\n                                                                  +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1284 more fields]\n                                                                     +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1283 more fields]\n                                                                        +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1282 more fields]\n                                                                           +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1281 more fields]\n                                                                              +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1280 more fields]\n                                                                                 +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1279 more fields]\n                                                                                    +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1278 more fields]\n                                                                                       +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1277 more fields]\n                                                                                          +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1276 more fields]\n                                                                                             +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1275 more fields]\n                                                                                                +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1274 more fields]\n                                                                                                   +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1273 more fields]\n                                                                                                      +- Project [msisdn#42097, metodo_pago#42098, factura_electronica#42099, superoferta#42100, tipo_documento#42101, nacionalidad#42102, x_datos_navegacion#42103, x_datos_trafico#42104, x_cesion_datos#42105, x_user_facebook#42106, x_user_twitter#42107, marriage2hgbst_elm#42108, gender2hgbst_elm#42109, flg_robinson#42110, x_formato_factura#42111, x_idioma_factura#42112, bam_services#42113L, bam-movil_services#42114L, fbb_services#42115L, fixed_services#42116L, movil_services#42117L, prepaid_services#42118L, tv_services#42119L, tipo_sim#42120, ... 1272 more fields]\n                                                                                                         +- Relation[msisdn#42097,metodo_pago#42098,factura_electronica#42099,superoferta#42100,tipo_documento#42101,nacionalidad#42102,x_datos_navegacion#42103,x_datos_trafico#42104,x_cesion_datos#42105,x_user_facebook#42106,x_user_twitter#42107,marriage2hgbst_elm#42108,gender2hgbst_elm#42109,flg_robinson#42110,x_formato_factura#42111,x_idioma_factura#42112,bam_services#42113L,bam-movil_services#42114L,fbb_services#42115L,fixed_services#42116L,movil_services#42117L,prepaid_services#42118L,tv_services#42119L,tipo_sim#42120,... 1271 more fields] parquet\n"

In [33]:
predsMasMovilTest.groupby('target','prediction').count().show()

+------+----------+-------+
|target|prediction|  count|
+------+----------+-------+
|     1|       0.0|   9803|
|     0|       0.0|3343513|
|     1|       1.0|  32220|
|     0|       1.0|2841064|
+------+----------+-------+



In [34]:
predsMasMovilTest=predsMasMovilTest.orderBy('score',ascending=False)

In [None]:
printHTML(predsMasMovilTest)

In [None]:
predsMasMovilTest.limit(50000).show()

In [None]:
predsMasMovilTest= spark.read.load('/data/udf/vf_es/churn/portabPropension_model/predsMasMovilTest')

In [None]:
predsMasMovilTest=predsMasMovilTest.cache()
predsMasMovilTest.groupby('target').count().show()

In [None]:
predsMasMovilTest.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsMasMovilTest', format='parquet', mode='overwrite')

In [None]:
#Tablon de predicciones de test con los campos necesarios para luego unir con el resto de predicciones
prediccionesMasMovil=predsMasMovilTest.select('msisdn','score').withColumnRenamed('score','MasMovil').withColumnRenamed('msisdn','msisdn_MasMovil')

In [None]:
prediccionesMasMovil=prediccionesMasMovil.cache()

In [None]:
prediccionesMasMovil.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/prediccionesMasMovil', format='parquet', mode='overwrite')

In [None]:
prediccionesMasMovil

### Evalúo modelo (lift, confusion)

In [None]:
#roc curve:   https://stackoverflow.com/questions/52847408/pyspark-extract-roc-curve

In [None]:
lift = get_lift(predsMasMovilTest, 'score', 'target', 10)

for d ,l in lift:
   print str(d) + ": " + str(l)

In [None]:
feat_imp=feat_impMasMovil

features = feat_imp['name']
importances = feat_imp['score']
indices = feat_imp['idx']
plt.figure(figsize=(15, 10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
y_true = predsMasMovilTest.select("target")
y_true = y_true.toPandas()

y_pred = predsMasMovilTest.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred,labels=['0','1'])
cnf_matrix

In [None]:
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['0','1'],
                      title='Confusion matrix, without normalization')
plt.show()

## MODELO DE PORTABILIDAD A MOVISTAR

In [None]:
n=float(58755)/float(5999083)

train_movistar=training_df_transf.filter(training_df_transf['Operador_target']==2).union(training_df_noPortab_trans.sample(False, n,5))
train_movistar=train_movistar.withColumn('target',when(train_movistar['Operador_target']==2,1).otherwise(0))
train_movistar=train_movistar.drop('Operador_target')

prediction_df_movistar=prediction_df_transf.withColumn('target',when(prediction_df_transf['Operador_target']==2,1).otherwise(0))

In [None]:
train_movistar=train_movistar.cache()
train_movistar.count()

### Aplico modelo y saco tablón de predicciones

In [None]:
modelMovistar = model.fit(train_movistar)

In [None]:
predsMovistarTrain=modelMovistar.transform(train_movistar)
predsMovistarTest=modelMovistar.transform(prediction_df_movistar)

In [None]:
aucTrainMovistar = evaluator.evaluate(predsMovistarTrain)
aucTestMovistar = evaluator.evaluate(predsMovistarTest)

In [None]:
aucTrainMovistar

In [None]:
aucTestMovistar 

In [None]:
feat_impMovistar = ExtractFeatureImp(modelMovistar.featureImportances ,predsMovistarTest, "features")[0:30]
feat_impMovistar = feat_impMovistar.sort_values(by = ['score'], ascending = True)

In [None]:
predsMovistarTrain=predsMovistarTrain.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsMovistarTrain=predsMovistarTrain.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:
predsMovistarTrain.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsMovistarTrain', format='parquet', mode='overwrite')

In [None]:
predsMovistarTest=predsMovistarTest.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsMovistarTest=predsMovistarTest.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:
predsMovistarTest.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsMovistarTest', format='parquet', mode='overwrite')

In [None]:
#Tablon de predicciones preparado para luego unir con el resto 

prediccionesMovistar=predsMovistarTest.select('msisdn','score').withColumnRenamed('score','Movistar').withColumnRenamed('msisdn','msisdn_Movistar')

In [None]:

prediccionesMovistar.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/prediccionesMovistar', format='parquet', mode='overwrite')



In [None]:
prediccionesMovistar

### Evalúo modelo

In [None]:
lift = get_lift(predsMovistarTest, 'score', 'target', 10)

for d ,l in lift:
   print str(d) + ": " + str(l)

In [None]:
for d ,l in lift:
   print str(d) + ": " + str(l)

In [None]:
feat_imp=feat_impMovistar

features = feat_imp['name']
importances = feat_imp['score']
indices = feat_imp['idx']
plt.figure(figsize=(15, 10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
feat_impMovistar

## MODELO DE PORTABILIDAD A ORANGE

In [None]:
n=float(41554)/float(5999083)

train_orange=training_df_transf.filter(training_df_transf['Operador_target']==3).union(training_df_noPortab_trans.sample(False, n,5))
train_orange=train_orange.withColumn('target',when(train_orange['Operador_target']==3,1).otherwise(0))
train_orange=train_orange.drop('Operador_target')


prediction_df_orange=prediction_df_transf.withColumn('target',when(prediction_df_transf['Operador_target']==3,1).otherwise(0))

In [None]:
train_orange=train_orange.cache()
train_orange.count()

In [None]:
train_orange.filter(train_orange['target']==1).count()

In [None]:
train_orange.filter(train_orange['target']==0).count()

### Aplico modelo y saco tablón de predicciones

In [None]:
modelOrange = model.fit(train_orange)

In [None]:
predsOrangeTrain=modelOrange.transform(train_orange)
predsOrangeTest=modelOrange.transform(prediction_df_orange)

In [None]:
aucTrainOrange = evaluator.evaluate(predsOrangeTrain)
aucTestOrange = evaluator.evaluate(predsOrangeTest)

In [None]:
aucTrainOrange

In [None]:
aucTestOrange

In [None]:
feat_impOrange = ExtractFeatureImp(modelOrange.featureImportances ,predsOrangeTest, "features")[0:30]
feat_impOrange = feat_impOrange.sort_values(by = ['score'], ascending = True)

In [None]:
predsOrangeTrain=predsOrangeTrain.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsOrangeTrain=predsOrangeTrain.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:

predsOrangeTrain.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsOrangeTrain', format='parquet', mode='overwrite')

'''
'''

In [None]:
predsOrangeTest=predsOrangeTest.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsOrangeTest=predsOrangeTest.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:
'''
predsOrangeTest.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsOrangeTest', format='parquet', mode='overwrite')


'''

In [None]:
#Tablon de predicciones preparado para luego unir con el resto 

prediccionesOrange=predsOrangeTest.select('msisdn','score').withColumnRenamed('score','Orange').withColumnRenamed('msisdn','msisdn_Orange')

In [None]:
prediccionesOrange.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/prediccionesOrange', format='parquet', mode='overwrite')


### Evalúo modelo

In [None]:
lift = get_lift(predsOrangeTest, 'score', 'target', 10)

for d ,l in lift:
   print str(d) + ": " + str(l)

In [None]:
feat_imp=feat_impOrange

features = feat_imp['name']
importances = feat_imp['score']
indices = feat_imp['idx']
plt.figure(figsize=(15, 10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

## MODELO DE PORTABILIDAD A OTROS OPERADORES

In [None]:
n=float(20951)/float(5999083)

train_otros=training_df_transf.filter(training_df_transf['Operador_target']==4).union(training_df_noPortab_trans.sample(False, n,5))
train_otros=train_otros.withColumn('target',when(train_otros['Operador_target']==4,1).otherwise(0))
train_otros=train_otros.drop('Operador_target')

prediction_df_otros=prediction_df_transf.withColumn('target',when(prediction_df_transf['Operador_target']==4,1).otherwise(0))

### Aplico modelo y saco tablón de predicciones

In [None]:
modelOtros = model.fit(train_otros)

In [None]:
predsOtrosTrain=modelOtros.transform(train_otros)
predsOtrosTest=modelOtros.transform(prediction_df_otros)

In [None]:
aucTrainOtros = evaluator.evaluate(predsOtrosTrain)
aucTestOtros = evaluator.evaluate(predsOtrosTest)

In [None]:
aucTrainOtros

In [None]:
aucTestOtros

In [None]:
feat_impOtros = ExtractFeatureImp(modelOtros.featureImportances ,predsOtrosTest, "features")[0:30]
feat_impOtros = feat_impOtros.sort_values(by = ['score'], ascending = True)

In [None]:
predsOtrosTrain=predsOtrosTrain.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsOtrosTrain=predsOtrosTrain.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:

predsOtrosTrain.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsOtrosTrain', format='parquet', mode='overwrite')



In [None]:
predsOtrosTest=predsOtrosTest.withColumn("score", getScore(col("probability")).cast(DoubleType()))
predsOtrosTest=predsOtrosTest.select('msisdn','score','target','prediction') #guardo para dibujar luego matrices de confuisioon etc

In [None]:

predsOtrosTest.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsOtrosTest', format='parquet', mode='overwrite')



In [None]:
#Tablon de predicciones preparado para luego unir con el resto 

prediccionesOtros=predsOtrosTest.select('msisdn','score').withColumnRenamed('score','Otros').withColumnRenamed('msisdn','msisdn_Otros')

In [None]:

prediccionesOtros.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/prediccionesOtros', format='parquet', mode='overwrite')




In [None]:
spark.stop()

### Evalúo modelo

In [None]:
lift = get_lift(predsOtrosTest, 'score', 'target', 10)

for d ,l in lift:
   print str(d) + ": " + str(l)

In [None]:
feat_imp=feat_impOtros

features = feat_imp['name']
importances = feat_imp['score']
indices = feat_imp['idx']
plt.figure(figsize=(15, 10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

## Tablón de predicciones final: scores de cada modelo y majority vote

Con los scores obtenidos por cada modelo para los datos de predicción, creamos un tablón que nos diga el operador con mayor score para cada cliente

### Junto los tablones de predicciones 

In [15]:
prediccionesMasMovil= spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesMasMovil)
prediccionesMovistar = spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesMovistar')
prediccionesOrange = spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesOrange')
prediccionesOtros = spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesOtros')

AnalysisException: u'Path does not exist: hdfs://nameservice1/data/udf/vf_es/churn/portabPropension_model/prediccionesMasMovilTest;'

In [46]:
msisdn_target=prediccionesFinal.select('msisdn','Operador_target')

In [47]:
msisdn_target.groupby('Operador_target').count().show()

+---------------+-------+
|Operador_target|  count|
+---------------+-------+
|              1|  42023|
|              3|  39764|
|              4|  23970|
|              2|  50124|
|              0|6070719|
+---------------+-------+



In [51]:
prediccionesOtros = spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesOtros')

prediccionesOtros=prediccionesOtros.join(msisdn_target,
                                            on=(prediccionesOtros['msisdn_Otros']==msisdn_target['msisdn'])
                                                ,how='inner')

In [52]:
prediccionesOtros.groupby('Operador_target').count().show()

+---------------+-------+
|Operador_target|  count|
+---------------+-------+
|              1|  42023|
|              3|  39764|
|              4|  23970|
|              2|  50124|
|              0|6070719|
+---------------+-------+



In [57]:
prediccionesOtros=prediccionesOtros.withColumn('prediction',when(prediccionesOtros['Otros']>0.6,1).otherwise(0))

In [54]:
prediccionesOtros=prediccionesOtros.withColumn('Operador_target',when(prediccionesOtros['Operador_target']==4,1).otherwise(0))

In [58]:
prediccionesOtros.groupby('Operador_target','prediction').count().show()

+---------------+----------+-------+
|Operador_target|prediction|  count|
+---------------+----------+-------+
|              1|         0|  17640|
|              1|         1|   6330|
|              0|         0|5585276|
|              0|         1| 617354|
+---------------+----------+-------+



In [None]:
#Creamos un tablon con los scores de cada modelo:

prediccionesUnion=prediccionesMasMovil.join(prediccionesMovistar,
                                            on=(prediccionesMasMovil['msisdn_MasMovil']==prediccionesMovistar['msisdn_Movistar'])
                                                ,how='inner')

In [None]:
prediccionesUnion=prediccionesUnion.join(prediccionesOrange,
                                            on=(prediccionesUnion['msisdn_MasMovil']==prediccionesOrange['msisdn_Orange'])
                                                ,how='inner')

In [None]:
prediccionesUnion=prediccionesUnion.join(prediccionesOtros,
                                            on=(prediccionesUnion['msisdn_MasMovil']==prediccionesOtros['msisdn_Otros'])
                                                ,how='inner')

In [None]:
prediccionesUnion=prediccionesUnion.select('msisdn_MasMovil','MasMovil','Movistar','Orange','Otros')

prediccionesUnion=prediccionesUnion.withColumnRenamed('msisdn_MasMovil','msisdn')

In [None]:
prediccionesUnion.show() 

In [None]:
prediccionesUnion.cache()
prediccionesUnion.count()

### Majority vote: operador predicho para cada cliente

- Columna que, para cada registro, nos diga el mayor score: *Score_max*
- Columna que, para cada registro, nos diga el operador al que corresponde el mayor score: *Operador_predicho*
- Anterior columna pero poniendo los operadores predichos en número (para luego comparar con el valor real del target): *Operador_predicho_num*

In [None]:
cond = "psf.when" + ".when".join(["(psf.col('" + c + "') == psf.col('Score_max'), psf.lit('" + c + "'))" for c in prediccionesUnion.columns[1:5]])
import pyspark.sql.functions as psf
prediccionesFinal=prediccionesUnion.withColumn("Score_max", psf.greatest(*prediccionesUnion.columns[1:5])).withColumn("Operador_predicho", eval(cond))

In [None]:
#Añadimos columna con la prediccion en numeros, para despues comparar con el target real

prediccionesFinal = prediccionesFinal.withColumn("Operador_predicho_num",
                when(prediccionesFinal["Operador_predicho"]=='MasMovil',1).otherwise(
                    when(prediccionesFinal["Operador_predicho"]=='Movistar',2).otherwise(
                        when(prediccionesFinal["Operador_predicho"]=='Orange',3).otherwise(
                            when(prediccionesFinal["Operador_predicho"]=='Otros',4)))))                    

In [None]:
prediccionesFinal=prediccionesFinal.cache()
prediccionesFinal.show()

In [None]:
prediccionesFinal.groupby('Operador_predicho').count().show()

In [None]:
#añadimos la columna de target real del tablón de predicciones

In [None]:
msisdn_target_pred=prediction_df.select('msisdn','Operador_target')

In [None]:
prediccionesFinal=prediccionesFinal.join(msisdn_target_pred,on='msisdn',how='inner')

prediccionesFinal=prediccionesFinal.orderBy('Score_max',ascending=False)

In [None]:
prediccionesFinal.groupby('Operador_target').count().show()

In [None]:
prediccionesFinal.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/prediccionesFinal', format='parquet', mode='overwrite')

In [7]:
prediccionesFinal= spark.read.load('/data/udf/vf_es/churn/portabPropension_model/prediccionesFinal')

**Evaluamos predicciones finales**

In [None]:
#Nos quedamos con los 150000 con mayor score, ya que es la tasa que suele hacer porta

In [8]:
prediccionesFinal_limit=prediccionesFinal.limit(500000)

In [10]:
prediccionesFinal_limit.groupby('Operador_target','Operador_predicho_num').count().show()

+---------------+---------------------+------+
|Operador_target|Operador_predicho_num| count|
+---------------+---------------------+------+
|              0|                    1| 87644|
|              2|                    1|   611|
|              1|                    1|   961|
|              3|                    1|   532|
|              0|                    2|131917|
|              4|                    1|   296|
|              2|                    2|  1659|
|              1|                    2|   870|
|              0|                    4|163727|
|              0|                    3|103780|
|              3|                    2|   884|
|              4|                    2|   521|
|              2|                    4|  1185|
|              4|                    4|   894|
|              2|                    3|   651|
|              3|                    4|  1092|
|              1|                    4|  1021|
|              3|                    3|   825|
|            

In [None]:
predsMulticlaseTest=predsMulticlaseTest.select('msisdn','probability','prediction','Operador_target')

import utils_model
from utils_model import get_lift

getScore = udf(lambda prob: float(prob[0]), DoubleType())
predsMulticlaseTest=predsMulticlaseTest.withColumn("Ninguno", getScore(col("probability")).cast(DoubleType()))

getScore = udf(lambda prob: float(prob[1]), DoubleType())
predsMulticlaseTest=predsMulticlaseTest.withColumn("MasMovil", getScore(col("probability")).cast(DoubleType()))

getScore = udf(lambda prob: float(prob[2]), DoubleType())
predsMulticlaseTest=predsMulticlaseTest.withColumn("Movistar", getScore(col("probability")).cast(DoubleType()))

getScore = udf(lambda prob: float(prob[3]), DoubleType())
predsMulticlaseTest=predsMulticlaseTest.withColumn("Orange", getScore(col("probability")).cast(DoubleType()))

getScore = udf(lambda prob: float(prob[4]), DoubleType())
predsMulticlaseTest=predsMulticlaseTest.withColumn("Otros", getScore(col("probability")).cast(DoubleType()))

In [None]:
predsMulticlaseTest=predsMulticlaseTest.select('msisdn','Ninguno',"MasMovil","Movistar","Orange","Otros",'prediction','Operador_target')

In [None]:
predsMulticlaseTest.groupby('Operador_target','prediction').count().show()

In [None]:
printHTML(predsMulticlaseTest)

In [None]:
predsMulticlaseTest.repartition(300).write.save('/data/udf/vf_es/churn/portabPropension_model/predsMulticlaseTest', format='parquet', mode='overwrite')