# Migrator

This script is intended to migrate scores from specific churn tables into the output model tables.
Documentation on how to insert into these tables can be found here:
https://confluence.sp.vodafone.com/pages/viewpage.action?spaceKey=VEBDA&title=Model+Outputs

In [None]:
# Stop the Spark session currently bound to `spark`; a new session is created in a later cell.
# NOTE(review): raises NameError if no `spark` variable exists yet in this kernel.
spark.stop()

# Setting paths

In [None]:
import os, sys
import datetime as dt
# Root of the development sources, located under the directory named by the BDA_USER_HOME environment variable.
DEVEL_SRC = os.path.join(os.environ.get('BDA_USER_HOME', ''), "src", "devel")
if DEVEL_SRC not in sys.path:
    sys.path.append(DEVEL_SRC)

USECASES_SRC = os.path.join(DEVEL_SRC, "use-cases") # TODO when - is removed, remove also this line and adapt imports
if USECASES_SRC not in sys.path: 
    sys.path.append(USECASES_SRC)
    
AMDOCS_SRC = os.path.join(DEVEL_SRC, "amdocs_informational_dataset") # TODO when - is removed, remove also this line and adapt imports
if AMDOCS_SRC not in sys.path: 
    sys.path.append(AMDOCS_SRC)
    
# pykhaos is imported only after DEVEL_SRC has been appended to sys.path above.
import pykhaos.utils.custom_logger as clogger
# One timestamped log file per run, under <BDA_USER_HOME>/logging.
logging_file = os.path.join(os.environ.get('BDA_USER_HOME', ''), "logging",
                                    "out_" + dt.datetime.now().strftime("%Y%m%d_%H%M%S") + ".log")
logger = clogger.configure_logger(log_filename=logging_file, std_channel=sys.stderr, logger_name="")
logger.info("Logging to file {}".format(logging_file))    
    
#EXTERNAL_PATH = os.path.join(os.environ.get('BDA_USER_HOME', ''), "src", "devel", "pykhaos", "external_lib")
# Shared folder with external libraries, appended to the import path.
EXTERNAL_PATH = "/var/SP/data/bdpmdses/churn/lib"
if EXTERNAL_PATH not in sys.path:
    sys.path.append(EXTERNAL_PATH)

import logging
# Only warnings and above from matplotlib's logger.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

    
import pykhaos.utils.notebooks as nb


RUNNING_FROM_NOTEBOOK = nb.isnotebook()
import matplotlib.pyplot as plt
if RUNNING_FROM_NOTEBOOK:
    # IPython magics: auto-reload edited modules and render plots inline.
    %load_ext autoreload
    %autoreload 2
    %matplotlib inline  
    
    
    
#logger = my_project.logger

# NOTE(review): `my_project` is never imported in the visible code, so this branch
# would raise NameError when run outside a notebook — confirm the intended module.
if not RUNNING_FROM_NOTEBOOK:
    args = my_project.arg_parser()

In [None]:
import time
start_time = time.time()
app_name = "migrator"

import pykhaos.utils.pyspark_configuration as pyspark_config
# Create the Spark session for this notebook (1 to 10 executors, 4 cores and 16g each, 4g driver).
sc, spark, sql_context = pyspark_config.get_spark_session(app_name=app_name, log_level="OFF", min_n_executors = 1, max_n_executors = 10, n_cores = 4,
                         executor_memory = "16g", driver_memory="4g")
print("Ended spark session: {} secs | default parallelism={}".format(time.time() - start_time,
                                                                     sc.defaultParallelism))

# `display` and `HTML` are part of the public IPython.display API; importing `display`
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let the notebook cells use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from amdocs_informational_dataset.engine.call_centre_calls import CallCentreCalls
from pyspark.sql.functions import collect_set, concat, size, coalesce, col, lpad, struct, count as sql_count, lit, min as sql_min, max as sql_max, collect_list, udf, \
        desc, asc, to_date, create_map, sum as sql_sum, substring, sort_array, split, month, dayofmonth
from pyspark.sql.types import StringType, ArrayType, MapType, StructType, StructField, IntegerType, DateType
from pyspark.sql.functions import array, regexp_extract
from itertools import chain
import argparse
import csv
import re
import subprocess
import sys
import time
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.functions import concat_ws, date_format, from_unixtime, \
    length, lit, lower, lpad, month, regexp_replace, translate, udf, unix_timestamp, year, when, upper
from pyspark.sql.utils import AnalysisException
from engine.general_functions import format_date, compute_diff_days, sum_horizontal
from collections import Counter
from pyspark.sql.types import StringType
from pykhaos.utils.date_functions import get_last_day_of_month, move_date_n_days, move_date_n_cycles, move_date_n_yearmonths
from churn.analysis.ccc_churn.engine.data_loader import get_port, get_ccc_data, get_tgs, get_all_ports
from churn.datapreparation.general.data_loader import get_active_services
from churn.analysis.ccc_churn.engine.reporter import compute_results, SAVING_PATH, init_writer, print_sheet
from churn.analysis.ccc_churn.app.run_ccc_churn_analysis import join_dfs

# Select closing days to migrate

Specify the first and last cycle to migrate.
If more than one prediction is found for a closing day, the latest one is selected.

In [None]:
start_cycle = "20181114"
end_cycle = "20190314"

# Collect every cycle from start_cycle up to and including end_cycle.
# The dates are yyyymmdd strings, so plain string comparison orders them chronologically.
FOR_MIGRATE = []
ss = start_cycle
while not ss > end_cycle:
    FOR_MIGRATE += [ss]
    ss = move_date_n_cycles(date_=ss, n=1)
FOR_MIGRATE

# Migrator

In [None]:
# UDF returning the first element of an array value.
getFirstElement = udf(lambda myarray: myarray[0])

# Source table with the churn scores.
df_preds = spark.read.table("tests_es.jvmm_amdocs_automated_churn_scores")

# Source table with the model results; horizon and segment are dropped before the join.
df_model = spark.read.table("tests_es.jvmm_amdocs_churn_model_results").drop("horizon", "segment")

# Scores joined with the results of the model that produced them; cached because it is reused below.
df_complete = df_preds.join(df_model, on=["model"], how="inner").cache()


In [None]:
# Distinct (pred_name, model, segment) combinations, brought to the driver as plain tuples.
tuples_list = [
    (row[0], row[1], row[2])
    for row in df_preds.select("pred_name", "model", "segment").distinct().collect()
]

In [None]:
import pandas as pd

# Supported prediction names; group 3 is the closing day (yyyymmdd, years 2018 or 2019).
# The year digit uses the class [89]: a '|' inside a character class is a literal character.
_PRED_NAME_PATTERN = re.compile(r"(churn_preds|preds)_(mobileandfbb|onlymob)_for(201[89][0-9]{4}).*")


def _closing_day_from_pred_name(pred_name):
    """Return the closing day (yyyymmdd) embedded in ``pred_name``.

    Raises:
        ValueError: if ``pred_name`` does not follow the supported naming pattern.
    """
    match = _PRED_NAME_PATTERN.match(pred_name)
    if match is None:
        raise ValueError("Unexpected pred_name format: {!r}".format(pred_name))
    return match.group(3)


df_model_list = pd.DataFrame(tuples_list, columns=["pred_name", "model", "segment"])
df_model_list["closing_day"] = df_model_list["pred_name"].apply(_closing_day_from_pred_name)
# Keep one row per (closing_day, segment): the last one after sorting by model name.
# NOTE(review): this picks the alphabetically last model name, which is assumed to
# correspond to the latest prediction — confirm against the naming convention.
df_model_list.sort_values(by="model", ascending=True, inplace=True)
df_model_list["order"] = df_model_list.groupby(["closing_day", "segment"]).cumcount()
df_model_list.drop_duplicates(["closing_day", "segment"], inplace=True, keep="last")
# Restrict to the closing days selected for migration.
df_model_list = df_model_list[df_model_list["closing_day"].isin(FOR_MIGRATE)]
df_model_list

Extract the information for the pred_names and models stored in the df_model_list dataframe and format it for insertion into the new tables.

In [None]:
#MODEL = "prediction_tr20181214to20181214_tt20190207_horizon8_on20190214_165745"
#XXXX_MODEL_LIST = [MODEL]

# Columns, in order, written to the model parameters output.
model_param_cols = ["model_name",
      "executed_at",
      "model_level",
      "training_closing_date",
      "target",
      "model_path",
      "metrics_path",
      "metrics_train",
      "metrics_test",
      "varimp",
      "algorithm",
      "author_login",
      "extra_info",
      "scores_extra_info_headers",
      "year", "month", "day", "time"]

# Columns, in order, written to the model scores output.
model_scores_cols = ["model_name",
      "executed_at",
      "model_executed_at",
      "predict_closing_date",
      "msisdn",
      "client_id",
      "nif",
      "model_output",
      "scoring",
      "prediction",
      "extra_info"]

# Destination folders.
# Alternative (user area) locations used previously:
#   /user/csanc109/projects/model_outputs/model_param/
#   /user/csanc109/projects/model_outputs/model_scores/
MODEL_PARAMETERS_PATH = "/data/attributes/vf_es/model_outputs/model_parameters/"
MODEL_SCORES_PATH = "/data/attributes/vf_es/model_outputs/model_scores/"

# Execution date (group 2, yyyymmdd) and time (group 3, HHMMSS) embedded in a pred_name,
# e.g. preds_mobileandfbb_for20190221_on20190227_164405
_PRED_NAME_EXECUTION_RE = re.compile(r"(preds_|churn_preds_).*_on([0-9]{8})_([0-9]{6})")

# Metric columns exist once per data set, suffixed with _tr (train) or _tt (test).
_METRIC_NAMES = ["roc", "avg_score", "sd_score", "skewness_score",
                 "kurtosis_score", "min_score", "max_score"]


def _metrics_summary(suffix):
    """Return a column holding 'name=value' pairs joined by ';' for the metrics ending in ``_<suffix>``."""
    return concat_ws(";", *[concat(lit(name + "="), col(name + "_" + suffix))
                            for name in _METRIC_NAMES])


# The loop variable is deliberately not called df_model, so the Spark DataFrame of
# that name defined in an earlier cell is not overwritten.
for idx, model_row in df_model_list.iterrows():

    MODEL = model_row["model"] # e.g. prediction_tr20181207to20181207_tt20190131_horizon8_on20190206_173333
    PRED_NAME = model_row["pred_name"] # e.g. preds_mobileandfbb_for20190221_on20190227_164405
    SEGMENT = model_row["segment"]

    execution_match = _PRED_NAME_EXECUTION_RE.match(PRED_NAME)
    if execution_match is None:
        raise ValueError("Cannot extract the execution timestamp from pred_name {!r}".format(PRED_NAME))
    EXECUTED_AT_DATE = execution_match.group(2)
    EXECUTED_AT_TIME = execution_match.group(3)
    MODEL_NAME_PARTITION = "churn_preds_" + SEGMENT

    print(MODEL, PRED_NAME, SEGMENT)

    # Both outputs of this prediction go under the same partition folder.
    partition_dir = "model_name={}/year={}/month={}/day={}/time={}".format(MODEL_NAME_PARTITION,
                                                                           int(EXECUTED_AT_DATE[:4]),
                                                                           int(EXECUTED_AT_DATE[4:6]),
                                                                           int(EXECUTED_AT_DATE[6:]),
                                                                           int(EXECUTED_AT_TIME))

    # Rows of this (model, prediction) pair only.
    is_current_prediction = (col("model") == MODEL) & (col("pred_name") == PRED_NAME)

    model_param_table = (df_complete
          .where(is_current_prediction)
          .dropDuplicates()
          .withColumn("model_name", concat(lit("churn_preds_"), regexp_extract(col("pred_name"), "(preds_|churn_preds_)(mobileandfbb|onlymob)_for.*",2)))
          # The last 15 characters of pred_name are the execution timestamp yyyyMMdd_HHmmss.
          .withColumn("executed_at", from_unixtime(unix_timestamp( substring(col("pred_name"), -15, 15), "yyyyMMdd_HHmmss")))
          .withColumn("model_level", lit("service"))
          .withColumn("training_closing_date", regexp_extract(col("model"), "^prediction_tr([0-9]{8}to[0-9]{8}).*", 1))
          #.withColumn("target", array(col("target")))
          .withColumnRenamed("path", "model_path")
          .withColumn("metrics_path", lit("-"))
          .withColumn("metrics_train", _metrics_summary("tr"))
          .withColumn("metrics_test", _metrics_summary("tt"))
          .withColumn("varimp", lit("-"))
          .withColumnRenamed("alg", "algorithm")
          .withColumn("author_login", lit("jmarcoso"))
          .withColumn("extra_info", concat_ws(";",
                                              concat(lit("horizon="), col("horizon")),
                                              concat(lit("input_dim="), col("input_dim"))))
          .withColumn("scores_extra_info_headers", lit("-"))
          .withColumn("year", year('executed_at'))
          .withColumn("month", month('executed_at'))
          .withColumn("day", dayofmonth('executed_at'))
          .withColumn("time", regexp_replace(split(col("executed_at"), " ")[1], ":", ""))
          .select(*model_param_cols)
          .dropDuplicates())

    # Filtered by model AND pred_name, like the parameters table: a model used for several
    # predictions must not have all of its scores written under this prediction's partition.
    model_output_table = (df_complete
          .filter(is_current_prediction)
          .withColumnRenamed("pred_name", "model_name")
          # Normalise the name so it always starts with "churn_".
          .withColumn("model_name", when(col("model_name").rlike("^churn_"), col("model_name")).otherwise(concat(lit("churn_"), col("model_name"))))
          .withColumn("executed_at", from_unixtime(unix_timestamp( substring(col("model_name"), -15, 15), "yyyyMMdd_HHmmss")))
          .withColumnRenamed("date", "model_executed_at")
          .withColumn("model_executed_at",  from_unixtime(unix_timestamp( col("model_executed_at"), "yyyyMMdd_HHmmss"))  )
          .withColumnRenamed("test", "predict_closing_date")
          .withColumnRenamed("num_cliente", "client_id")
          .withColumnRenamed("nif_cliente", "nif")
          .withColumnRenamed("model_score", "scoring")
          .withColumn("model_output", array(col("scoring")))
          .withColumn("prediction", lit("-"))
          .withColumn("extra_info", lit("-"))
          .select(*model_scores_cols))

    # NOTE: "append" mode means that re-running the migration for the same prediction
    # adds the rows again to the same partition folder.
    (model_param_table
          .write
          .mode("append")
          .format("parquet")
          .save(MODEL_PARAMETERS_PATH + partition_dir))
    (model_output_table
          .write
          .mode("append")
          .format("parquet")
          .save(MODEL_SCORES_PATH + partition_dir))
print("[Info] Output model migration completed!!!")


## Checks

In [None]:

 
# Locations written by the migrator, one sub-folder per segment.
_SCORES_SEGMENT_PATH = "/data/attributes/vf_es/model_outputs/model_scores/model_name=churn_preds_{}"
_PARAMS_SEGMENT_PATH = "/data/attributes/vf_es/model_outputs/model_parameters/model_name=churn_preds_{}"


def _segment_summary(segment):
    """Return (training_closing_date, predict_closing_date, <segment>) for one segment.

    The ``<segment>`` column holds the number of score rows per execution. Scores and
    parameters are matched on ``executed_at`` with an outer join, so an execution that
    is missing on either side still shows up (with nulls).
    """
    df_scores = (spark.read.load(_SCORES_SEGMENT_PATH.format(segment))
                 .select("executed_at", "predict_closing_date")
                 .groupby("executed_at", "predict_closing_date")
                 .agg(sql_count("*").alias(segment)))
    df_params = (spark.read.load(_PARAMS_SEGMENT_PATH.format(segment))
                 .select("executed_at", "training_closing_date"))
    # The count column is kept in the output so the final comparison table shows volumes.
    return (df_scores.join(df_params, on=["executed_at"], how="outer")
            .select("training_closing_date", "predict_closing_date", segment))


df_onlymob = _segment_summary("onlymob")
df_mobandfbb = _segment_summary("mobileandfbb")
df_others = _segment_summary("others")
df_fbb = _segment_summary("fbb")

# One row per (predict_closing_date, training_closing_date) with one count column per segment.
df_all = df_onlymob.join(df_mobandfbb, on=["predict_closing_date", "training_closing_date"], how="outer")
df_all = df_all.join(df_others, on=["predict_closing_date", "training_closing_date"], how="outer")
df_all = df_all.join(df_fbb, on=["predict_closing_date", "training_closing_date"], how="outer")


df_all.sort(desc("predict_closing_date"), desc("training_closing_date")).show()



In [None]:
# NOTE(review): `target` is not defined in any visible cell — looks like leftover scratch; confirm before running.
target.columns


In [None]:
from churn_nrt.src.data.sopos_dxs import FixPort
from churn_nrt.src.data.customer_base import CustomerBase

closing_day, churn_window = "20191022", 15

# Getting portout requests for fix and mobile services, and disconnections of fbb services
print("******* Asking for FixPort...")
df_sopo_fix = FixPort(spark).get_module(closing_day, save=False, churn_window=churn_window)

# The base of active services on closing_day
from churn.analysis.triggers.base_utils.base_utils import get_mobile_portout_requests, get_customer_base
base_df = get_customer_base(spark, closing_day).select('msisdn', "nif_cliente", "rgu")

# Left join keeps every service of the base; services without a FixPort row get label_srv = 0.0.
df_sopos = base_df.join(df_sopo_fix, ['msisdn'], "left").na.fill({'label_srv': 0.0})

# Number of distinct rgu values among the services with label_srv == 1.0.
labelled_by_rgu = df_sopos.where(col("label_srv") == 1.0).groupby("rgu").agg(sql_count("*").alias("count"))
labelled_by_rgu.count()

In [None]:
# Number of services with label_srv == 1.0, broken down by rgu.
(df_sopos
 .where(col("label_srv") == 1.0)
 .groupby("rgu")
 .agg(sql_count("*").alias("count"))
 .show())

In [None]:
# Show the msisdn of services with label_srv == 1.0.
df_sopos.where(col("label_srv")==1.0).select("msisdn").show()

In [None]:
#spark.read.load("/data/udf/vf_es/churn/triggers/nav_comp_tests_all_labels/year=2019/month=10/day=30/time=1574941173").groupby("rgu").agg(sql_count("*").alias("count")).show()



# Row count per segment_nif in the navcomp msisdn data stored under year=2019/month=11/day=14.
spark.read.load("/data/udf/vf_es/churn/triggers/navcomp_msisdn_data/year=2019/month=11/day=14/").groupby("segment_nif").agg(sql_count("*").alias("count")).show()


In [None]:
# Show msisdns whose segment_nif is "Standalone_FBB" in the data stored under year=2019/month=11/day=28.
spark.read.load("/data/udf/vf_es/churn/triggers/navcomp_msisdn_data/year=2019/month=11/day=28/").where(col("segment_nif")=="Standalone_FBB").select("msisdn").show()

In [None]:
# Imported here so this cell does not depend on a later cell having been run first.
from churn.analysis.triggers.orders.customer_master import get_customer_master

df = get_customer_master(spark, "20191128", unlabeled=True)


#nifs_standalone_fbb = df.where(col("segment_nif")=="Standalone_FBB").sort(desc("nif_cliente")).select("nif_cliente", "msisdn", "rgu", "segment_nif", "rgus_list").select("nif_cliente").rdd.map(lambda x: x[0]).collect()

# Mobile services that belong to a NIF classified as Standalone_FBB.
df.where(col("segment_nif")=="Standalone_FBB").where(col("rgu")=="mobile").select("nif_cliente", "msisdn", "rgu", "segment_nif", "rgus_list").show()




In [None]:
from churn.analysis.triggers.orders.customer_master import get_customer_master

# Look up three specific msisdns in the unlabeled customer master requested for "20191114".
msisdns_to_check = ["638079600", "649160250", "612735706"]
(get_customer_master(spark, "20191114", unlabeled=True)
 .where(col("msisdn").isin(msisdns_to_check))
 .show())


In [None]:
# NOTE(review): `customer_tr_df` is not defined in any visible cell — confirm where it comes from.
customer_tr_df.columns

In [None]:
from churn.analysis.triggers.orders.customer_master import get_segment_nif_anyday

# NOTE(review): `date_` is only assigned in a later cell and `nif_cliente_raros` is not
# defined in any visible cell — both must exist before this cell runs.
# NOTE(review): `nif_cliente_raros` is wrapped in a list before being passed to isin();
# if it is already a list of NIFs this is probably not what was intended — confirm.
df_segment_nif_anyday = get_segment_nif_anyday(spark, date_).where(col("nif_cliente").isin([nif_cliente_raros]))  
# Show up to 100 rows without truncating the column values.
df_segment_nif_anyday.select("nif_cliente", "segment_nif", "nb_rgus", "rgus_list").show(100, truncate=False)

In [None]:
from churn.analysis.triggers.orders.customer_master import get_customer_master
# Customer master rows for the NIFs listed in `nif_error`.
# NOTE(review): `nif_error` is not defined in any visible cell, and `date_` is assigned in a later cell.
df_customer_master_module = get_customer_master(spark, date_, unlabeled=True).where(col("nif_cliente").isin(nif_error))  
df_customer_master_module.select("nif_cliente", "segment_nif", "nb_rgus", "rgus_list").show(100, truncate=False)

In [None]:
from churn.analysis.triggers.orders.customer_master import get_customer_master_module
# Same check as with get_customer_master, but through get_customer_master_module (called with save=False).
# NOTE(review): `nif_error` is not defined in any visible cell, and `date_` is assigned in a later cell.
df_customer_master_module = get_customer_master_module(spark, date_, unlabeled=True, save=False).where(col("nif_cliente").isin(nif_error))  
df_customer_master_module.select("nif_cliente", "segment_nif", "nb_rgus", "rgus_list").show(100, truncate=False)

# BASE COMPARISON

In [None]:
from churn_nrt.src.data.customer_base import CustomerBase
# Rows with rgu == "mobile" in the CustomerBase module requested for "20190930" (not saved).
df_nrt_cust_base = CustomerBase(spark).get_module("20190930", save=False).where(col("rgu")=="mobile")
df_nrt_cust_base.count()

In [None]:
# Service data stored under year=2019/month=9/day=30.
df_serv = spark.read.load("/data/udf/vf_es/amdocs_ids/service/year=2019/month=9/day=30")
# List the columns whose name contains "bas" (case-insensitive).
[col_ for col_ in df_serv.columns if "bas" in col_.lower()]

In [None]:
# Row count per SRV_BASIC value for rows with rgu == "mobile", most frequent first.
df_serv.where(col("rgu")=="mobile").groupby("SRV_BASIC").agg(sql_count("*").alias("count")).sort(desc("count")).show()

In [None]:
# Data stored in the model_scores output under model_name=mobile_base for year=2019/month=9/day=30.
df_bd_anon = spark.read.load("/data/attributes/vf_es/model_outputs/model_scores/model_name=mobile_base/year=2019/month=9/day=30") #"/user/csanc109/data/mobile_base_20190930_parquet/")
df_bd_anon.count()

In [None]:
# Pipe-separated file without a header row. The CSV reader option is spelled "delimiter";
# Spark ignores option names it does not recognise, which would leave the default ","
# separator in place.
df_bd = spark.read.option("delimiter", "|").option("header",False).csv("/user/csanc109/data/mobile_base_20190930_desanonim/Mobile_Base-20190930.csv")
df_bd.count()

In [None]:

# Column names assigned positionally to df_bd.
headers = [
    'executed_at', 'model_executed_at', 'predict_closing_date',
    'msisdn', 'client_id', 'nif',
    'model_output', 'scoring', 'prediction',
    'extra_info', 'time',
]
df_bd = df_bd.toDF(*headers)

In [None]:
from pyspark.sql.functions import substring_index, posexplode, split

# Names, in order, given to the ';'-separated fields of the extra_info column.
EXTRA_INFO_COLS = ['srv_basic', 'rgu', 'CLASE_CLI_COD_CLASE_CLIENTE', 'COD_ESTADO_GENERAL', 'TARIFF']

# Split extra_info once and expose each position as its own column.
extra_info_fields = split("extra_info", ";")
for ii, col_ in enumerate(EXTRA_INFO_COLS):
    df_bd = df_bd.withColumn(col_, extra_info_fields[ii])


#df_bd.show()

In [None]:
# NOTE(review): `df_insi` is only created in a later cell; this cell depends on it having been run first.
df_insi.columns

In [None]:
# Write df_insi from a single partition as pipe-separated CSV with a header, overwriting the target folder.
# NOTE(review): `df_insi` is only created in a later cell; this cell depends on it having been run first.
df_insi.coalesce(1).write.mode('overwrite').format('csv').option('sep', '|').option('header', 'true').save("/user/csanc109/data/mobile_base_20190930_insigths_campos/")

In [None]:
# List the columns of df_insi (created in a later cell).
df_insi.columns

In [None]:
# Pipe-separated CSV with a header row.
df_insi = (spark.read
           .option("delimiter", "|")
           .option("header", True)
           .csv("/user/csanc109/data/mobile_base_20190930_insigths/mobile_base_20190930_insights.csv"))
# for col_ in df_insi.columns:
#     if col_ == "MSISDN":
#         df_insi = df_insi.withColumnRenamed(col_, col_.lower())
#     else:
#         df_insi = df_insi.withColumnRenamed(col_, col_.lower()+"_insi")

# df_insi.count()
# Copies of the three identifier columns under the names campo1..campo3.
df_insi = (df_insi
           .withColumn("campo1", col("msisdn"))
           .withColumn("campo2", col("num_cliente"))
           .withColumn("campo3", col("nif_cliente")))
df_insi.columns

In [None]:
# Difference in number of rows between df_insi and df_bd.
df_insi.count() - df_bd.count()

### En BD y no en Insi

In [None]:
# Rows of df_bd with no matching msisdn in df_insi: right join on msisdn, keeping
# the rows where df_insi's msisdn is null.
# NOTE(review): after a join on a list of column names only one msisdn column is kept;
# confirm that df_insi['msisdn'] still resolves to the left-hand side here. A
# "left_anti" join would express the same intent directly.
df_labels_cross = df_insi.join(df_bd, ['msisdn'], 'right').where(df_insi['msisdn'].isNull())
df_labels_cross = df_labels_cross.cache()
print("volumen incremental", df_labels_cross.count())

In [None]:
# Show the msisdn values of df_labels_cross.
df_labels_cross.select("msisdn").show()

### En Insi y no en BD

In [None]:
# Rows of df_insi with no matching msisdn in df_bd: right join on msisdn, keeping
# the rows where df_bd's msisdn is null.
# NOTE(review): confirm that df_bd['msisdn'] still resolves to the left-hand side after
# the join on a list of column names; a "left_anti" join would express this directly.
df_labels_cross2 = df_bd.join(df_insi, ['msisdn'], 'right').where(df_bd['msisdn'].isNull())
df_labels_cross2 = df_labels_cross2.cache()
print("volumen incremental", df_labels_cross2.count())

In [None]:
# msisdns present in both df_insi and df_bd (inner join), cached and counted.
df_labels_cross3 = df_insi.join(df_bd, ['msisdn'], 'inner')
df_labels_cross3 = df_labels_cross3.cache()
print("volumen comun", df_labels_cross3.count())

In [None]:
# Row count of df_labels_cross per (CLASE_CLI_COD_CLASE_CLIENTE, COD_ESTADO_GENERAL), most frequent first.
df_labels_cross.groupby("CLASE_CLI_COD_CLASE_CLIENTE", 'COD_ESTADO_GENERAL').agg(sql_count("*").alias("count")).sort(desc("count")).show()

In [None]:
# Row count of df_labels_cross2 per (serv_basico_insi, tarifa_insi), most frequent first.
# NOTE(review): the "_insi" suffix matches the column renaming that is commented out in the
# cell that loads df_insi — confirm these columns exist.
df_labels_cross2.groupby('serv_basico_insi','tarifa_insi',).agg(sql_count("*").alias("count")).sort(desc("count")).show()

# navcomp exploration

In [None]:
# Load two daily partitions (year=2019/month=4/day=7 and year=2019/month=10/day=7) of the same dataset.
df_pasoprevio = spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=4/day=7")
df_pasoprevio.columns
df_pasoprevio2 = spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=10/day=7")
df_pasoprevio2.columns

# Symmetric difference: columns present in only one of the two partitions.
set(df_pasoprevio.columns) ^ set(df_pasoprevio2.columns)

In [None]:
# Take 200 rows of the data_navigation partition and collect their distinct subscriber_msisdn values.
df_data_limit200 = spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=7").limit(200)
msisdns = list(set(df_data_limit200.select("subscriber_msisdn").rdd.map(lambda x: x['subscriber_msisdn']).collect()))
#msisdns

In [None]:
# Distinct application names in the data_navigation partition. The de-duplication is done
# by Spark (distinct) so that only the unique values are brought to the driver.
apps_names = [
    row['application_name']
    for row in (spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=7")
                .select("application_name")
                .distinct()
                .collect())
]
apps_names



In [None]:
# Rows of two specific msisdns in the netscout daily data, restricted to the applications in
# apps_names and to rows whose upload + download byte counts exceed 524288 (512 * 1024).
(spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=10/day=7")
 .where(col("subscriber_msisdn").isin(["34683771231", "34695709335"]))
 .where( (col("SUM_userplane_upload_bytes_count") + col("SUM_userplane_download_bytes_count"))> 524288)
 .where(col("application_name").isin(apps_names)).sort(desc("subscriber_msisdn")).show())


In [None]:
# Same two msisdns and application filter, read from the data_navigation output instead.
(spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=7")
 .where(col("subscriber_msisdn").isin(["34683771231", "34695709335"]))
 .where(col("application_name").isin(apps_names)).sort(desc("subscriber_msisdn")).show())

In [None]:
# All data_navigation rows of the same two msisdns, without the application filter.
spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=7").where(col("subscriber_msisdn").isin(["34683771231", "34695709335"])).show()



In [None]:
# Application names follow the pattern WEB_<OPERATOR>_<PROTOCOL>, one per operator and protocol.
operators = ["JAZZTEL", "LOWI", "MASMOVIL", "MOVISTAR", "O2", "ORANGE", "PEPEPHONE", "VODAFONE", "YOIGO"]
apps_names = [
    "WEB_{}_{}".format(op, p)
    for op in operators
    for p in ("HTTP", "HTTPS")
]
apps_names


# apps_names = [u'WEB_O2_HTTPS',
#  u'WEB_MOVISTAR_HTTPS',
#  u'WEB_VODAFONE_HTTPS',
#  u'WEB_YOIGO_HTTP',
#  u'WEB_LOWI_HTTPS',
#  u'WEB_ORANGE_HTTP',
#  u'WEB_JAZZTEL_HTTP',
#  u'WEB_MASMOVIL_HTTP',
#  u'WEB_VODAFONE_HTTP',
#  u'WEB_MOVISTAR_HTTP',
#  u'WEB_ORANGE_HTTPS',
#  u'WEB_PEPEPHONE_HTTPS',
#  u'WEB_PEPEPHONE_HTTP',
#  u'WEB_JAZZTEL_HTTPS',
#  u'WEB_LOWI_HTTP',
#  u'WEB_MASMOVIL_HTTPS',
#  u'WEB_YOIGO_HTTPS',
#  u'WEB_O2_HTTP']

In [None]:
# Number of netscout rows (day=17) for the applications in apps_names with a non-null msisdn
# and more than 524288 bytes of upload + download.
(spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=10/day=17").where(col("application_name").isin(apps_names)).where(col("subscriber_msisdn").isNotNull())
           .withColumn("data",col("SUM_userplane_upload_bytes_count")+col("SUM_userplane_download_bytes_count"))
           .where(col("data")>lit(524288))
           .count())

In [None]:
# Same count, but for the rows whose subscriber_msisdn is null.
(spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=10/day=17").where(col("application_name").isin(apps_names))
           .withColumn("data",col("SUM_userplane_upload_bytes_count")+col("SUM_userplane_download_bytes_count"))
           .where(col("data")>lit(524288)).where(col("subscriber_msisdn").isNull()).count())

In [None]:
# Number of data_navigation rows (day=17) for the applications in apps_names.
(spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=17")
           .where(col("application_name").isin(apps_names)).count())

In [None]:
# Number of entries currently held in `msisdns`.
len(msisdns)

In [None]:
# Total upload + download bytes in the netscout data (day=7) for the applications in apps_names,
# counting only rows with a non-null msisdn and more than 524288 bytes.
(spark.read.load("/data/udf/vf_es/netscout/dailyMSISDNApplicationName/year=2019/month=10/day=7")
 .where(col("application_name").isin(apps_names) & col("subscriber_msisdn").isNotNull())
 .withColumn("data", col("SUM_userplane_upload_bytes_count") + col("SUM_userplane_download_bytes_count"))
 .where(col("data") > lit(524288))
 .agg(sql_sum("data"))
 .show(100))

In [None]:
# Total upload + download bytes over the whole data_navigation partition (day=7).
(spark.read.load("/data/attributes/vf_es/return_feed/data_navigation/year=2019/month=10/day=7")
 .withColumn("data", col("SUM_userplane_upload_bytes_count") + col("SUM_userplane_download_bytes_count"))
 .agg(sql_sum("data"))
 .show(100))


In [None]:
from churn.analysis.triggers.navcomp.navcomp_utils import get_navcomp_attributes

starting_date = "20191001"
process_date = "20191015"

# Same call twice, differing only in orig_path, so the two results can be compared below.
df_orig = get_navcomp_attributes(spark, starting_date, process_date, level="msisdn", suffix="", orig_path=True)
df_new = get_navcomp_attributes(spark, starting_date, process_date, level="msisdn", suffix="", orig_path=False)


msisdns = [u'34602579459',
 u'34699265865',
 u'34637118046',
 u'34643541635',
 u'34667016219',
 u'34603522398',
 u'34612028463',
 u'34602172915',
 u'34638122635',
 u'34612661085',
 u'34621389630',
 u'34626735798',
 u'34683771231',
 u'34664804027',
 u'34612610069',
 u'34639483976',
 u'34652601172',
 u'34609921178',
 u'34661956702',
 u'882393230430821',
 u'34639513800',
 u'34603357360',
 u'34652976638',
 u'34653975428',
 u'34612139344',
 u'34657565248',
 u'34656504103',
 u'34615956180',
 u'34608183564',
 u'34658555004',
 u'34650548809',
 u'34603997436',
 u'34694917249',
 u'34655457739',
 u'34679067766',
 u'34655551506',
 u'4534296369',
 u'34688510057',
 u'34692843710',
 u'34615977864',
 u'34694252365',
 u'34750363217',
 u'34621848425',
 u'34647631129',
 u'34661980383',
 u'34698409737',
 u'34655950914',
 u'34670819047',
 u'34602342056',
 u'34672405851',
 u'34697093354',
 u'34676843005',
 u'34607755410',
 u'34643746223',
 u'34624075045',
 u'34664164957',
 u'34640055967',
 u'34699436991',
 u'34615040780',
 u'34649505208',
 u'34630228529',
 u'34641787568',
 u'34695261294',
 u'34650852676',
 u'34651786844',
 u'34615406433',
 u'34661414103',
 u'34635496504',
 u'34650886571',
 u'34684878681',
 u'34679830577',
 u'34624664265',
 u'34686333247',
 u'34619574709',
 u'34661626345',
 u'34698930485',
 u'34643290334',
 u'34607224467',
 u'34611273565',
 u'34659755182',
 u'34626409295',
 u'34625087446',
 u'34697793869',
 u'34620566839',
 u'34647541941',
 u'34605096525',
 u'34650504179',
 u'34625289088',
 u'34683210998',
 u'34656319159',
 u'34630720603',
 u'447741382752',
 u'34683152899',
 u'34602929199',
 u'34692822948',
 u'34622813172',
 u'34670572823',
 u'34789958445',
 u'34688965869',
 u'34680347447',
 u'34664508482',
 u'491789031376',
 u'34609848759',
 u'34627904079',
 u'34655266152',
 u'34610455798',
 u'34695261579',
 u'34682884563',
 u'34666763747',
 u'34604753014',
 u'34682473201',
 u'34631727443',
 u'34610441363',
 u'882396873366857',
 u'34636544644',
 u'34679518709',
 u'34649438836',
 u'34654761792',
 u'34640763250',
 u'34693958858',
 u'34634959019',
 u'34642901478',
 u'34606813261',
 u'34635773177',
 u'34667060855',
 u'34628330469',
 u'34627906966',
 u'34609348547',
 u'34699593370',
 u'34601186557',
 u'34684669501',
 u'34693123558',
 u'34653037531',
 u'34679250802',
 u'34688200353',
 u'34611425181',
 u'34696064244',
 u'34676455684',
 u'34693045687',
 u'34683627907',
 u'34645462158',
 u'34601143234',
 u'34648188487',
 u'34629999175',
 u'34625299080',
 u'34693494584',
 u'491795499178',
 u'34692124140',
 u'34683818952',
 u'34600503065',
 u'34611822026',
 u'34613046218',
 u'34699581645',
 u'34664571843',
 u'34662072310',
 u'34656348015',
 u'34616934022',
 u'34636099471',
 u'34666271584',
 u'34670942078',
 u'34670472204',
 u'34663565093',
 u'34667039830',
 u'34688385443',
 u'34640854236',
 u'34609934152',
 u'34628964902',
 u'34620534832',
 u'34619282211',
 u'34640265051',
 u'34632300147',
 u'34691150155',
 u'34681535231',
 u'34693190094',
 u'34644285982',
 u'34620270626',
 u'34654243256',
 u'34662901998',
 u'34600732856',
 u'34656334691',
 u'34658894953',
 u'34610703900',
 u'34601426823',
 u'34601761332',
 u'34675069520',
 u'34654207810',
 u'34694845333',
 u'34613742893',
 u'34600038285',
 u'34606950109',
 u'34624679794',
 u'34637879188',
 u'34689218659',
 u'34643243869',
 u'34601168324',
 u'34669574220',
 u'34648207731',
 u'34613234875',
 u'34683590410',
 u'34634399909']


# Strip a leading "34" from each msisdn (presumably the country prefix — confirm) so the
# values match the format of the "msisdn" column.
msisdns = [re.sub(r'^34', '', mm) for mm in msisdns]

# Restrict both results to the sampled msisdns.
df_orig = df_orig.where(col("msisdn").isin(msisdns))
df_new = df_new.where(col("msisdn").isin(msisdns))

# show() returns None, so its result is not assigned: df_orig and df_new stay usable
# as DataFrames after this cell.
df_orig.show(200, truncate=False)
df_new.show(200, truncate=False)



In [None]:
# NOTE(review): `data_service_param` is only created in a later cell; run that cell first.
data_service_param.count()

In [None]:
# Total rows vs. distinct rows of df_table (created in a later cell).
print(df_table.count(), df_table.distinct().count())

In [None]:
# Total rows vs. distinct rows of data_service_param (created in a later cell).
print(data_service_param.count(), data_service_param.distinct().count())

In [None]:
# Show up to 50 rows of the WEBSERVICES table whose COD_SERVICIO is "MRPD1".
spark.read.load("/data/raw/vf_es/customerprofilecar/WEBSERVICES/1.0/parquet").where(col("COD_SERVICIO")=="MRPD1").show(50)

In [None]:
LOC_RT_PATH = '/data/udf/vf_es/ref_tables/amdocs_ids/'
LOC_RT_PARAM_OW_SERVICES = LOC_RT_PATH + 'PARAM_OW_SERVICES.TXT'
# Tab-separated reference file with header; distinct (RGU, COD_SERVICIO, TIPO) combinations.
data_service_param =  spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true',delimiter='\t').load(LOC_RT_PARAM_OW_SERVICES).select("RGU", "COD_SERVICIO", "TIPO").distinct()
# Distinct (COD_SERVICIO, RGU, AMBITO) combinations of the WEBSERVICES table; RGU is renamed
# to RGU_table so that it does not clash with the RGU column of the reference file.
df_table = spark.read.load("/data/raw/vf_es/customerprofilecar/WEBSERVICES/1.0/parquet").select("COD_SERVICIO", "RGU", "AMBITO").withColumnRenamed("RGU", "RGU_table").distinct()
# Map three of the long RGU labels to short names; any other value is kept unchanged.
df_table = df_table.withColumn("RGU_table", when(col("RGU_table")=="F - Internet (BA)", "fbb").when(col("RGU_table")=="F - Televisión (TV)", "tv").when(col("RGU_table")=="M - Móvil (MV)", "mobile").otherwise(col("RGU_table")))

# Outer join, so service codes present on only one side are listed too.
# NOTE: this rebinds `df_all`, which an earlier cell also assigns.
df_all = df_table.join(data_service_param, on=["COD_SERVICIO"], how="outer")

df_all.show(2000)

In [None]:
from churn.analysis.triggers.navcomp.navcomp_utils import get_customer_base_navcomp
date_  = "20191015"
# Customer base as returned by the navcomp helper for date_.
df_navcomp = get_customer_base_navcomp(spark, date_, verbose=False)

from churn_nrt.src.data.customer_base import CustomerBase
# Rows with rgu == 'mobile' in the CustomerBase module for the same date, for comparison.
df_base_msisdn = CustomerBase(spark).get_module(date_).filter(col('rgu') == 'mobile')




In [None]:
# Number of rows in the navcomp customer base.
df_navcomp.count()

In [None]:
# Number of mobile rows in the CustomerBase module.
df_base_msisdn.count()

In [None]:
# Number of distinct nif_cliente values whose segment_nif is "Other", "Convergent" or "Mobile_only".
df_base_msisdn.filter((col('segment_nif').isin("Other", "Convergent", "Mobile_only")) & (col("segment_nif").isNotNull())).drop_duplicates(["nif_cliente"]).count()


In [None]:
# Keep one row per msisdn.
df_base_msisdn = df_base_msisdn.drop_duplicates(["msisdn"])

# After the de-duplication by msisdn above, every (msisdn, nif_cliente, num_cliente)
# combination is already unique, so this second call cannot remove further rows.
df_base_msisdn = df_base_msisdn.drop_duplicates(['msisdn', 'nif_cliente', 'num_cliente'])
df_base_msisdn.count()

In [None]:
# Keep one row per msisdn.
df_navcomp = df_navcomp.drop_duplicates(["msisdn"])

# After the de-duplication by msisdn above, every (msisdn, nif_cliente) combination
# is already unique, so this second call cannot remove further rows.
df_navcomp = df_navcomp.drop_duplicates(['msisdn', 'nif_cliente'])
df_navcomp.count()