# Vodafone COPS. Model designing & training. Distributed version using Spark

In [None]:
# Spark Initialization

from common.src.main.python.utils.hdfs_generic import *
import os
# Sizing and scheduling settings for the Spark application.
# NOTE(review): N_CORES_EXECUTOR, N_CORES_DRIVER, MEMORY_OVERHEAD and
# BDA_CORE_VERSION are defined here but are not passed to any option below.
MAX_N_EXECUTORS = 15
MIN_N_EXECUTORS = 1
N_CORES_EXECUTOR = 4
EXECUTOR_IDLE_MAX_TIME = 120
EXECUTOR_MEMORY = '32g'
DRIVER_MEMORY = '32g'
N_CORES_DRIVER = 1
MEMORY_OVERHEAD = N_CORES_EXECUTOR * 2048
QUEUE = "root.datascience.normal"
BDA_CORE_VERSION = "1.0.0"

BDA_ENV = os.environ.get('BDA_USER_HOME', '')

# Attach bda-core-ra codebase: one nodes*.properties file per suffix, all
# rooted at BDA_ENV and passed as a single comma-separated --files value.
_properties_files = ",".join(
    "{}/scripts/properties/red_agent/nodes{}.properties".format(BDA_ENV, suffix)
    for suffix in ("", "-de", "-es", "-ie", "-it", "-pt", "-uk")
)

# Every fragment carries its own leading space, so a plain concatenation
# onto whatever SPARK_COMMON_OPTS already holds in the environment is enough.
_option_fragments = [
    os.environ.get('SPARK_COMMON_OPTS', ''),
    " --executor-memory %s --driver-memory %s" % (EXECUTOR_MEMORY, DRIVER_MEMORY),
    " --conf spark.shuffle.manager=tungsten-sort",
    " --queue %s" % QUEUE,
    # Dynamic allocation configuration
    " --conf spark.dynamicAllocation.enabled=true",
    " --conf spark.shuffle.service.enabled=true",
    " --conf spark.dynamicAllocation.maxExecutors=%s" % (MAX_N_EXECUTORS),
    " --conf spark.dynamicAllocation.minExecutors=%s" % (MIN_N_EXECUTORS),
    " --conf spark.dynamicAllocation.executorIdleTimeout=%s" % (EXECUTOR_IDLE_MAX_TIME),
    " --files " + _properties_files,
]
SPARK_COMMON_OPTS = "".join(_option_fragments)

# Publish the options through the environment before any Spark context is
# created below.
os.environ["SPARK_COMMON_OPTS"] = SPARK_COMMON_OPTS
os.environ["PYSPARK_SUBMIT_ARGS"] = "%s pyspark-shell " % SPARK_COMMON_OPTS

sc, sparkSession, sqlContext = run_sc()

# Session handle used by the rest of the notebook (getOrCreate may hand back
# an already existing session).
_session_builder = SparkSession.builder.appName("Vodafone COPS").master("yarn")
_session_builder = _session_builder.config("spark.submit.deployMode", "client")
_session_builder = _session_builder.config("spark.ui.showConsoleProgress", "true")
spark = _session_builder.enableHiveSupport().getOrCreate()

# Imports

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import logging
import json
import random
from IPython.display import display
from sklearn.metrics import roc_curve, auc, confusion_matrix
from itertools import chain
from pyspark import keyword_only
from pyspark.ml import Pipeline, PipelineModel, Transformer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import mean, desc, regexp_replace, udf
from pyspark.sql.types import StringType
from __future__ import print_function, division

# Root logger: lower its threshold to INFO so the logging.info(...) calls
# used throughout the notebook are not filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# IPython magics: draw matplotlib figures inline in the notebook and
# autosave the notebook every 60 seconds.
%matplotlib inline
%autosave 60

# Global matplotlib defaults: figure size plus title, axis-label and
# tick-label font sizes.
plt.rcParams['figure.figsize'] = 20, 10
plt.rcParams['axes.titlesize'] = 24
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

## General utilities

In [None]:
# Utilities

def get_mean(sparkdf, colname):
    """
    Calculate the mean of a Spark DataFrame column and return it.

    :param sparkdf: Spark DataFrame that contains the column.
    :param colname: name of the column to average.
    :return: the single aggregated value of the result row.
        NOTE(review): presumably None when the column has no non-null
        values -- confirm against the Spark version in use.
    """
    logging.info("Computing mean of column %s", colname)
    mean_row = sparkdf.select(mean(sparkdf[colname])).first()
    # The aggregation yields exactly one column; read it by position so the
    # lookup does not depend on how Spark names the generated column.
    return mean_row[0]

def load_train_params(path='./train_params.json'):
    """
    Loads training params json file from disk.

    :param path: location of the json file; defaults to the file written by
        save_train_params in the current working directory.
    :return: the decoded json content.
    :raises IOError: if the file cannot be opened.
    :raises ValueError: if the file does not contain valid json.
    """
    logging.info("Loading training params...")
    with open(path, 'r') as json_file:
        return json.load(json_file)

def save_train_params(train_params_dict, path='./train_params.json'):
    """
    Saves training params to json file in disk.

    :param train_params_dict: json-serialisable dictionary of parameters.
    :param path: destination file; defaults to the file read by
        load_train_params in the current working directory. An existing
        file is overwritten.
    """
    logging.info("Saving training params...")
    with open(path, 'w') as json_file:
        json.dump(train_params_dict, json_file)

def _null_token_if_empty(string):
    """Map the empty string to 'null_value'; pass anything else through."""
    if string == '':
        return 'null_value'
    return string


# Spark User Defined Function (string result) that replaces empty strings
# with the 'null_value' token.
empty_string_filter = udf(_null_token_if_empty, StringType())

def clean(sparkdf,
          string_cols_imput_null,
          numeric_cols_imput_mean,
          numeric_cols_imput_zero,
          refresh=False):
    """
    Impute missing values, reusing stored per-column means when available.

    String columns get nulls and empty strings replaced by 'null_value',
    zero-imputed numeric columns get nulls replaced by 0, and each
    mean-imputed numeric column gets its nulls replaced by that column's
    mean. Means are read from the stored training params; a mean that is not
    stored yet is computed from ``sparkdf`` and the params file is rewritten.

    :param sparkdf: Spark DataFrame to clean.
    :param string_cols_imput_null: string columns to fill with 'null_value'.
    :param numeric_cols_imput_mean: numeric columns to fill with their mean.
    :param numeric_cols_imput_zero: numeric columns to fill with 0.
    :param refresh: when True, ignore stored params and recompute all means.
    :return: the cleaned Spark DataFrame.
    """
    logging.info("Cleaning data...")
    train_params = dict()
    if not refresh:
        try:
            train_params = load_train_params()
        except (IOError, OSError, ValueError):
            # Missing/unreadable file or invalid json: start from scratch.
            logging.warning("No training params found! Creating empty params dict...")
    sparkdf = sparkdf.fillna('null_value', subset=string_cols_imput_null)
    sparkdf = sparkdf.fillna(0, subset=numeric_cols_imput_zero)
    for col in string_cols_imput_null:
        sparkdf = sparkdf.withColumn(col, empty_string_filter(sparkdf[col]))
    for col in numeric_cols_imput_mean:
        if col not in train_params:
            logging.info("No training parameters found for this column! Creating empty params dict...")
        col_params = train_params.setdefault(col, dict())
        if 'mean' not in col_params:
            col_params['mean'] = get_mean(sparkdf, col)
        mean_value = col_params['mean']
        if mean_value is None:
            # No mean available (e.g. no non-null values): nothing to impute
            # with, and fillna does not accept None as a fill value.
            logging.warning("No mean available for column %s; leaving nulls untouched", col)
            continue
        # Restrict the fill to this column: without ``subset`` the mean of
        # one column would overwrite nulls in every other numeric column.
        sparkdf = sparkdf.fillna(mean_value, subset=[col])
    save_train_params(train_params)
    return sparkdf

def plot_roc(true_labels, scores):
    """
    Draw the ROC curve of ``scores`` against ``true_labels``.

    The curve is plotted together with the diagonal reference line, and the
    area under the curve is reported in the legend.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(true_labels, scores)
    area_under_curve = auc(false_positive_rate, true_positive_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange', label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

## Data loading

Creates a balanced Spark DataFrame by reading the analytics datamart table tests_es.dacc_cops_datamart.

In [None]:
# Analytics panel load

# String columns handed to clean() for 'null_value' imputation.
string_cols_imput_null = [
    'data_plan_c', 'voice_plan_c',
    'promo_code_vf', 'promo_code_tarif',
    'zip_code', 'region_code',
    'gender', 'type_ident', 'nationality',
]

# Numeric columns handed to clean() for mean imputation.
numeric_cols_imput_mean = [
    'n_lines', 'n_lines_pre', 'n_lines_post',
    'age',
    'months_to_end_promo_tarif', 'months_to_end_promo_vf',
]

# Numeric columns handed to clean() for zero imputation.
numeric_cols_imput_zero = [
    'voice_plan_change', 'data_plan_change',
    'n_calls_billing_c', 'n_calls_billing_c_minus_1',
    'n_calls_churn_c', 'n_calls_churn_c_minus_1',
    'n_calls_tariff_c', 'n_calls_tariff_c_minus_1',
    'n_calls_dsl_inc_c', 'n_calls_dsl_inc_c_minus_1',
    'n_calls_mobile_inc_c', 'n_calls_mobile_inc_c_minus_1',
    'n_calls_device_upgr_c', 'n_calls_device_upgr_c_minus_1',
    'n_calls_device_del_rep_c', 'n_calls_device_del_rep_c_minus_1',
    'n_calls_new_adds_c', 'n_calls_new_adds_c_minus_1',
    'n_calls_ser_man_c', 'n_calls_ser_man_c_minus_1',
]

# Column the binary label is derived from, and the table it is read from.
targetcol = 'n_calls_billing_c_plus_1'
datamart_name = 'tests_es.dacc_cops_datamart'

# Comma-separated select list shared by both queries.
featurecols = ','.join(string_cols_imput_null + numeric_cols_imput_mean + numeric_cols_imput_zero)

# Both queries select the same columns; they differ only in the constant
# label and in the condition applied to the target column.
_labelled_query_template = """
select
billing_cycle_id,
{featurecols},
{label} as label
from {datamart}
where {targetcol} {condition}
"""

_shared_query_args = dict(featurecols=featurecols, targetcol=targetcol, datamart=datamart_name)

sql_positive = _labelled_query_template.format(label=1, condition='> 0', **_shared_query_args)
sql_negative = _labelled_query_template.format(label=0, condition='= 0', **_shared_query_args)

# Positive and negative observations are read separately so the negatives
# can be downsampled; whole_data keeps the full, unbalanced union.
analytics_panel_positive = spark.sql(sql_positive)
analytics_panel_negative_full = spark.sql(sql_negative)
whole_data = analytics_panel_positive.union(analytics_panel_negative_full)

n_positive = analytics_panel_positive.count()
n_negative = analytics_panel_negative_full.count()
if n_negative == 0:
    raise ValueError("No negative observations returned by the datamart query; cannot balance classes")

# float() keeps this a true division even if `from __future__ import
# division` is not in effect; the fraction is capped at 1.0 because
# sampling without replacement cannot take more rows than exist.
sample_fraction = min(1.0, float(n_positive) / n_negative)
analytics_panel_negative = analytics_panel_negative_full.sample(False, sample_fraction)

# Balanced panel: all positives plus the downsampled negatives.
analytics_panel = analytics_panel_positive.union(analytics_panel_negative)

analytics_panel.select('billing_cycle_id', 'n_calls_billing_c', 'n_calls_billing_c_minus_1', 'label').show()

## Train - Test Split

In [None]:
logging.info("Defining Train - Test split...")
# Training rows come from the balanced panel; both test sets are cut from
# the full, unbalanced data, one billing_cycle_id range each.
train = analytics_panel.filter('billing_cycle_id < 20171201')
test_1 = whole_data.filter('billing_cycle_id >= 20171201 and billing_cycle_id < 20180101')
test_2 = whole_data.filter('billing_cycle_id >= 20180101 and billing_cycle_id < 20180201')

# logging.info("Saving data to csv to testing and processing outside this notebook...")
# train.toPandas().to_csv('./data/train.csv', encoding='utf-8')
# test_1.toPandas().to_csv('./data/test_1.csv', encoding='utf-8')
# test_2.toPandas().to_csv('./data/test_2.csv', encoding='utf-8')

# Set refresh=True for train if data changes
logging.info("Performing cleaning to train and test sets...")
# Same imputation column groups for every set. Train is cleaned first, so
# any mean that clean() has to compute and store comes from the training data.
_imputation_cols = (string_cols_imput_null, numeric_cols_imput_mean, numeric_cols_imput_zero)
train = clean(train, *_imputation_cols, refresh=False)
test_1 = clean(test_1, *_imputation_cols, refresh=False)
test_2 = clean(test_2, *_imputation_cols, refresh=False)

In [None]:
# Report the row count of each data set, one message per set.
for _message, _dataset in (
        ('Number of training observations: {n}', train),
        ('Number of test set 1 observations: {n}', test_1),
        ('Number of test set 2 observations: {n}', test_2)):
    print(_message.format(n=_dataset.count()))

## Logistic Regression Model definition

In [None]:
# Indexers + Encoders for categorical columns
# Each string column goes through <col> -> <col>_index -> <col>_vect.
indexers = []
encoders = []
for col in string_cols_imput_null:
    indexed_col = col + '_index'
    encoded_col = col + '_vect'
    indexers.append(StringIndexer(inputCol=col, outputCol=indexed_col, handleInvalid='skip'))
    encoders.append(OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col))

# Features data assembly: numeric columns first, then the encoded vectors.
encoded_cols = [col + '_vect' for col in string_cols_imput_null]
featurecols = numeric_cols_imput_zero + numeric_cols_imput_mean + encoded_cols
assembler = VectorAssembler(inputCols=featurecols, outputCol="features")

# Data normalization
# Standarization
standarizer = StandardScaler(inputCol="features", outputCol="normFeatures")

# Model, trained on the scaled feature vector
lr = LogisticRegression(featuresCol="normFeatures")

# Stage order: index, encode, assemble, scale, fit.
stages = indexers + encoders + [assembler, standarizer, lr]
# stages = indexers + encoders + [assembler] + [lr]

pipeline_lr = Pipeline(stages=stages)

## Logistic Regression Model training and tuning

### Training

In [None]:
# Parameters settings: bounds and number of points of each search axis.
regparam_max = 0.5
regparam_min = 0.0
regparam_n = 10
elasticnetparam_max = 1.0
elasticnetparam_min = 0.0
elasticnetparam_n = 5

# Evenly spaced candidate values, bounds included.
regparam_values = np.linspace(regparam_min, regparam_max, regparam_n)
elasticnetparam_values = np.linspace(elasticnetparam_min, elasticnetparam_max, elasticnetparam_n)

# Search Grid definition: every combination of regParam, elasticNetParam
# and fitIntercept.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, regparam_values)\
    .addGrid(lr.elasticNetParam, elasticnetparam_values)\
    .addGrid(lr.fitIntercept, [False, True])\
    .build()

# Validation Split definition: 80% of train to fit, the rest to score by AUC.
tvs = TrainValidationSplit(estimator=pipeline_lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'),
                           trainRatio=0.8)

# Model training
# Report the real grid size: it also includes the fitIntercept axis, so it
# is larger than elasticnetparam_n * regparam_n.
logging.info("Training model {n} times... better grab a coffee or go doing something else :-D".format(n=len(paramGrid)))
model_lr = tvs.fit(train)

### Best Model

In [None]:
best_model_lr = model_lr.bestModel
# The fitted logistic regression is the last stage of the best pipeline; its
# hyper-parameters are read from the underlying Java object.
best_lr_java = best_model_lr.stages[-1]._java_obj
print("Best regularization parameter: {regparam}".format(regparam=best_lr_java.getRegParam()))
print("Best elasticnet parameter: {elasticnetparam}".format(elasticnetparam=best_lr_java.getElasticNetParam()))
# This line reports fitIntercept, so label it as such.
print("Best fit intercept parameter: {fitintercept}".format(fitintercept=best_lr_java.getFitIntercept()))
#print("Best model training AUC: {AUC}".format(AUC=best_model_lr.stages[-1].summary.areaUnderROC))

## Logistic Regression Model saving

In [None]:
logging.info("Saving model to hdfs...")
# Persist the fitted pipeline, replacing any model already stored at the path.
_model_writer = best_model_lr.write().overwrite()
_model_writer.save('./cops/output/model.pickle')

## Logistic Regression Model Loading

In [None]:
# Reload the persisted pipeline from the path used when saving it.
logging.info("Loading model from hdfs...")
best_model_lr = PipelineModel.load('./cops/output/model.pickle')

## Logistic Regression Model performance testing

In [None]:
# Model output columns kept for evaluation, plus the true label.
_prediction_cols = ['rawPrediction', 'probability', 'prediction', 'label']

logging.info("Computing predictions for tests set 1")
predictions_1 = best_model_lr.transform(test_1).select(*_prediction_cols)
logging.info("Computing predictions for tests set 2")
predictions_2 = best_model_lr.transform(test_2).select(*_prediction_cols)

# rawPrediction and probability are vector columns, which the CSV writer
# cannot serialise; write parquet, as the log message states.
# NOTE(review): the output paths now end in .parquet instead of .csv --
# confirm that nothing downstream reads the old locations.
logging.info("Saving predictions as parquet files...")
predictions_1.write.mode('overwrite').parquet("/user/adesant3/cops/data/test_1_pred.parquet")
predictions_2.write.mode('overwrite').parquet("/user/adesant3/cops/data/test_2_pred.parquet")

## Logistic Regression Model Most important features

In [None]:
lrm = best_model_lr.stages[-1]
# The attribute names live in the metadata of the assembled 'features'
# column. predictions_1 only keeps the model output columns, so take the
# schema from a fresh transform of test_1 instead (only the schema is read
# here, no data is collected).
features_field = best_model_lr.transform(test_1).schema['features']
attr_groups = features_field.metadata['ml_attr']['attrs'].values()
attrs = sorted((attr['idx'], attr['name']) for attr in chain.from_iterable(attr_groups))
# Pair every feature name with the coefficient at the same vector position.
# NOTE(review): assumes 'normFeatures' keeps the positions of 'features' -- confirm.
coeffs_importance = [(name, lrm.coefficients[idx]) for idx, name in attrs]
coefficients_importance = pd.DataFrame(columns=['coef', 'coef_value'], data=coeffs_importance).sort_values('coef_value', ascending=False)
coefficients_importance.to_csv('./coefficients_importance.csv', encoding='utf-8')
display(coefficients_importance)