In [None]:
#export PYSPARK_PYTHON=/opt/cloudera/parcels/Anaconda/bin/python
#export PYSPARK_DRIVER_PYTHON=/opt/cloudera/parcels/Anaconda/bin/python
#unset PYSPARK_DRIVER_PYTHON_OPTS

In [1]:
from functools import partial
from datetime import date, timedelta, datetime

from numpy import nan as np_nan

from pyspark import StorageLevel
from pyspark.sql.functions import udf, col, decode, when, lit, lower, translate
from pyspark.sql.types import DoubleType, StringType, IntegerType

In [2]:
# Display the Spark version of the active context. `sc` is not created by any active
# cell shown here, so it is presumably injected by the PySpark kernel — TODO confirm.
sc.version

u'2.0.0.cloudera2'

In [None]:
#spark = (SparkSession.builder
#         .appName("Pre2Post Spain hyperparameter tuning")
#         .master("yarn")
#         .config("spark.submit.deployMode", "client")
#         .enableHiveSupport()
#         .getOrCreate()
#         )

# sc = spark.sparkContext

In [1]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees classifier with every hyperparameter written out explicitly.
# NOTE(review): `gbt` is not referenced by any later cell visible in this notebook.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    lossType="logistic",
    maxIter=20,
    stepSize=0.1,
    seed=None)

In [2]:
month_for_verifying_in_pospago = "201703"

month_for_getting_prepaid_data = (datetime.strptime(month_for_verifying_in_pospago, "%Y%m") 
                                  - timedelta(4*365/12)).strftime("%Y%m")

print(month_for_verifying_in_pospago, month_for_getting_prepaid_data)

('201703', '201610')


In [3]:
# Columns selected from the prepaid "ac final" table.
# "Tipo_Documento_Cliente" is intentionally not selected (noted as very uninformed).
useful_columns_from_acFinalPrepago = [
    "FECHA_EJECUCION",
    "MSISDN", "NUM_DOCUMENTO_CLIENTE",
    "NACIONALIDAD",
    "NUM_PREPAGO", "NUM_POSPAGO",
    "Tipo_Documento_Comprador",
    "X_FECHA_NACIMIENTO",
]

def empty_str_to_null(string_value):
    """Return ``None`` for an empty string and the value itself otherwise.

    :param string_value: value to normalise; non-string values pass through unchanged.
    :return: ``None`` when ``string_value`` equals ``""`` or ``u""``, else ``string_value``.
    """
    return None if string_value in ("", u"") else string_value

# Spark UDF wrapping empty_str_to_null, declared with a string return type.
empty_string_to_null = udf(empty_str_to_null, StringType())

def get_customer_age_raw(birthdate, month_for_getting_prepaid_data):
    """Compute the age in years at the first day of the reference month.

    :param birthdate: birth date as a "YYYYMMDD" value (string or number), or ``None``.
    :param month_for_getting_prepaid_data: reference month formatted as "%Y%m".
    :return: elapsed days divided by 365.25 as a float; NaN when ``birthdate`` is
        ``None`` or cannot be parsed as a "%Y%m%d" date.
    :raises ValueError: if the reference month does not match the "%Y%m" format.
    """
    if birthdate is None:
        return np_nan
    try:
        parsed_date = datetime.strptime(str(int(birthdate)), "%Y%m%d")
    except (TypeError, ValueError):
        # One malformed birthdate must not abort the job that runs this function
        # row by row; treat it exactly like a missing birthdate.
        return np_nan
    # Named `elapsed` so the `timedelta` class imported by the notebook is not shadowed.
    elapsed = datetime.strptime(month_for_getting_prepaid_data, "%Y%m") - parsed_date
    return elapsed.days / 365.25

def get_customer_age_udf(birthdate, month):
    """Build the age column expression for a birthdate column and a fixed reference month.

    :param birthdate: column passed to the generated UDF.
    :param month: reference month bound as ``month_for_getting_prepaid_data``.
    :return: result of applying the double-typed UDF to ``birthdate``.
    """
    age_for_month = partial(get_customer_age_raw, month_for_getting_prepaid_data=month)
    age_udf = udf(age_for_month, DoubleType())
    return age_udf(birthdate)

def subsitute_crappy_characters(string_column):
    """Replace known mis-encoded characters in a string value.

    Currently only the Unicode replacement character U+FFFD is handled; it is
    replaced by "ñ".

    :param string_column: text value to clean, or ``None``.
    :return: the cleaned text, or ``None`` when the input is ``None``.
    """
    if string_column is None:
        # Null values have no `.replace`; pass them through instead of raising.
        return None
    return (string_column
            .replace(u"\ufffd", u"ñ")
            # add more here in the future
           )

# Spark UDF wrapping subsitute_crappy_characters, declared with a string return type.
substitute_crappy_characters_udf = udf(subsitute_crappy_characters, StringType())

# Year and month of the prepaid snapshot, parsed from the "YYYYMM" string.
prepaid_year = int(month_for_getting_prepaid_data[:4])
prepaid_month = int(month_for_getting_prepaid_data[4:])

# Read that month of the prepaid table and keep only the useful columns.
acFinalPrepago = (sqlContext.read.table("raw_es.vf_pre_ac_final")
                  .where((col("year") == prepaid_year) & (col("month") == prepaid_month))
                  .select(*useful_columns_from_acFinalPrepago))

# Empty strings become nulls in these two columns (same order as before).
for nullable_column in ["X_FECHA_NACIMIENTO", "NUM_DOCUMENTO_CLIENTE"]:
    acFinalPrepago = acFinalPrepago.withColumn(nullable_column,
                                               empty_string_to_null(col(nullable_column)))

# Derive the age from the cleaned birthdate, then repair the nationality text.
age_column = get_customer_age_udf(col("X_FECHA_NACIMIENTO"), month_for_getting_prepaid_data)
nationality_column = substitute_crappy_characters_udf(col("NACIONALIDAD"))
acFinalPrepago = (acFinalPrepago
                  .withColumn("age_in_years", age_column)
                  .withColumn("NACIONALIDAD", nationality_column))

# The count runs a Spark job; its result sizes the partitions at roughly 500 rows each.
prepaid_row_count = acFinalPrepago.count()
acFinalPrepago = acFinalPrepago.repartition(int(prepaid_row_count / 500)+1)

In [4]:
# Nationalities kept as their own category; any other value is mapped to "Other".
# Both spellings of "Gran Bretaña" / "Gran Bretana" are listed.
most_frequent_countries = [
    u"España", u"Marruecos", u"Rumania",
    u"Colombia", u"Italia", u"Ecuador",
    u"Alemania", u"Estados Unidos", u"Francia",
    u"Brasil", u"Argentina", u"Afganistan",
    u"Bolivia", u"Gran Bretaña", u"Portugal",
    u"Paraguay", u"China", u"Gran Bretana",
    u"Venezuela", u"Honduras", u"Corea del Sur",
]

is_frequent_country = col("NACIONALIDAD").isin(most_frequent_countries)
bucketed_nationality = when(is_frequent_country, col("NACIONALIDAD")).otherwise(lit("Other"))

acFinalPrepago = acFinalPrepago.withColumn("NACIONALIDAD", bucketed_nationality)

In [5]:
# Year and month of the postpaid snapshot used to verify migration.
postpaid_year = int(month_for_verifying_in_pospago[:4])
postpaid_month = int(month_for_verifying_in_pospago[4:])

# Only the two identifier columns are needed; rows with a null in either are dropped.
acFinalPospago_4monthsLater = (sqlContext.read.table("raw_es.vf_pos_ac_final")
                               .where((col("year") == postpaid_year) & (col("month") == postpaid_month))
                               .select("x_id_red","x_num_ident")
                               .na.drop())

# The count runs a Spark job; its result sizes the partitions at roughly 500 rows each.
postpaid_row_count = acFinalPospago_4monthsLater.count()
acFinalPospago_4monthsLater = acFinalPospago_4monthsLater.repartition(int(postpaid_row_count / 500)+1)

# A prepaid row matches when both the line number and the document number agree.
same_line = acFinalPrepago["MSISDN"] == acFinalPospago_4monthsLater["x_id_red"]
same_document = acFinalPrepago["NUM_DOCUMENTO_CLIENTE"] == acFinalPospago_4monthsLater["x_num_ident"]

# Left join keeps every prepaid row; the flag is 1 when a postpaid match exists, else 0.
join_prepago_pospago = (acFinalPrepago
                        .join(acFinalPospago_4monthsLater, on=same_line & same_document, how="left")
                        .withColumn("migrated_to_postpaid", (~col("x_id_red").isNull()).cast(IntegerType())))

In [6]:
# Columns selected from the prepaid tariff table.
# 'TOP_Internacional' is intentionally not selected (its meaning is unknown).
useful_columns_from_tarificadorPre = [
    'MSISDN',
    'MOU', 'TOTAL_LLAMADAS', 'TOTAL_SMS',
    'MOU_Week', 'LLAM_Week', 'SMS_Week',
    'MOU_Weekend', 'LLAM_Weekend', 'SMS_Weekend',
    'MOU_VF', 'LLAM_VF', 'SMS_VF',
    'MOU_Fijo', 'LLAM_Fijo', 'SMS_Fijo',
    'MOU_OOM', 'LLAM_OOM', 'SMS_OOM',
    'MOU_Internacional', 'LLAM_Internacional', 'SMS_Internacional',
    'ActualVolume', 'Num_accesos',
    'Plan', 'Num_Cambio_Planes',
    'LLAM_COMUNIDAD_SMART', 'MOU_COMUNIDAD_SMART', 'LLAM_SMS_COMUNIDAD_SMART',
    'Flag_Uso_Etnica',
    'cuota_SMART8', 'cuota_SMART12', 'cuota_SMART16',
]


# Same snapshot month as the prepaid customer data.
tarif_year = int(month_for_getting_prepaid_data[:4])
tarif_month = int(month_for_getting_prepaid_data[4:])
in_snapshot_month = (col("year") == tarif_year) & (col("month") == tarif_month)

tarificadorPre = (sqlContext.read.table("raw_es.vf_pre_info_tarif")
                  .where(in_snapshot_month)
                  .select(*useful_columns_from_tarificadorPre))

# The count runs a Spark job; its result sizes the partitions at roughly 500 rows each.
tarif_row_count = tarificadorPre.count()
tarificadorPre = tarificadorPre.repartition(int(tarif_row_count / 500)+1)

# Tariff plan codes kept as their own category (order preserved from the original list).
plan_categories = [
    'PPIB7', 'PPFCL', 'PPIB4', 'PPXS8',
    'PPIB8', 'PPIB9', 'PPTIN', 'PPIB1',
    'PPVIS', 'PPREX', 'PPIB5', 'PPREU',
    'PPRET', 'PPFCS', 'PPIB6', 'PPREY',
    'PPVSP', 'PPIB2', 'PPIB3', 'PPRE2',
    'PPRE5', 'PPVE2', 'PPVE1', 'PPRES',
    'PPJ24', 'PPVE3', 'PPJAT', 'PPJMI',
]

# Keep a plan code only when it is listed in plan_categories; everything else becomes "Other".
is_known_plan = col("Plan").isin(plan_categories)
bucketed_plan = when(is_known_plan, col("Plan")).otherwise(lit("Other"))

tarificadorPre_2 = tarificadorPre.withColumn("Plan", bucketed_plan)

In [7]:
# Join against `tarificadorPre_2`, the tariff table whose "Plan" values outside
# `plan_categories` were replaced by "Other". Joining the raw `tarificadorPre`
# would silently drop that bucketing from the modelling dataset.
# The inner join keeps only customers present in both tables; the result is
# persisted on disk with replication (DISK_ONLY_2) because several later cells reuse it.
prepaid_dataset_1 = (join_prepago_pospago
                     .join(tarificadorPre_2, how="inner", on="MSISDN")
                     .persist(StorageLevel.DISK_ONLY_2))

In [8]:
# Class balance of the target: the recorded output shows 15545 rows flagged 1
# against 3066276 rows flagged 0.
prepaid_dataset_1.groupBy("migrated_to_postpaid").count().show()

+--------------------+-------+
|migrated_to_postpaid|  count|
+--------------------+-------+
|                   1|  15545|
|                   0|3066276|
+--------------------+-------+



# The model

In [9]:
# Numeric model inputs. Later cells cast each of them to double and feed them, in
# this order, to the feature vector assembler.
#
# Not included (they were commented out in the original list): 'SMS_Weekend',
# 'SMS_Fijo', 'LLAM_Internacional', 'SMS_Internacional', 'Num_Cambio_Planes',
# 'Flag_Uso_Etnica', 'cuota_SMART12', 'cuota_SMART16', 'migrated_to_postpaid',
# and the one-hot style names 'documenttype_*', 'nationality_*' and 'plan_*'.
numeric_columns = (
    ['NUM_PREPAGO', 'NUM_POSPAGO', 'age_in_years']
    + ['MOU', 'TOTAL_LLAMADAS', 'TOTAL_SMS']
    + ['MOU_Week', 'LLAM_Week', 'SMS_Week']
    + ['MOU_Weekend', 'LLAM_Weekend']
    + ['MOU_VF', 'LLAM_VF', 'SMS_VF']
    + ['MOU_Fijo', 'LLAM_Fijo']
    + ['MOU_OOM', 'LLAM_OOM', 'SMS_OOM']
    + ['MOU_Internacional']
    + ['ActualVolume', 'Num_accesos']
    + ['LLAM_COMUNIDAD_SMART', 'MOU_COMUNIDAD_SMART', 'LLAM_SMS_COMUNIDAD_SMART']
    + ['cuota_SMART8']
)

# Raw (string) categorical columns.
categorical_columns = [
    "Tipo_Documento_Comprador",
    "NACIONALIDAD",
    "Plan",
]

In [10]:
# Start from the joined dataset and cast every numeric input to double, one column at a time.
prepaid_dataset_2 = prepaid_dataset_1

for numeric_name in numeric_columns:
    as_double = col(numeric_name).cast(DoubleType())
    prepaid_dataset_2 = prepaid_dataset_2.withColumn(numeric_name, as_double)

In [11]:
from pyspark.ml.feature import StringIndexer

# Fit one indexer per categorical column, plus one for the target; all are fitted
# on the same (not yet indexed) dataset.
document_indexer = StringIndexer(inputCol="Tipo_Documento_Comprador", outputCol="documentType_indexed")
string_indexer_document = document_indexer.fit(prepaid_dataset_2)

nation_indexer = StringIndexer(inputCol="NACIONALIDAD", outputCol="nationality_indexed")
string_indexer_nation = nation_indexer.fit(prepaid_dataset_2)

plan_indexer = StringIndexer(inputCol="Plan", outputCol="tariffPlan_indexed")
string_indexer_plan = plan_indexer.fit(prepaid_dataset_2)

label_indexer = StringIndexer(inputCol="migrated_to_postpaid", outputCol="label")
string_indexer_label = label_indexer.fit(prepaid_dataset_2)

# Apply the fitted indexers one after another: document, nationality, plan, label.
prepaid_dataset_prep = prepaid_dataset_2
for fitted_indexer in (string_indexer_document,
                       string_indexer_nation,
                       string_indexer_plan,
                       string_indexer_label):
    prepaid_dataset_prep = fitted_indexer.transform(prepaid_dataset_prep)

categorical_columns_indexed = [
    "documentType_indexed",
    "nationality_indexed",
    "tariffPlan_indexed",
]

In [12]:
# Fixed seed so the 80/20 split, and every metric computed from it, can be
# reproduced on a re-run instead of changing with each execution.
SPLIT_SEED = 42
train, test = prepaid_dataset_prep.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Each count runs a Spark job; its result sizes the partitions at roughly 500 rows
# each before the split is cached for reuse.
train = train.repartition(int(train.count() / 500)+1).cache()
test = test.repartition(int(test.count() / 500)+1).cache()

In [13]:
# Median age computed on the training split only, ignoring rows without an age.
# relativeError=0.0 asks for the exact quantile.
train_with_known_age = train.na.drop(subset=["age_in_years"])
age_quantiles = train_with_known_age.approxQuantile("age_in_years",
                                                    probabilities=[0.5],
                                                    relativeError=0.0)
age_median = age_quantiles[0]

In [14]:
# Fill missing ages in both splits with the median learned from the training split.
train_filled, test_filled = (
    split.na.fill(age_median, subset=["age_in_years"])
    for split in (train, test)
)

In [15]:
from pyspark.ml.feature import VectorAssembler

# Feature vector = numeric inputs followed by the indexed categorical columns.
feature_columns = numeric_columns + categorical_columns_indexed
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Both assembled splits are cached because training and evaluation reuse them.
train_assembled = vector_assembler.transform(train_filled).cache()
test_assembled = vector_assembler.transform(test_filled).cache()

In [16]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier

# Checkpoint location used together with cacheNodeIds / checkpointInterval below.
# NOTE(review): the HDFS path is user-specific — adjust when running as another user.
sc.setCheckpointDir("hdfs:///user/jsotovi2/spark_checkpoints/")

rf = RandomForestClassifier(featuresCol="features", 
                            labelCol="label",
                            maxBins=64,
                            maxMemoryInMB=512,
                            cacheNodeIds=True,
                            checkpointInterval=1
                           )

# Grid: 9 tree depths x 1 forest size x 4 feature-subset strategies = 36 candidates.
# featureSubsetStrategy is a string-typed param, so the 60% fraction has to be
# given as the string "0.6"; a Python float is not a valid value for it.
hyperparam_grid_pipeline_random_forest = (ParamGridBuilder()
                                          .addGrid(rf.maxDepth, [19, 17, 15, 13, 11, 9, 7, 5, 3])
                                          .addGrid(rf.numTrees, [256])
                                          .addGrid(rf.featureSubsetStrategy, ["all", "0.6", "onethird", "sqrt"])
                                          .build()
                                          )

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Scores predictions by area under the ROC curve, reading the "rawPrediction"
# and "label" columns.
random_forest_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="rawPrediction")

In [18]:
from pyspark.ml.tuning import CrossValidator

# 5-fold cross-validation of the random forest over the whole hyperparameter grid,
# ranked with the ROC-AUC evaluator.
cross_validator_pipeline_random_forest = CrossValidator(
    numFolds=5,
    estimator=rf,
    evaluator=random_forest_evaluator,
    estimatorParamMaps=hyperparam_grid_pipeline_random_forest)

In [19]:
# Run the cross-validated grid search on the assembled training split.
# The recorded output of this cell is a KeyboardInterrupt, i.e. that run was stopped by hand.
cross_validator_model_rf = cross_validator_pipeline_random_forest.fit(train_assembled)

KeyboardInterrupt: 

In [None]:
# Best model found by the cross-validation.
best_rf = cross_validator_model_rf.bestModel

# String form of the param map of the model's Java-side "parent", reached through
# the underscore-prefixed `_call_java` helper.
# NOTE(review): this is not a public PySpark API — confirm it behaves the same in the deployed version.
string_best_model = best_rf._call_java("parent").extractParamMap().toString()

In [None]:
# Score the best model on the training split...
train_predictions = best_rf.transform(train_assembled)
auc_train = random_forest_evaluator.evaluate(train_predictions)

# ...and on the held-out split, with the same evaluator.
test_predictions = best_rf.transform(test_assembled)
auc_test = random_forest_evaluator.evaluate(test_predictions)

In [1]:
# (name, value) pairs to store: the winning parameters and both AUC scores.
results_to_store = [
    ("best_rf", string_best_model),
    ("auc_train", auc_train),
    ("auc_test", auc_test),
]
rdd_results = sc.parallelize(results_to_store)

In [3]:
# Write the results RDD as text under this HDFS path.
# NOTE(review): presumably fails when the path already exists — confirm before re-running.
rdd_results.saveAsTextFile("hdfs:///user/jsotovi2/rf_model_pre2post_results.txt")

In [4]:
# Read the stored results back from the same HDFS path, as lines of text.
rdd_re_readed = sc.textFile("hdfs:///user/jsotovi2/rf_model_pre2post_results.txt")

In [5]:
# Bring the stored lines to the driver for inspection; each line comes back as the
# string form of one (name, value) tuple, as the recorded output shows.
rdd_re_readed.collect()

[u"('best_rf', 'blablabla')", u"('auc_train', 0.81)", u"('auc_test', 0.8)"]

In [None]:
# Release every dataset that was persisted or cached earlier in the notebook.
# The assembled splits were cached as well, so they are released here too;
# previously they were left in the cache.
prepaid_dataset_1.unpersist()
train.unpersist()
test.unpersist()
train_assembled.unpersist()
test_assembled.unpersist()