In [1]:
from functools import partial
from datetime import date, timedelta, datetime

from numpy import nan as np_nan

from pyspark.sql.functions import udf, col, decode, when, lit, lower, translate
from pyspark.sql.types import DoubleType, StringType, IntegerType

In [2]:
# Month (YYYYMM) of the postpaid snapshot used to check whether a line migrated.
month_for_verifying_in_pospago = "201703"

# The prepaid snapshot month is obtained by stepping back a fixed number of days
# (4*365/12) from the first day of the verification month and formatting the
# result as YYYYMM.
# NOTE(review): this is day arithmetic, not calendar-month arithmetic. For
# "201703" it yields "201610", i.e. five calendar months earlier, while for other
# months it can yield four. Confirm which gap is actually intended.
verification_month_start = datetime.strptime(month_for_verifying_in_pospago, "%Y%m")
approx_four_months = timedelta(4*365/12)
month_for_getting_prepaid_data = (verification_month_start - approx_four_months).strftime("%Y%m")

print(month_for_verifying_in_pospago, month_for_getting_prepaid_data)

('201703', '201610')


In [3]:
# Columns selected from the prepaid "ac final" table when it is read.
useful_columns_from_acFinalPrepago = [
    "FECHA_EJECUCION",
    "MSISDN",
    "NUM_DOCUMENTO_CLIENTE",
    "NACIONALIDAD",
    "NUM_PREPAGO",
    "NUM_POSPAGO",
    # "Tipo_Documento_Cliente" is left out (original note: "Very uninformed").
    "Tipo_Documento_Comprador",
    "X_FECHA_NACIMIENTO",
]

def empty_str_to_null(string_value):
    """Map the empty string to None and pass every other value through unchanged.

    Args:
        string_value: Value of a string column for one row; may be None.

    Returns:
        None when ``string_value`` is the empty string, otherwise
        ``string_value`` itself (including None and whitespace-only strings).
    """
    # One comparison is enough: "" and u"" compare equal, so a separate
    # unicode branch can never be reached.
    if string_value == "":
        return None
    return string_value

# Spark UDF wrapper around empty_str_to_null; yields a string column.
empty_string_to_null = udf(empty_str_to_null, returnType=StringType())

def get_customer_age_raw(birthdate, month_for_getting_prepaid_data):
    """Return a customer's age in years at the start of the reference month.

    Args:
        birthdate: Birth date as a YYYYMMDD value (anything ``int()`` accepts,
            e.g. "19861001"), or None when it is unknown.
        month_for_getting_prepaid_data: Reference month as a "YYYYMM" string.

    Returns:
        float: Number of days between the birth date and the first day of the
        reference month, divided by 365.25. NaN when ``birthdate`` is None or
        cannot be parsed as a YYYYMMDD date.

    Raises:
        ValueError: If the reference month is not a valid "YYYYMM" string.
    """
    if birthdate is None:
        return np_nan
    try:
        parsed_date = datetime.strptime(str(int(birthdate)), "%Y%m%d")
    except ValueError:
        # A single malformed birth date (e.g. "00000000") would otherwise abort
        # the whole job when this runs as a UDF; treat it like a missing value.
        return np_nan
    reference_date = datetime.strptime(month_for_getting_prepaid_data, "%Y%m")
    # Named "elapsed" so the imported timedelta class is not shadowed.
    elapsed = reference_date - parsed_date
    return elapsed.days / 365.25

def get_customer_age_udf(birthdate, month):
    """Build the age-in-years column expression for a birth-date column.

    Args:
        birthdate: Column holding the birth dates passed to get_customer_age_raw.
        month: Reference month, forwarded to get_customer_age_raw as its
            ``month_for_getting_prepaid_data`` argument.

    Returns:
        The column produced by applying a double-typed UDF to ``birthdate``.
    """
    # Bind the reference month so the UDF only receives the per-row birth date.
    age_at_month = partial(get_customer_age_raw, month_for_getting_prepaid_data=month)
    age_udf = udf(age_at_month, DoubleType())
    return age_udf(birthdate)

def subsitute_crappy_characters(string_column):
    """Replace known mis-decoded characters in a string value.

    Currently only the Unicode replacement character (U+FFFD) is handled; it is
    turned into "ñ".

    Args:
        string_column: String value for one row, or None.

    Returns:
        The cleaned string, or None when the input is None.
    """
    # A NULL cell arrives as None when this runs as a UDF; calling .replace on
    # it would raise AttributeError and fail the job.
    if string_column is None:
        return None
    return (string_column
            .replace(u"\ufffd", u"ñ")
            # add more here in the future
           )

# Spark UDF wrapper around subsitute_crappy_characters; yields a string column.
substitute_crappy_characters_udf = udf(subsitute_crappy_characters, returnType=StringType())

# Year / month of the prepaid snapshot, taken from the YYYYMM string.
prepaid_year = int(month_for_getting_prepaid_data[:4])
prepaid_month = int(month_for_getting_prepaid_data[4:])

# Read the prepaid snapshot for that month, keep the useful columns, turn empty
# strings into nulls for the birth date and document number, derive the age in
# years, and repair mis-decoded characters in the nationality.
acFinalPrepago = (
    sqlContext.read.table("raw_es.vf_pre_ac_final")
    .where((col("year") == prepaid_year) & (col("month") == prepaid_month))
    .select(*useful_columns_from_acFinalPrepago)
    .withColumn("X_FECHA_NACIMIENTO", empty_string_to_null(col("X_FECHA_NACIMIENTO")))
    .withColumn("NUM_DOCUMENTO_CLIENTE", empty_string_to_null(col("NUM_DOCUMENTO_CLIENTE")))
    .withColumn("age_in_years",
                get_customer_age_udf(col("X_FECHA_NACIMIENTO"), month_for_getting_prepaid_data))
    .withColumn("NACIONALIDAD", substitute_crappy_characters_udf(col("NACIONALIDAD")))
)

# Size the partitions at roughly 500 rows each; count() runs a Spark job.
prepaid_row_count = acFinalPrepago.count()
acFinalPrepago = acFinalPrepago.repartition(int(prepaid_row_count / 500)+1)

In [4]:
# Nationalities that get their own indicator column. Both spellings of
# "Gran Bretaña" / "Gran Bretana" are listed on purpose as separate entries.
most_frequent_countries = [
    u"España",
    u"Marruecos",
    u"Rumania",
    u"Colombia",
    u"Italia",
    u"Ecuador",
    u"Alemania",
    u"Estados Unidos",
    u"Francia",
    u"Brasil",
    u"Argentina",
    u"Afganistan",
    u"Bolivia",
    u"Gran Bretaña",
    u"Portugal",
    u"Paraguay",
    u"China",
    u"Gran Bretana",
    u"Venezuela",
    u"Honduras",
    u"Corea del Sur",
]


# Keep the listed nationalities as they are; everything else becomes "Other".
nationality_bucketed = (when(col("NACIONALIDAD").isin(most_frequent_countries),
                             col("NACIONALIDAD"))
                        .otherwise(lit("Other")))
acFinalPrepago_2 = acFinalPrepago.withColumn("NACIONALIDAD", nationality_bucketed)

# One 1.0/0.0 indicator column per bucket, named nationality_<bucket> with
# spaces replaced by underscores.
for country in most_frequent_countries + ["Other"]:
    indicator_name = "nationality_"+country.replace(" ","_")
    indicator_value = when(col("NACIONALIDAD")==lit(country),1.0).otherwise(0.0)
    acFinalPrepago_2 = acFinalPrepago_2.withColumn(indicator_name, indicator_value)

# The categorical column is no longer needed once the indicators exist.
acFinalPrepago_2 = acFinalPrepago_2.drop("NACIONALIDAD")

In [5]:
# Document types that get their own indicator column.
most_frequent_documents = ["nif", "pasaporte", "tarj_residente", "cif"]

# Normalise the raw labels before matching: lower-case them, translate "." to
# nothing and " " to "_".
normalized_document_type = translate(translate(lower(col("Tipo_Documento_Comprador")), ".", ""),
                                     " ", "_")
acFinalPrepago_3 = acFinalPrepago_2.withColumn("Tipo_Documento_Comprador", normalized_document_type)

# Keep the listed document types as they are; everything else becomes "Other".
bucketed_document_type = (when(col("Tipo_Documento_Comprador").isin(most_frequent_documents),
                               col("Tipo_Documento_Comprador"))
                          .otherwise(lit("Other")))
acFinalPrepago_3 = acFinalPrepago_3.withColumn("Tipo_Documento_Comprador", bucketed_document_type)

# One 1.0/0.0 indicator column per bucket, named documenttype_<bucket>.
for document_type in most_frequent_documents + ["Other"]:
    indicator_name = "documenttype_"+document_type.replace(" ","_")
    indicator_value = when(col("Tipo_Documento_Comprador")==lit(document_type),1.0).otherwise(0.0)
    acFinalPrepago_3 = acFinalPrepago_3.withColumn(indicator_name, indicator_value)

# The categorical column is no longer needed once the indicators exist.
acFinalPrepago_3 = acFinalPrepago_3.drop("Tipo_Documento_Comprador")

In [6]:
# Year / month of the postpaid verification snapshot, taken from the YYYYMM string.
pospago_year = int(month_for_verifying_in_pospago[:4])
pospago_month = int(month_for_verifying_in_pospago[4:])

# Postpaid snapshot for the verification month, reduced to the two columns used
# as join keys; rows with a null in either of them are dropped.
acFinalPospago_4monthsLater = (
    sqlContext.read.table("raw_es.vf_pos_ac_final")
    .where((col("year") == pospago_year) & (col("month") == pospago_month))
    .select("x_id_red","x_num_ident")
    .na.drop()
)

# Size the partitions at roughly 500 rows each; count() runs a Spark job.
pospago_row_count = acFinalPospago_4monthsLater.count()
acFinalPospago_4monthsLater = acFinalPospago_4monthsLater.repartition(int(pospago_row_count / 500)+1)

# A prepaid row matches a postpaid row when both the line number and the
# customer document number agree.
same_line = acFinalPrepago_3["MSISDN"]==acFinalPospago_4monthsLater["x_id_red"]
same_document = acFinalPrepago_3["NUM_DOCUMENTO_CLIENTE"]==acFinalPospago_4monthsLater["x_num_ident"]

# Left join keeps every prepaid row; the target is 1 when a postpaid match was
# found (x_id_red is not null after the join) and 0 otherwise.
join_prepago_pospago = (
    acFinalPrepago_3
    .join(acFinalPospago_4monthsLater, on=same_line & same_document, how="left")
    .withColumn("migrated_to_postpaid", col("x_id_red").isNotNull().cast(IntegerType()))
)

In [7]:
# Columns selected from the prepaid tariff table when it is read.
useful_columns_from_tarificadorPre = [
    "MSISDN",
    "MOU", "TOTAL_LLAMADAS", "TOTAL_SMS",
    "MOU_Week", "LLAM_Week", "SMS_Week",
    "MOU_Weekend", "LLAM_Weekend", "SMS_Weekend",
    "MOU_VF", "LLAM_VF", "SMS_VF",
    "MOU_Fijo", "LLAM_Fijo", "SMS_Fijo",
    "MOU_OOM", "LLAM_OOM", "SMS_OOM",
    "MOU_Internacional", "LLAM_Internacional", "SMS_Internacional",
    "ActualVolume", "Num_accesos",
    "Plan", "Num_Cambio_Planes",
    # "TOP_Internacional" is left out (original note: "No idea of what is").
    "LLAM_COMUNIDAD_SMART", "MOU_COMUNIDAD_SMART", "LLAM_SMS_COMUNIDAD_SMART",
    "Flag_Uso_Etnica",
    "cuota_SMART8", "cuota_SMART12", "cuota_SMART16",
]


# Year / month of the prepaid snapshot, taken from the YYYYMM string.
tarif_year = int(month_for_getting_prepaid_data[:4])
tarif_month = int(month_for_getting_prepaid_data[4:])

# Prepaid tariff data for the same month as the prepaid snapshot, reduced to
# the useful columns.
tarificadorPre = (
    sqlContext.read.table("raw_es.vf_pre_info_tarif")
    .where((col("year") == tarif_year) & (col("month") == tarif_month))
    .select(*useful_columns_from_tarificadorPre)
)

# Size the partitions at roughly 500 rows each; count() runs a Spark job.
tarif_row_count = tarificadorPre.count()
tarificadorPre = tarificadorPre.repartition(int(tarif_row_count / 500)+1)

# Plan codes that get their own indicator column.
plan_categories = [
    "PPIB7", "PPFCL", "PPIB4", "PPXS8", "PPIB8", "PPIB9", "PPTIN",
    "PPIB1", "PPVIS", "PPREX", "PPIB5", "PPREU", "PPRET", "PPFCS",
    "PPIB6", "PPREY", "PPVSP", "PPIB2", "PPIB3", "PPRE2", "PPRE5",
    "PPVE2", "PPVE1", "PPRES", "PPJ24", "PPVE3", "PPJAT", "PPJMI",
]

# Keep the listed plan codes as they are; everything else becomes "Other".
plan_bucketed = when(col("Plan").isin(plan_categories), col("Plan")).otherwise(lit("Other"))
tarificadorPre_2 = tarificadorPre.withColumn("Plan", plan_bucketed)

# One 1.0/0.0 indicator column per bucket, named plan_<bucket>.
for plan_code in plan_categories + ["Other"]:
    indicator_name = "plan_"+plan_code.replace(" ","_")
    indicator_value = when(col("Plan")==lit(plan_code), 1.0).otherwise(0.0)
    tarificadorPre_2 = tarificadorPre_2.withColumn(indicator_name, indicator_value)

# The categorical column is no longer needed once the indicators exist.
tarificadorPre_3 = tarificadorPre_2.drop("Plan")

In [8]:
# Attach the tariff features to the labelled prepaid rows. The inner join keeps
# only lines present on both sides; the result is cached for the steps below.
prepaid_dataset_1 = (join_prepago_pospago
                     .join(tarificadorPre_3, on="MSISDN", how="inner")
                     .cache())

In [9]:
# Class balance of the target.
migration_counts = prepaid_dataset_1.groupBy("migrated_to_postpaid").count()
migration_counts.show()

+--------------------+-------+
|migrated_to_postpaid|  count|
+--------------------+-------+
|                   1|  15545|
|                   0|3066276|
+--------------------+-------+



# The model

In [10]:
# Columns used as model inputs.
#
# Not used (they were commented out in the original list): documenttype_Other,
# documenttype_nif, documenttype_tarj_residente, every nationality_* indicator,
# migrated_to_postpaid, SMS_Weekend, SMS_Fijo, LLAM_Internacional,
# SMS_Internacional, Num_Cambio_Planes, Flag_Uso_Etnica, cuota_SMART12,
# cuota_SMART16, and every plan_* indicator except the three listed below.
feature_columns = [
    # prepaid account columns
    "NUM_PREPAGO", "NUM_POSPAGO", "age_in_years",
    # document-type indicators
    "documenttype_cif", "documenttype_pasaporte",
    # tariff / usage columns
    "MOU", "TOTAL_LLAMADAS", "TOTAL_SMS",
    "MOU_Week", "LLAM_Week", "SMS_Week",
    "MOU_Weekend", "LLAM_Weekend",
    "MOU_VF", "LLAM_VF", "SMS_VF",
    "MOU_Fijo", "LLAM_Fijo",
    "MOU_OOM", "LLAM_OOM", "SMS_OOM",
    "MOU_Internacional",
    "ActualVolume", "Num_accesos",
    "LLAM_COMUNIDAD_SMART", "MOU_COMUNIDAD_SMART", "LLAM_SMS_COMUNIDAD_SMART",
    "cuota_SMART8",
    # plan indicators
    "plan_PPFCL", "plan_PPIB8", "plan_PPTIN",
]

In [11]:
# Cast every model input to double, replacing each column in place by name.
prepaid_dataset_2 = prepaid_dataset_1

for feature_name in feature_columns:
    feature_as_double = col(feature_name).cast(DoubleType())
    prepaid_dataset_2 = prepaid_dataset_2.withColumn(feature_name, feature_as_double)

In [12]:
# 80/20 train/test split. A fixed seed is passed so the split, and every number
# derived from it further down, can be reproduced on a re-run.
RANDOM_SPLIT_SEED = 42
train, test = prepaid_dataset_2.randomSplit([0.8, 0.2], seed=RANDOM_SPLIT_SEED)

In [13]:
# Ratio of negative to positive training rows.
negative_count = train.where(col("migrated_to_postpaid")==0).count()
positive_count = train.where(col("migrated_to_postpaid")==1).count()
scale_pos_weight = float(negative_count) / positive_count

scale_pos_weight

198.2337777777778

In [14]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Pack the selected feature columns into a single vector column named "features".
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

train_assembled = vector_assembler.transform(train)

# Index the target into a "label" column; the indexer is fitted on the training rows.
# NOTE(review): StringIndexer orders labels by frequency by default, so which
# class becomes 0.0 depends on the class balance of the data - confirm that the
# resulting mapping is the intended one.
string_indexer_label = StringIndexer(inputCol="migrated_to_postpaid", outputCol="label")
string_indexer_label_model = string_indexer_label.fit(train_assembled)

train_prepared = string_indexer_label_model.transform(train_assembled)

In [16]:
# Persist the prepared training set as a parquet table, overwriting any previous
# version. The table name carries the prepaid snapshot month; it is derived from
# month_for_getting_prepaid_data so it cannot go stale when the month changes.
training_table_name = "tests_es.training_set_pre2post_" + month_for_getting_prepaid_data

(train_prepared
 # roughly 500 rows per partition; count() runs a Spark job
 .repartition(int(train_prepared.count() / 500)+1)
 .write
 .saveAsTable(training_table_name,
              format="parquet",
              mode="overwrite")
 )

In [25]:
# Inspect the Spark data type of the assembled "features" column.
features_field = train_assembled.schema["features"]
type(features_field.dataType)

pyspark.ml.linalg.VectorUDT

In [28]:
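# Abandoned experiment: convert the assembled feature vectors to dense form for
# inspection. It failed with
#   AttributeError: 'SparseVector' object has no attribute 'toDense'
# (see the traceback below). SparseVector does provide toArray(), so a working
# version would build the dense vector from that, e.g. DenseVector(x.toArray()).

#from pyspark.ml.linalg import VectorUDT

#toDense_udf = udf(lambda x: x.toDense(), VectorUDT())

#train_assembled.select(toDense_udf(col("features"))).show()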

Py4JJavaError: An error occurred while calling o808.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 73.0 failed 4 times, most recent failure: Lost task 0.3 in stage 73.0 (TID 193321, vgddp531hr.dc.sedc.internal.vodafone.com, executor 670): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 172, in main
    process()
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 106, in <lambda>
    func = lambda _, it: map(mapper, it)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 92, in <lambda>
    mapper = lambda a: udf(*a)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 68, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "<ipython-input-28-a3b24b173ce8>", line 3, in <lambda>
AttributeError: 'SparseVector' object has no attribute 'toDense'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:124)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:68)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:779)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:779)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1624)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1613)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1893)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1906)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:347)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:39)
	at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2193)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2192)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2199)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1935)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1934)
	at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2576)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:1934)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2149)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:239)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 172, in main
    process()
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 106, in <lambda>
    func = lambda _, it: map(mapper, it)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 92, in <lambda>
    mapper = lambda a: udf(*a)
  File "/opt/cloudera/parcels/SPARK2-2.0.0.cloudera2-1.cdh5.7.0.p0.118100/lib/spark2/python/pyspark/worker.py", line 68, in <lambda>
    return lambda *a: toInternal(f(*a))
  File "<ipython-input-28-a3b24b173ce8>", line 3, in <lambda>
AttributeError: 'SparseVector' object has no attribute 'toDense'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:124)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:68)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:779)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:779)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	... 1 more
