In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark.sql.functions import countDistinct

In [2]:
def reading_datasource(data_source):
    path = "gs://ds-url-catag/plenty_stickers_data/"+data_source+"/aggregate_data/"
    df_new = spark.read.parquet(path)
    df_new = df_new.withColumn('date', concat(df_new.time.substr(1, 10)))
    df_new = df_new.withColumn('month', concat(df_new.time.substr(4, 5)))
    return df_new

#number of days logged in  
def get_numofdays(dataframe,feature_concat):
    new_col_name = feature_concat+"_sent_nofdays"
    distinctdate = dataframe.groupby('user_id_n').agg(countDistinct('date').alias("nofdays"))
    new_distinctdate = distinctdate.selectExpr("user_id_n as user_id_n","nofdays as "+ new_col_name)
    return new_distinctdate

def get_date_list(dataframe):
    dataframe_new = dataframe.select('user_id_n','date').dropDuplicates()
    from pyspark.sql import functions as F
    date_list = dataframe_new.groupby("user_id_n").agg(F.collect_list("date").alias("date_list"))
    return date_list

def get_condays(read_data,data_source,feature_concat):
    
    #reading in the data
    new_base = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/list_dates/")

    #computing the consecutive days 
    from pyspark.sql.types import FloatType,StringType
    from datetime import datetime,timedelta
    #function to derive average consecutive days
    def con_days(raw):
        raw = list(set(raw))
        n = len(raw)
        raw = sorted(raw, key=lambda d: map(int, d.split('-')))
        no_of_days = 0
        no_of_groups = 0
        group = 0
        total_days = 0
        res = []
        for i in range(0,n-1):
            current_date = datetime.strptime(raw[i], "%Y-%m-%d")
            current_date_mod = datetime.strftime(current_date, "%Y-%m-%d")
            con_date = current_date + timedelta(days=1)
            con_date_mod = datetime.strftime(con_date, "%Y-%m-%d")

            next_date = datetime.strptime(raw[i+1], "%Y-%m-%d")
            next_date_mod = datetime.strftime(next_date, "%Y-%m-%d")
            if con_date_mod == next_date_mod :
                no_of_days += 1
                total_days += 1
                if no_of_days == 1:
                    group += 1
                    no_of_groups = group

            else:
                no_of_days = 0
        res.append(total_days)
        return total_days

    consecutive_days = udf(con_days,StringType())

    def con_groups(raw):
        raw = list(set(raw))
        n = len(raw)
        raw = sorted(raw, key=lambda d: map(int, d.split('-')))
        no_of_days = 0
        no_of_groups = 0
        group = 0
        total_days = 0
        res = []
        for i in range(0,n-1):
            current_date = datetime.strptime(raw[i], "%Y-%m-%d")
            current_date_mod = datetime.strftime(current_date, "%Y-%m-%d")
            con_date = current_date + timedelta(days=1)
            con_date_mod = datetime.strftime(con_date, "%Y-%m-%d")

            next_date = datetime.strptime(raw[i+1], "%Y-%m-%d")
            next_date_mod = datetime.strftime(next_date, "%Y-%m-%d")
            if con_date_mod == next_date_mod :
                no_of_days += 1
                total_days += 1
                if no_of_days == 1:
                    group += 1
                    no_of_groups = group

            else:
                no_of_days = 0
        return no_of_groups

    groups = udf(con_groups,StringType())
    new_base = new_base.withColumn("Consective_days",consecutive_days('date_list'))
    new_base = new_base.withColumn("Consective_groups",groups('date_list'))
    new_base = new_base.withColumn("avg_con_days",when((col('Consective_days') != 0)  & (col('Consective_groups') != 0) , round((col('Consective_days')/col('Consective_groups')))).otherwise(0))
    final_base = new_base.select('user_id_n','avg_con_days')
    new_col_name = feature_concat+"_avg_con_days"
    final_base_one = final_base.selectExpr("user_id_n as user_id_n","avg_con_days as "+ new_col_name)
    return final_base_one

def get_stick_send(dataframe,feature_concat):
    #sticker packs sent
    stickers_sent = dataframe.select('user_id_n','sticker_pack_id').groupby('user_id_n').agg(count('sticker_pack_id').alias('sticker_packs'))
    new_col_name = feature_concat+"_sticker_packs"
    final_base_one = stickers_sent.selectExpr("user_id_n as user_id_n","sticker_packs as "+ new_col_name)
    return final_base_one

def get_stick_dist(dataframe,feature_concat):
    #sticker packs sent
    stickers_distinct_sent = dataframe.select('user_id_n','sticker_pack_id').groupby('user_id_n').agg(countDistinct('sticker_pack_id').alias('distinct_stick_packs'))
    new_col_name = feature_concat+"_distinct_stick_packs"
    final_base_one = stickers_distinct_sent.selectExpr("user_id_n as user_id_n","distinct_stick_packs as "+ new_col_name)
    return final_base_one

def get_stick_type(dataframe,feature_concat):
    #reading in sticker type data
    Sticker_types = spark.read.parquet("gs://ds-url-catag/Stickers/stick_bytype/agg_proc_stickertypes/*.parquet")
    Sticker_types = Sticker_types.dropDuplicates()

    Sticker_categories = spark.read.parquet("gs://ds-url-catag/Stickers/processed_categories/*/*/*.parquet")
    Sticker_categories = Sticker_categories.dropDuplicates()
    Sticker_categories = Sticker_categories.selectExpr("Sticker_id", "name","animated","Type as Type_n","category")

    #joinging type and category data
    data_join_type = dataframe.join(Sticker_types,Sticker_types.id==dataframe.sticker_pack_id,'left').select([dataframe.user_id_n,dataframe.sticker_pack_id]+[Sticker_types.type])
    data_join_categories = data_join_type.join(Sticker_categories,Sticker_categories.Sticker_id==data_join_type.sticker_pack_id,'left').select([data_join_type.user_id_n,data_join_type.sticker_pack_id,data_join_type.type]+[Sticker_categories.name,Sticker_categories.animated,Sticker_categories.Type_n,Sticker_categories.category])
    data_join_categories = data_join_categories.dropDuplicates()
    #deriving new columns
    data_join_categories = data_join_categories.withColumn('Final_Type',when(col('type').isNull(),col('Type_n')).otherwise(col('type')))
    data_join_categories = data_join_categories.withColumn('Final_Type',when(col('Final_Type') == 'true','FREE').otherwise(col('Final_Type')))
    #Deriving sticker type variables
    #paid
    data_join_categories  = data_join_categories.withColumn("sent_is_paid",when(col('Final_Type') == 'PAID',data_join_categories.sticker_pack_id).otherwise(None))
    data_join_categories  = data_join_categories.withColumn("sent_is_free",when(col('Final_Type') == 'FREE',data_join_categories.sticker_pack_id).otherwise(None))
    data_join_categories  = data_join_categories.withColumn("sent_is_subscribed",when(col('Final_Type') == 'SUBSCRIPTION',data_join_categories.sticker_pack_id).otherwise(None))
    data_join_categories  = data_join_categories.withColumn("sent_is_discontinued",when(col('Final_Type') == 'DISCONTINUED',data_join_categories.sticker_pack_id).otherwise(None))

    #replacing strings with numbers for clustering
    data_join_categories = data_join_categories.withColumn('sent_is_paid',when(col('sent_is_paid').isNull(),0).otherwise(1))
    data_join_categories = data_join_categories.withColumn('sent_is_free',when(col('sent_is_free').isNull(),0).otherwise(1))
    data_join_categories = data_join_categories.withColumn('sent_is_subscribed',when(col('sent_is_subscribed').isNull(),0).otherwise(1))
    data_join_categories = data_join_categories.withColumn('sent_is_discontinued',when(col('sent_is_discontinued').isNull(),0).otherwise(1))

    #aggregating the data
    more_stick_base = data_join_categories.groupBy('user_id_n').agg(func.sum('sent_is_paid').alias('sum_paid'),func.sum('sent_is_free').alias('sum_free'),func.sum('sent_is_subscribed').alias('sum_subs'),func.sum('sent_is_discontinued').alias('sum_discont'))
    col_one =feature_concat+"_sum_paid"
    col_two =feature_concat+"_sum_free"
    col_three =feature_concat+"_sum_subs"
    col_four = feature_concat+"_sum_discont"
    final_base_one = more_stick_base.selectExpr("user_id_n as user_id_n","sum_paid as "+ col_one ,"sum_free as "+ col_two,"sum_subs as "+ col_three,"sum_discont as "+ col_four)
    return final_base_one

def get_result(data_source,dataframe):
    print data_source
    #reading in all the written data 
    one = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/distinctdays/")
    two = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/Avg_con_days/")
    three = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/stick_packs_sent/")
    four = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/dist_stick_sent/")
    five = spark.read.parquet("gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/sticker_type_cat/")
    #join all datasets
    data_one = one.join(two,two.user_id_n == one.user_id_n).drop(two.user_id_n)
    data_two = data_one.join(three,three.user_id_n == data_one.user_id_n).drop(three.user_id_n)
    data_three = data_two.join(four,four.user_id_n == data_two.user_id_n).drop(four.user_id_n)
    data_four = data_three.join(five,five.user_id_n == data_three.user_id_n).drop(five.user_id_n)
    data_city = dataframe.groupby('user_id_n','City').count()
    data_city = data_city.dropDuplicates()
    data_five = data_four.join(data_city,data_city.user_id_n == data_four.user_id_n).drop(data_city.user_id_n)
    new_data = data_five.repartition(200)
    return new_data


#write the final file to all the data
def writing_to_parquet(data_source,dataframe,location):
    print "writing"
    path =  "gs://ds-url-catag/plenty_stickers_data/"+data_source+"/derived_features/temp/"+location+"/"
    print path 
    dataframe.dropDuplicates().write.mode('overwrite').parquet(path)  
    return "Data written"

In [4]:
if __name__ == '__main__':
    Sticker_datasources = ['event=BBM-STICKER-SEND']
    # values for Sticker_Datasources
    # 'event=BBM-STICKER-RECEIVED','event=BBM-STICKER_SPONSORED_LINK-CLICK','event=BBM-STICKER-CLICK','event=BBM-STICKER-DOWNLOAD','event=BBM-STICKER-SEND'
    for ds in Sticker_datasources:
        split_name = ds.split('-')
        if split_name[1] == "STICKER_SPONSORED_LINK": 
            feature_concat = "sp_"+split_name[2].lower()
        else:
            feature_concat = split_name[2].lower()
        read_data = reading_datasource(ds)
        features =['distinctdays','list_dates','Avg_con_days','dist_stick_sent','sticker_type_cat']
        #values for features 
        #'distinctdays','list_dates','Avg_con_days','stick_packs_sent','dist_stick_sent','sticker_type_cat']
        for sub_feature in features:
            if sub_feature == 'distinctdays':
                numofdays = get_numofdays(read_data,feature_concat)     
                writing_to_parquet(ds,numofdays,'distinctdays')
            elif sub_feature == 'list_dates':
                datelist = get_date_list(read_data)          
                writing_to_parquet(ds,datelist,'list_dates')
            elif sub_feature == 'Avg_con_days':
                avgcondays = get_condays(read_data,ds,feature_concat)          
                writing_to_parquet(ds,avgcondays,'Avg_con_days')
            elif sub_feature == 'stick_packs_sent':
                sticksent = get_stick_send(read_data,feature_concat)          
                writing_to_parquet(ds,sticksent,'stick_packs_sent')
            elif sub_feature == 'dist_stick_sent':
                stickdistsent = get_stick_dist(read_data,feature_concat)          
                writing_to_parquet(ds,stickdistsent,'dist_stick_sent')
            elif sub_feature == 'sticker_type_cat':
                sticktypecat = get_stick_type(read_data,feature_concat)      
                writing_to_parquet(ds,sticktypecat,'sticker_type_cat')
        result = get_result(ds,read_data)
        result.dropDuplicates().write.mode('overwrite').parquet("gs://ds-url-catag/plenty_stickers_data/"+ds+"/derived_features/Final/")        

writing
gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/derived_features/temp/distinctdays/
writing
gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/derived_features/temp/list_dates/
writing
gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/derived_features/temp/Avg_con_days/
writing
gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/derived_features/temp/dist_stick_sent/
writing
gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/derived_features/temp/sticker_type_cat/


Py4JJavaError: An error occurred while calling o398.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:213)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:435)
	at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:471)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:50)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:609)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:508)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 297 in stage 39.0 failed 4 times, most recent failure: Lost task 297.3 in stage 39.0 (TID 23732, 10.29.88.21, executor 350): java.io.FileNotFoundException: File not found : gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/aggregate_data/part-00005-09050e8f-af69-4be2-80d0-76d306ea9177-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:186)
	... 45 more
Caused by: java.io.FileNotFoundException: File not found : gs://ds-url-catag/plenty_stickers_data/event=BBM-STICKER-SEND/aggregate_data/part-00005-09050e8f-af69-4be2-80d0-76d306ea9177-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
