In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'manipulate'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [12]:
# load data
fil = '../data/youtubevideos.csv'
schem = StructType([StructField('video_id', StringType()), StructField('trending_date', DateType()),
                    StructField('title', StringType()), StructField('channel_title', StringType()),
                    StructField('category_id', IntegerType()), StructField('publish_time', TimestampType()),
                    StructField('tags', StringType()), StructField('views', IntegerType()),
                    StructField('likes', IntegerType()), StructField('dislikes', IntegerType()),
                    StructField('comment_count', IntegerType()), StructField('thumbnail_link', StringType()),
                    StructField('comments_disabled', BooleanType()), StructField('ratings_disabled', BooleanType()),
                    StructField('video_error_or_removed', BooleanType()), StructField('description', StringType())])
# for the date / timestamp columns, could read in as string, then use withColumn to cast as dates as in
# withColumn(col_name, to_date(col_name, date_format_string))
# also could maybe need regexp_replace

youtube = spark.read.format('csv').options(header=True, dateFormat='yy.dd.MM', timestampFormat='yyyy-MM-ddTkk:mm:ss.SSZ').schema(schem).load(fil)

In [13]:
# talk
print('%d rows'%youtube.count())
display(youtube.limit(10).toPandas())

48137 rows


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,2017-11-14,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13 17:13:01,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13 07:30:00,"""last week tonight trump presidency""|""last wee...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,2017-11-14,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12 19:05:24,"""racist superman""|""rudy""|""mancuso""|""king""|""bac...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,2017-11-14,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13 11:00:04,"""rhett and link""|""gmm""|""good mythical morning""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,2017-11-14,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12 18:01:41,"""ryan""|""higa""|""higatv""|""nigahiga""|""i dare you""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...
5,gHZ1Qz0KiKM,2017-11-14,2 Weeks with iPhone X,iJustine,28,2017-11-13 19:07:23,"""ijustine""|""week with iPhone X""|""iphone x""|""ap...",119180,9763,511,1434,https://i.ytimg.com/vi/gHZ1Qz0KiKM/default.jpg,False,False,False,Using the iPhone for the past two weeks -- her...
6,39idVpFF7NQ,2017-11-14,Roy Moore & Jeff Sessions Cold Open - SNL,Saturday Night Live,24,2017-11-12 05:37:17,"""SNL""|""Saturday Night Live""|""SNL Season 43""|""E...",2103417,15993,2445,1970,https://i.ytimg.com/vi/39idVpFF7NQ/default.jpg,False,False,False,Embattled Alabama Senate candidate Roy Moore (...
7,nc99ccSXST0,2017-11-14,5 Ice Cream Gadgets put to the Test,CrazyRussianHacker,28,2017-11-12 21:50:37,"""5 Ice Cream Gadgets""|""Ice Cream""|""Cream Sandw...",817732,23663,778,3432,https://i.ytimg.com/vi/nc99ccSXST0/default.jpg,False,False,False,Ice Cream Pint Combination Lock - http://amzn....
8,jr9QtXwC9vc,2017-11-14,The Greatest Showman | Official Trailer 2 [HD]...,20th Century Fox,1,2017-11-13 14:00:23,"""Trailer""|""Hugh Jackman""|""Michelle Williams""|""...",826059,3543,119,340,https://i.ytimg.com/vi/jr9QtXwC9vc/default.jpg,False,False,False,"Inspired by the imagination of P.T. Barnum, Th..."
9,TUmyygCMMGA,2017-11-14,Why the rise of the robots won’t mean the end ...,Vox,25,2017-11-13 13:45:16,"""vox.com""|""vox""|""explain""|""shift change""|""futu...",256426,12654,1363,2368,https://i.ytimg.com/vi/TUmyygCMMGA/default.jpg,False,False,False,"For now, at least, we have better things to wo..."


In [15]:
youtube.select('*').show()

+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|       publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|   2017-11-14|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13 17:13:01|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            false|           false| 

In [19]:
# trimming
trimmed = youtube.select('title', trim('title').alias('trimmed'), ltrim('title').alias('ltrimmed'), rtrim('title').alias('rtrimmed')).toPandas()
display(trimmed.head())

Unnamed: 0,title,trimmed,ltrimmed,rtrimmed
0,WE WANT TO TALK ABOUT OUR MARRIAGE,WE WANT TO TALK ABOUT OUR MARRIAGE,WE WANT TO TALK ABOUT OUR MARRIAGE,WE WANT TO TALK ABOUT OUR MARRIAGE
1,The Trump Presidency: Last Week Tonight with J...,The Trump Presidency: Last Week Tonight with J...,The Trump Presidency: Last Week Tonight with J...,The Trump Presidency: Last Week Tonight with J...
2,"Racist Superman | Rudy Mancuso, King Bach & Le...","Racist Superman | Rudy Mancuso, King Bach & Le...","Racist Superman | Rudy Mancuso, King Bach & Le...","Racist Superman | Rudy Mancuso, King Bach & Le..."
3,Nickelback Lyrics: Real or Fake?,Nickelback Lyrics: Real or Fake?,Nickelback Lyrics: Real or Fake?,Nickelback Lyrics: Real or Fake?
4,I Dare You: GOING BALD!?,I Dare You: GOING BALD!?,I Dare You: GOING BALD!?,I Dare You: GOING BALD!?


In [20]:
# case when
lkd = youtube.select('Likes', 'Dislikes', when(col('Likes') > col('Dislikes'), ':-)')\
                     .when(col('Likes') < col('Dislikes'), ':-(').otherwise(':-|').alias('NetPref'))
lkd.show(10)
lkd.select('NetPref').groupBy('NetPref').count().show()

+------+--------+-------+
| Likes|Dislikes|NetPref|
+------+--------+-------+
| 57527|    2966|    :-)|
| 97185|    6146|    :-)|
|146033|    5339|    :-)|
| 10172|     666|    :-)|
|132235|    1989|    :-)|
|  9763|     511|    :-)|
| 15993|    2445|    :-)|
| 23663|     778|    :-)|
|  3543|     119|    :-)|
| 12654|    1363|    :-)|
+------+--------+-------+
only showing top 10 rows

+-------+-----+
|NetPref|count|
+-------+-----+
|    :-(|  576|
|    :-)|40192|
|    :-|| 7369|
+-------+-----+



In [23]:
# concat
youtube.select('channel_title', 'title', concat_ws('-', col('channel_title'), col('title')).alias('alltitle')).show(10, truncate=False)

+---------------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------+
|channel_title        |title                                                            |alltitle                                                                          |
+---------------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------+
|CaseyNeistat         |WE WANT TO TALK ABOUT OUR MARRIAGE                               |CaseyNeistat-WE WANT TO TALK ABOUT OUR MARRIAGE                                   |
|LastWeekTonight      |The Trump Presidency: Last Week Tonight with John Oliver (HBO)   |LastWeekTonight-The Trump Presidency: Last Week Tonight with John Oliver (HBO)    |
|Rudy Mancuso         |Racist Superman | Rudy Mancuso, King Bach & Lele Pons            |Rudy Mancuso-Racist Superman | Rudy Mancuso, K

In [24]:
youtube.select('trending_date', year('trending_date'), month('trending_date')).show(5)

+-------------+-------------------+--------------------+
|trending_date|year(trending_date)|month(trending_date)|
+-------------+-------------------+--------------------+
|   2017-11-14|               2017|                  11|
|   2017-11-14|               2017|                  11|
|   2017-11-14|               2017|                  11|
|   2017-11-14|               2017|                  11|
|   2017-11-14|               2017|                  11|
+-------------+-------------------+--------------------+
only showing top 5 rows



In [28]:
youtube.select('title', 'trending_date', 'publish_time', datediff(col('trending_date'), col('publish_time')).alias('TimeToTrend'))\
    .orderBy(col('TimeToTrend').desc()).show(10, truncate=False)

+--------------------------------+-------------+-------------------+-----------+
|title                           |trending_date|publish_time       |TimeToTrend|
+--------------------------------+-------------+-------------------+-----------+
|Budweiser - Original Whazzup? ad|2018-02-05   |2006-07-23 09:24:11|4215       |
|Kramer vs Kramer-Clou Scene     |2018-01-09   |2008-04-05 19:22:40|3566       |
|Kramer vs Kramer-Clou Scene     |2018-01-08   |2008-04-05 19:22:40|3565       |
|Kramer vs Kramer-Clou Scene     |2018-01-07   |2008-04-05 19:22:40|3564       |
|Kramer vs Kramer-Clou Scene     |2018-01-06   |2008-04-05 19:22:40|3563       |
|Behind The Sounds: That's Not Me|2017-11-28   |2008-06-17 01:07:56|3451       |
|Behind The Sounds: That's Not Me|2017-11-27   |2008-06-17 01:07:56|3450       |
|Behind The Sounds: That's Not Me|2017-11-26   |2008-06-17 01:07:56|3449       |
|Behind The Sounds: That's Not Me|2017-11-25   |2008-06-17 01:07:56|3448       |
|SAOIRSE RONAN - MORONIC (IR

In [37]:
titleWords = youtube.select('title', split(lower(col('title')), ' ').alias('parsedTitle'))
titleWords.show(10, truncate=False)

+-----------------------------------------------------------------+------------------------------------------------------------------------------+
|title                                                            |parsedTitle                                                                   |
+-----------------------------------------------------------------+------------------------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                               |[we, want, to, talk, about, our, marriage]                                    |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)   |[the, trump, presidency:, last, week, tonight, with, john, oliver, (hbo)]     |
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons            |[racist, superman, |, rudy, mancuso,, king, bach, &, lele, pons]              |
|Nickelback Lyrics: Real or Fake?                                 |[nickelback, lyrics:, real, or, fake?]             

In [41]:
titleWords.select('title', array_contains(col('parsedTitle'), 'trump').alias('aca')).where(col('aca')==True).show(truncate=False)

+-----------------------------------------------------------------------------------------------+----+
|title                                                                                          |aca |
+-----------------------------------------------------------------------------------------------+----+
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)                                 |true|
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)                                 |true|
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)                                 |true|
|Donald Trump makes ASEAN handshake photo op go awry                                            |true|
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)                                 |true|
|Donald Trump makes ASEAN handshake photo op go awry                                            |true|
|President Donald Trump Makes Statement From White House On Asia Trip (Fu

In [43]:
titleWords.select('title', array_distinct(array_remove(col('parsedTitle'), 'the')).alias('reparsed')).show(10, truncate=False)

+-----------------------------------------------------------------+----------------------------------------------------------------------+
|title                                                            |reparsed                                                              |
+-----------------------------------------------------------------+----------------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                               |[we, want, to, talk, about, our, marriage]                            |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)   |[trump, presidency:, last, week, tonight, with, john, oliver, (hbo)]  |
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons            |[racist, superman, |, rudy, mancuso,, king, bach, &, lele, pons]      |
|Nickelback Lyrics: Real or Fake?                                 |[nickelback, lyrics:, real, or, fake?]                                |
|I Dare You: GOING BALD!?  

In [51]:
# create a pysaprk UDF, which can get farmed out to all workers on the cluster
def squareCol(acol):
    if (acol is None) or np.isnan(acol):
        res = 0
    else:
        res = int(acol**2)
    return res
squareColUDF = udf(lambda x: squareCol(x), IntegerType())

In [53]:
youtube.select('likes', squareColUDF('likes').alias('squaredLikes')).where(col('likes').isNotNull()).show()

Py4JJavaError: An error occurred while calling o470.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 47.0 failed 1 times, most recent failure: Lost task 0.0 in stage 47.0 (TID 238) (192.168.150.128 executor driver): org.apache.spark.SparkException: 
Bad data in pyspark.daemon's standard output. Invalid port number:
  458961458 (0x1b5b3232)
Python command to execute the daemon was:
  ipython3 -m pyspark.daemon
Check that you don't have any unexpected modules or libraries in
your PYTHONPATH:
  /home/ahowe42/spark-3.1.1-bin-hadoop2.7/python/lib/pyspark.zip:/home/ahowe42/spark-3.1.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip:/home/ahowe42/spark-3.1.1-bin-hadoop2.7/jars/spark-core_2.12-3.1.1.jar:/home/ahowe42/anaconda3/bin
Also, check if you have a sitecustomize.py module in your python path,
or in your python installation, that is printing to standard output
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:238)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:132)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:105)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:70)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.GeneratedMethodAccessor92.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: 
Bad data in pyspark.daemon's standard output. Invalid port number:
  458961458 (0x1b5b3232)
Python command to execute the daemon was:
  ipython3 -m pyspark.daemon
Check that you don't have any unexpected modules or libraries in
your PYTHONPATH:
  /home/ahowe42/spark-3.1.1-bin-hadoop2.7/python/lib/pyspark.zip:/home/ahowe42/spark-3.1.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip:/home/ahowe42/spark-3.1.1-bin-hadoop2.7/jars/spark-core_2.12-3.1.1.jar:/home/ahowe42/anaconda3/bin
Also, check if you have a sitecustomize.py module in your python path,
or in your python installation, that is printing to standard output
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:238)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:132)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:105)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:70)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [54]:
sc.stop()