In [9]:
# Point findspark at the Spark installation under /usr/local/spark.
# This runs before `pyspark` is imported in a later cell.
import findspark
findspark.init('/usr/local/spark')

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [12]:
import pyspark
import random
# Monte Carlo estimate of pi: draw random points in the unit square and count
# the fraction that fall inside the quarter circle of radius 1.
num_samples = 100000000


def inside(p):
    """Return True when a freshly drawn random point lies inside the quarter circle.

    `p` is the RDD element being filtered; it is ignored because every call
    draws its own (x, y) point.
    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1


sc = pyspark.SparkContext(appName="Pi")
try:
    count = sc.parallelize(range(num_samples)).filter(inside).count()
finally:
    # Always release the context, even when the job fails: a context that is
    # left running makes the next SparkContext(...) call in this kernel fail.
    sc.stop()

pi = 4 * count / num_samples
print(pi)

3.1414656


In [19]:
# Start a local context using all available cores (local[*]), show its
# default parallelism, then shut the context down again.
sc = pyspark.SparkContext(master = 'local[*]')
sc.defaultParallelism
sc.stop()

16

In [20]:
# Plain-Python map: cube every element of the list (the next cell does the same with Spark).
list(map(lambda x: x ** 3, [1, 2, 3, 4, 5]))

[1, 8, 27, 64, 125]

In [37]:
# Cube every element of the list with Spark's map on a local context.
sc = pyspark.SparkContext(master = 'local[*]')
try:
    x = sc.parallelize([1, 2, 3, 4, 5]).map(lambda v: v ** 3).collect()
finally:
    # Stop the context even if the job raises, so a later cell can create a new one.
    sc.stop()
x

[1, 8, 27, 64, 125]

In [15]:
# Same check as earlier in the notebook: start a local[*] context, show its
# default parallelism, and stop the context.
sc = pyspark.SparkContext(master = 'local[*]')
sc.defaultParallelism
sc.stop()

16

# Featuretools

In [66]:
import featuretools as ft
import featuretools.variable_types as vtypes 

# Load the previously saved feature definitions from disk and report how many there are.
feature_defs = ft.load_features('/data/churn/features.txt')
print(f'There are {len(feature_defs)} features.')

There are 230 features.


In [72]:
import pandas as pd
import numpy as np

# Total number of data partitions; partition_to_feature_matrix uses it to scale its progress report.
N_PARTITIONS = 1000

In [143]:
def partition_to_feature_matrix(partition, feature_defs=feature_defs):
    """Compute the feature matrix for one partition and write it to disk.

    Reads members.csv, transactions.csv, logs.csv and cutoff_times.csv from
    /data/churn/partitions/p<partition>, builds a featuretools EntitySet with
    `members` as the parent of `transactions` and `logs`, calculates
    `feature_defs` at the cutoff times, and saves the result as
    feature_matrix.csv in the same directory.

    Parameters
    ----------
    partition : int
        Partition number; selects the directory p<partition>.
    feature_defs : list
        Feature definitions to calculate. Defaults to the module-level
        `feature_defs` as it was when this function was defined.

    Returns
    -------
    None
        The feature matrix is written to disk rather than returned.
    """
    directory = '/data/churn/partitions/p' + str(partition)

    # Read in the data files
    members = pd.read_csv(f'{directory}/members.csv',
                          parse_dates=['registration_init_time'],
                          infer_datetime_format = True,
                          dtype = {'gender': 'category'})

    trans = pd.read_csv(f'{directory}/transactions.csv',
                        parse_dates=['transaction_date', 'membership_expire_date'],
                        infer_datetime_format = True)

    logs = pd.read_csv(f'{directory}/logs.csv', parse_dates = ['date'])
    cutoff_times = pd.read_csv(f'{directory}/cutoff_times.csv', parse_dates = ['cutoff'])
    cutoff_times = cutoff_times.drop_duplicates()

    # Create empty entityset
    es = ft.EntitySet(id = 'customers')

    # Add the members parent table
    es.entity_from_dataframe(entity_id='members', dataframe=members,
                             index = 'msno', time_index = 'registration_init_time',
                             variable_types = {'city': vtypes.Categorical, 'bd': vtypes.Categorical,
                                               'registered_via': vtypes.Categorical})

    # Create new features in transactions
    trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']
    trans['planned_daily_price'] = trans['plan_list_price'] / trans['payment_plan_days']
    trans['daily_price'] = trans['actual_amount_paid'] / trans['payment_plan_days']

    # Add the transactions child table (no natural key, so an index column is created)
    es.entity_from_dataframe(entity_id='transactions', dataframe=trans,
                             index = 'transactions_index', make_index = True,
                             time_index = 'transaction_date',
                             variable_types = {'payment_method_id': vtypes.Categorical,
                                               'is_auto_renew': vtypes.Boolean, 'is_cancel': vtypes.Boolean})

    # Add transactions interesting values
    es['transactions']['is_cancel'].interesting_values = [0, 1]
    es['transactions']['is_auto_renew'].interesting_values = [0, 1]

    # Create new features in logs
    logs['total'] = logs[['num_25', 'num_50', 'num_75', 'num_985', 'num_100']].sum(axis = 1)
    logs['percent_100'] = logs['num_100'] / logs['total']
    logs['percent_unique'] = logs['num_unq'] / logs['total']

    # Add the logs child table (index column is created here as well)
    es.entity_from_dataframe(entity_id='logs', dataframe=logs,
                             index = 'logs_index', make_index = True,
                             time_index = 'date')

    # Add the relationships: members is the parent of both child tables via msno
    r_member_transactions = ft.Relationship(es['members']['msno'], es['transactions']['msno'])
    r_member_logs = ft.Relationship(es['members']['msno'], es['logs']['msno'])
    es.add_relationships([r_member_transactions, r_member_logs])

    # Calculate and save the feature matrix
    feature_matrix = ft.calculate_feature_matrix(entityset=es, features=feature_defs, cutoff_time=cutoff_times)

    feature_matrix.to_csv(f'{directory}/feature_matrix.csv')

    # Report progress every 10th of the number of partitions. The percentage is
    # scaled before rounding; rounding the ratio first only ever yields 0 or 100.
    # An integer step keeps the check exact when N_PARTITIONS is not a multiple of 10.
    progress_step = max(N_PARTITIONS // 10, 1)
    if partition % progress_step == 0:
        print(f'{round(100 * partition / N_PARTITIONS)}% complete.', end = '\r')

In [144]:
import os
base_dir = '/data/churn/partitions/'
# Partition directories are named p<number> (the layout that
# partition_to_feature_matrix reads from). Collect the numbers that actually
# exist rather than counting directory entries, so stray files or gaps in the
# numbering cannot produce partition ids that have no directory behind them.
partitions = sorted(
    int(name[1:])
    for name in os.listdir(base_dir)
    if name.startswith('p') and name[1:].isdecimal()
    and os.path.isdir(os.path.join(base_dir, name))
)
partitions[-1]

999

In [145]:
# Stop the context bound to `sc` before a new SparkContext is created in a later cell.
sc.stop()

In [114]:
# Enable Spark event logging and write the logs to /usr/local/spark/tmp;
# the log files are read back with pandas later in the notebook.
# The set() calls are chained inside the assignment so the cell does not echo
# the SparkConf repr once per call (a trailing ';' does not suppress it while
# ast_node_interactivity is 'all').
conf = (
    pyspark.SparkConf()
    .set('spark.eventLog.enabled', True)
    .set('spark.eventLog.dir', '/usr/local/spark/tmp')
)
conf.getAll()

<pyspark.conf.SparkConf at 0x7f90e9364d30>

<pyspark.conf.SparkConf at 0x7f90e9364d30>

[('spark.eventLog.dir', '/usr/local/spark/tmp'),
 ('spark.eventLog.enabled', 'True'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

In [119]:
# Run the cube example against the standalone cluster master with event logging
# enabled (via `conf`), using a single slice.
sc = pyspark.SparkContext(master = 'spark://ip-172-31-23-133.ec2.internal:7077',
                          conf = conf,
                          appName='Cubed')
try:
    cubed = sc.parallelize([1, 2, 3, 4, 5], numSlices=1).map(lambda x: x ** 3).collect()
finally:
    # Stop the context even if the job raises, so a later cell can create a new one.
    sc.stop()
cubed

[1, 8, 27, 64, 125]

In [142]:
# Load the Spark event log of one application run; lines=True parses the file
# as one JSON record per line, giving one row per logged event.
df = pd.read_json('/usr/local/spark/tmp/app-20180917145714-0007', lines = True)
df.head()

Unnamed: 0,App ID,App Name,Block Manager ID,Classpath Entries,Completion Time,Event,Executor ID,Executor Info,JVM Information,Job ID,Job Result,Maximum Memory,Maximum Offheap Memory,Maximum Onheap Memory,Properties,Spark Properties,Spark Version,Stage Attempt ID,Stage ID,Stage IDs,Stage Info,Stage Infos,Submission Time,System Properties,Task End Reason,Task Info,Task Metrics,Task Type,Timestamp,User
0,,,,,,SparkListenerLogStart,,,,,,,,,,,2.3.1,,,,,,,,,,,,NaT,
1,,,"{'Executor ID': 'driver', 'Host': 'ip-172-31-2...",,,SparkListenerBlockManagerAdded,,,,,,411775795.0,0.0,411775795.0,,,,,,,,,,,,,,,2018-09-17 14:57:14.450,
2,,,,{'/usr/local/spark/jars/commons-lang-2.6.jar':...,,SparkListenerEnvironmentUpdate,,,{'Java Home': '/home/ubuntu/anaconda3/envs/pyt...,,,,,,,{'spark.driver.host': 'ip-172-31-23-133.ec2.in...,,,,,,,,"{'java.io.tmpdir': '/tmp', 'line.separator': '...",,,,,NaT,
3,app-20180917145714-0007,Cubed,,,,SparkListenerApplicationStart,,,,,,,,,,,,,,,,,,,,,,,2018-09-17 14:57:14.417,ubuntu
4,,,,,,SparkListenerJobStart,,,,0.0,,,,,"{'spark.rdd.scope.noOverride': 'true', 'callSi...",,,,,[0],,"[{'Stage ID': 0, 'Stage Attempt ID': 0, 'Stage...",1537196000000.0,,,,,,NaT,


## Testing 

In [146]:
from timeit import default_timer as timer

In [150]:
# Stop the context bound to `sc` before the timed run below creates a new one.
sc.stop()

In [151]:
# Time one full run: compute the feature matrix for every partition on a local
# context, with one Spark slice per partition.
start = timer()
sc = pyspark.SparkContext(master = 'local[*]',
                          appName = 'featuretools', conf = conf)
try:
    r = sc.parallelize(partitions, numSlices=N_PARTITIONS).map(partition_to_feature_matrix).collect()
finally:
    # Release the context and record the end time even when a task fails;
    # otherwise the context stays alive and `end` is never set.
    sc.stop()
    end = timer()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 0.0 failed 1 times, most recent failure: Lost task 9.0 in stage 0.0 (TID 9, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-143-4bd6d5ccd8af>", line 59, in partition_to_feature_matrix
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 258, in calculate_feature_matrix
    pass_columns=pass_columns)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 520, in linear_calculate_chunks
    backend=backend)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 342, in calculate_chunk
    training_window=window)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/utils.py", line 34, in wrapped
    r = method(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 316, in calc_results
    profile=profile)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/pandas_backend.py", line 196, in calculate_all_features
    result_frame = handler(group, input_frames)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/pandas_backend.py", line 442, in _calculate_agg_features
    to_merge.reset_index(1, drop=True, inplace=True)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 4101, in reset_index
    level = [self.index._get_level_number(lev) for lev in level]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 4101, in <listcomp>
    level = [self.index._get_level_number(lev) for lev in level]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 1961, in _get_level_number
    self._validate_index_level(level)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 1955, in _validate_index_level
    (level + 1))
IndexError: Too many levels: Index has only 1 level, not 2

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor98.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-143-4bd6d5ccd8af>", line 59, in partition_to_feature_matrix
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 258, in calculate_feature_matrix
    pass_columns=pass_columns)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 520, in linear_calculate_chunks
    backend=backend)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 342, in calculate_chunk
    training_window=window)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/utils.py", line 34, in wrapped
    r = method(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py", line 316, in calc_results
    profile=profile)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/pandas_backend.py", line 196, in calculate_all_features
    result_frame = handler(group, input_frames)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/featuretools/computational_backends/pandas_backend.py", line 442, in _calculate_agg_features
    to_merge.reset_index(1, drop=True, inplace=True)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 4101, in reset_index
    level = [self.index._get_level_number(lev) for lev in level]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 4101, in <listcomp>
    level = [self.index._get_level_number(lev) for lev in level]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 1961, in _get_level_number
    self._validate_index_level(level)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 1955, in _validate_index_level
    (level + 1))
IndexError: Too many levels: Index has only 1 level, not 2

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


# Read in Results

In [124]:
# Load the event log of the run to analyse; lines=True parses the file as one
# JSON record per line, giving one row per logged event.
df = pd.read_json('/usr/local/spark/tmp/app-20180917145854-0009', lines = True)
df.head()

Unnamed: 0,App ID,App Name,Block Manager ID,Classpath Entries,Completion Time,Event,Executor ID,Executor Info,JVM Information,Job ID,...,Stage Info,Stage Infos,Submission Time,System Properties,Task End Reason,Task Info,Task Metrics,Task Type,Timestamp,User
0,,,,,,SparkListenerLogStart,,,,,...,,,,,,,,,NaT,
1,,,"{'Executor ID': 'driver', 'Host': 'ip-172-31-2...",,,SparkListenerBlockManagerAdded,,,,,...,,,,,,,,,2018-09-17 14:58:54.536,
2,,,,{'/usr/local/spark/jars/commons-lang-2.6.jar':...,,SparkListenerEnvironmentUpdate,,,{'Java Home': '/home/ubuntu/anaconda3/envs/pyt...,,...,,,,"{'java.io.tmpdir': '/tmp', 'line.separator': '...",,,,,NaT,
3,app-20180917145854-0009,testing,,,,SparkListenerApplicationStart,,,,,...,,,,,,,,,2018-09-17 14:58:54.487,ubuntu
4,,,,,,SparkListenerJobStart,,,,0.0,...,,"[{'Stage ID': 0, 'Stage Attempt ID': 0, 'Stage...",1537196000000.0,,,,,,NaT,


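The next code blocks find the time taken to complete each task. First, the task ID is extracted from each event's `Task Info` record so that the start and end events of a task can be matched up; the launch and finish timestamps in the log are in milliseconds.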

In [130]:
def filter_task_info(task_dict):
    """Return the 'Task ID' entry of a `Task Info` record.

    Parameters
    ----------
    task_dict : dict or other
        A `Task Info` value from the event log. Values that have no `.get`
        method (for example the NaN stored on rows without task info) are
        treated as missing.

    Returns
    -------
    The value stored under 'Task ID' (None if the key is absent), or np.nan
    when `task_dict` is not dict-like.
    """
    try:
        return task_dict.get('Task ID')
    except AttributeError:
        # Only "this is not a dict" is expected here; anything else
        # (including KeyboardInterrupt) must not be swallowed.
        return np.nan
    
# Extract the task ID of every event row into its own column. The helper defined
# in this cell is `filter_task_info`; `filter_task_id` is not defined in the notebook.
df['Task ID'] = df['Task Info'].apply(filter_task_info)

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         0.0
8         1.0
9         2.0
10        3.0
11        4.0
12        5.0
13        6.0
14        7.0
15        8.0
16        9.0
17       10.0
18       11.0
19       12.0
20       13.0
21       14.0
22       15.0
23        NaN
24       16.0
25       12.0
26       17.0
27        4.0
28       18.0
29       10.0
        ...  
1981    978.0
1982    995.0
1983    981.0
1984    996.0
1985    980.0
1986    997.0
1987    979.0
1988    998.0
1989    983.0
1990    999.0
1991    985.0
1992    984.0
1993    982.0
1994    986.0
1995    987.0
1996    988.0
1997    989.0
1998    992.0
1999    990.0
2000    991.0
2001    993.0
2002    994.0
2003    995.0
2004    999.0
2005    997.0
2006    996.0
2007    998.0
2008      NaN
2009      NaN
2010      NaN
Name: Task ID, Length: 2011, dtype: float64

In [134]:
# Raise the column display limit so all columns of the event rows are shown.
pd.set_option('display.max_columns', 40)
# Select every event row that belongs to task 996.
df.loc[df['Task ID'] == 996]

Unnamed: 0,App ID,App Name,Block Manager ID,Classpath Entries,Completion Time,Event,Executor ID,Executor Info,JVM Information,Job ID,Job Result,Maximum Memory,Maximum Offheap Memory,Maximum Onheap Memory,Properties,Spark Properties,Spark Version,Stage Attempt ID,Stage ID,Stage IDs,Stage Info,Stage Infos,Submission Time,System Properties,Task End Reason,Task Info,Task Metrics,Task Type,Timestamp,User,Task ID
1984,,,,,,SparkListenerTaskStart,,,,,,,,,,,,0.0,0.0,,,,,,,"{'Task ID': 996, 'Index': 996, 'Attempt': 0, '...",,,NaT,,996.0
2006,,,,,,SparkListenerTaskEnd,,,,,,,,,,,,0.0,0.0,,,,,,{'Reason': 'Success'},"{'Task ID': 996, 'Index': 996, 'Attempt': 0, '...","{'Executor Deserialize Time': 0, 'Executor Des...",ResultTask,NaT,,996.0


In [135]:
# Full `Task Info` record of event row 1984.
df.loc[1984, 'Task Info']

{'Task ID': 996,
 'Index': 996,
 'Attempt': 0,
 'Launch Time': 1537196517065,
 'Executor ID': '0',
 'Host': '172.31.23.133',
 'Locality': 'PROCESS_LOCAL',
 'Speculative': False,
 'Getting Result Time': 0,
 'Finish Time': 0,
 'Failed': False,
 'Killed': False,
 'Accumulables': []}

In [138]:
# Duration of the task recorded in event row 2006: finish minus launch timestamp
# (the timestamps appear to be epoch milliseconds, so the result is in ms).
task_end_info = df.loc[2006, 'Task Info']
task_end_info['Finish Time'] - task_end_info['Launch Time']

2809

In [141]:
# Convert a launch timestamp, interpreted as epoch milliseconds, into a readable datetime.
pd.to_datetime(1537196517065, unit = 'ms')

Timestamp('2018-09-17 15:01:57.065000')