# Performance comparison between Spark and other systems

*Systems used for comparisons:*

1.   PySpark dataframes
1.   PySpark RDD
1.   Pandas
1.   Polars

*Data link:*https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2/data

*Perfomance measured for these operations:*
1. read file
1. filter by arrested
1. group by district
1. count all criminal arrested by district
1. order by

*Authors*:
- Pablo Nieto Rodríguez
- Pablo Fontádez
- Christian Berdejo Sánchez




In [55]:

path_file="./data/Crimes_-_2001_to_Present.csv" #sustituir por fichero

In [7]:
import time # For measuring time

## CSV Performance comparison

### 1. PySpark dataframes

In [3]:
#Install dependencies
from pyspark.sql import SparkSession, functions
from pyspark.sql.functions import to_timestamp

In [6]:
#Create Spark Session
spark_session = SparkSession \
        .builder \
        .getOrCreate()

#Log Error
spark_session.sparkContext.setLogLevel("ERROR")

Reading

In [8]:
initial_time = time.time()
data_frame = spark_session \
      .read \
      .options(header='true', inferschema='true') \
      .option("delimiter", ",") \
      .option("timestampFormat", "yyyy-MM-dd") \
      .csv(path_file) \
      .persist()

dataframe_reading_time = time.time() - initial_time
print("Reading time: ", str(dataframe_reading_time))


Reading time:  19.636226892471313


 Filter

In [9]:
initial_time = time.time()
filtered_df  = data_frame.filter("Arrest = true")
dataframe_filter_time = time.time() - initial_time
print("Filter time: ", str(dataframe_filter_time))


Filter time:  0.07268404960632324


Group by

In [10]:
start_time = time.time()
grouped_df = filtered_df.groupby("District")
dataframe_time_group_by = time.time() - start_time

print("Group by time: ", str(dataframe_time_group_by))

Group by time:  0.03395581245422363


Count

In [11]:
start_time = time.time()
counted_df = grouped_df.count()

dataframe_count_time = time.time() - start_time
print("Count time: ", str(dataframe_count_time))

Count time:  0.04213404655456543


Sort

In [12]:
start_time = time.time()
sorted_df = counted_df.orderBy("count", ascending=False)
dataframe_time_sort = time.time() - start_time
print("Sort time: ", str(dataframe_time_sort))

Sort time:  0.03127288818359375


###  2. PySpark RDD

In [52]:
#Install dependencies
from pyspark import SparkConf, SparkContext


In [64]:
# Crear SparkSession
spark_session = SparkSession.builder \
    .appName("AddNumbers") \
    .getOrCreate()

# Obtener SparkContext desde SparkSession
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")


Reading

In [65]:
initial_time = time.time()

rdd = spark_context.textFile(path_file)


rdd_reading_time = time.time() - initial_time
print("Reading time: ", str(rdd_reading_time))


Reading time:  0.028599977493286133


In [66]:
header = rdd.first()
columns = header.split(",")

try:
    arrest_idx = columns.index("Arrest")      # Ajusta al nombre exacto de tu columna
    district_idx = columns.index("District") # Ajusta al nombre exacto de tu columna
except ValueError:
    print("Verifica que tus columnas 'Arrest' y 'District' existan en el header.")
    raise

In [67]:
rdd_no_header = rdd.filter(lambda line: line != header)
rdd_parsed = rdd_no_header.map(lambda line: line.split(","))

Filter

In [72]:
initial_time = time.time()
rdd_filtered = rdd_parsed.filter(
    lambda row: len(row) > max(arrest_idx, district_idx) and row[arrest_idx].lower() == "true"
)
rdd_filter_time = time.time() - initial_time
print("Filter time: ", str(rdd_filter_time))


Filter time:  0.0


Group by

In [69]:
start_time = time.time()
rdd_mapped = rdd_filtered.map(lambda row: (row[district_idx], 1))

rdd_time_group_by = time.time() - start_time

print("Group by time: ", str(rdd_time_group_by))

Group by time:  0.0


Count

In [70]:
start_time = time.time()
rdd_counted = rdd_mapped.reduceByKey(lambda a, b: a + b)
rdd_count_time = time.time() - start_time
print("Count time: ", str(rdd_count_time))

Count time:  0.02471184730529785


Sort

In [71]:
start_time = time.time()
rdd_sorted = rdd_counted.sortBy(lambda x: x[1], ascending=False)
rdd_time_sort = time.time() - start_time
print("Sort time: ", str(rdd_time_sort))

[('011', 211323), ('015', 130453), ('007', 125371), ('025', 118957), ('006', 116946)]
Sort time:  148.438649892807


### 3. Pandas


In [27]:

import pandas as pd
import time

start_time = time.time()
df = pd.read_csv(path_file, delimiter=",", low_memory=False)
pd_read_time = time.time() - start_time
print(f"Tiempo de lectura: {pd_read_time:.6f} segundos")


Tiempo de lectura: 86.904929 segundos


In [28]:
start_time = time.time()
filtered_df = df[df["Arrest"] == True]
pd_filter_time = time.time() - start_time
print(f"Tiempo de filtrado: {pd_filter_time:.6f} segundos")

Tiempo de filtrado: 0.968046 segundos


In [29]:
start_time = time.time()
grouped_df = filtered_df.groupby("District").size()
pd_group_time = time.time() - start_time
print(f"Tiempo de agrupación: {pd_group_time:.6f} segundos")

Tiempo de agrupación: 0.077922 segundos


In [30]:
start_time = time.time()
counted_df = grouped_df.reset_index(name="count")
pd_count_time = time.time() - start_time
print(f"Tiempo de conteo: {pd_count_time:.6f} segundos")

Tiempo de conteo: 0.058875 segundos


In [31]:
start_time = time.time()
sorted_df = counted_df.sort_values(by="count", ascending=False)
pd_sort_time = time.time() - start_time
print(f"Tiempo de ordenamiento: {pd_sort_time:.6f} segundos")

Tiempo de ordenamiento: 0.006889 segundos


### 4. Polars

In [32]:
import polars as pl
import time


Reading

In [33]:
start_computing_time = time.time()
df=pl.read_csv(path_file)
pl_reading_time= time.time()-start_computing_time
print("Reading time: ",pl_reading_time)

Reading time:  17.826291799545288


Filter

In [34]:
start_computing_time = time.time()
filter_df= df.filter(pl.col("Arrest") == True)
pl_filter_time= time.time()-start_computing_time
print("Filter time: ",pl_filter_time)

Filter time:  2.0957674980163574


Group by

In [35]:
start_computing_time = time.time()
grouped_df=filter_df.group_by("District")
pl_group_time= time.time()-start_computing_time
print("Group by time: ",pl_group_time)

Group by time:  0.003191709518432617


In [36]:
start_computing_time = time.time()
count_df=grouped_df.agg(pl.len().alias("count"))
pl_count_time= time.time()-start_computing_time
print("Group by time: ",pl_count_time)

Group by time:  0.07523155212402344


Sort

In [37]:
start_computing_time = time.time()
sort_df = count_df.sort("count", descending=True)
pl_sort_time= time.time()-start_computing_time
print("Sort by time: ",pl_sort_time)

Sort by time:  0.006716489791870117


Total time

In [38]:
pl_total_time=pl_reading_time+pl_filter_time+pl_group_time+pl_sort_time+pl_count_time
print("Total time: ",pl_total_time)

Total time:  20.00719904899597


## parquet Perfomance comparison

### Transform from CSV to parquet

In [39]:
import pandas as pd

In [41]:
data_csv = pd.read_csv(path_file)
data_csv.to_parquet('data_parquet', engine='pyarrow')

In [90]:
path_parquet_file="./data_parquet"


### 1. PySpark dataframes

In [4]:
#Install dependencies
from pyspark.sql import SparkSession, functions
from pyspark.sql.functions import to_timestamp

In [13]:
#Create Spark Session
spark_session = SparkSession \
        .builder \
        .getOrCreate()

#Log Error
spark_session.sparkContext.setLogLevel("ERROR")

Reading

In [14]:
initial_time = time.time()
data_frame = spark_session \
      .read \
      .options(header='true', inferschema='true') \
      .option("delimiter", ",") \
      .option("timestampFormat", "yyyy-MM-dd") \
      .parquet(path_parquet_file) \
      .persist()

dataframe_parquet_reading_time = time.time() - initial_time
print("Reading time: ", str(dataframe_parquet_reading_time))


Reading time:  4.546781778335571


 Filter

In [15]:
initial_time = time.time()
filtered_df  = data_frame.filter("Arrest = true")
dataframe_parquet_filter_time = time.time() - initial_time
print("Filter time: ", str(dataframe_parquet_filter_time))


Filter time:  0.12954926490783691


Group by

In [16]:
start_time = time.time()
grouped_df = filtered_df.groupby("District")
dataframe_parquet_time_group_by = time.time() - start_time

print("Group by time: ", str(dataframe_parquet_time_group_by))

Group by time:  0.06593108177185059


Count

In [17]:
start_time = time.time()
counted_df = grouped_df.count()

dataframe_parquet_count = time.time() - start_time
print("Count time: ", str(dataframe_parquet_count))

Count time:  0.06302261352539062


Sort

In [18]:
start_time = time.time()
sorted_df = counted_df.orderBy("count", ascending=False)
dataframe_parquet_time_sort = time.time() - start_time
print("Sort time: ", str(dataframe_parquet_time_sort))


Sort time:  0.04057002067565918


###  2. PySpark RDD

In [91]:
#Install dependencies
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession



In [92]:
# Crear SparkSession
spark_session = SparkSession.builder \
    .appName("AddNumbers") \
    .getOrCreate()

# Obtener SparkContext desde SparkSession
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")


Reading

Spark Context can't read a parquet file. So we read it using spark session and we parse it to RDD

In [93]:
initial_time = time.time()
spark_df = spark_session.read.parquet(path_parquet_file)
rdd = spark_df.rdd
rdd_parquet_reading_time = time.time() - initial_time
print("Reading time: ", str(rdd_parquet_reading_time))

Reading time:  0.17148876190185547


Each row of the rdd is a list with the info

Filter

In [94]:
initial_time = time.time()
rdd_filtered = rdd.filter(lambda row: row["Arrest"] == True)

rdd_parquet_filter_time = time.time() - initial_time
print("Filter time: ", str(rdd_parquet_filter_time))


Filter time:  0.00099945068359375


Group by

In [95]:
start_time = time.time()
rdd_mapped = rdd_filtered.map(lambda row: (row["District"], 1))

rdd_parquet_time_group_by = time.time() - start_time

print("Group by time:", str(rdd_parquet_time_group_by))

Group by time: 0.0


Count

In [106]:
start_time = time.time()
rdd_counted = rdd_mapped.reduceByKey(lambda a, b: a + b)
rdd_parquet_count = time.time() - start_time
print("Count time: ", str(rdd_parquet_count))
print(rdd_counted.take(5))


Count time:  0.03737497329711914


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 79.0 failed 1 times, most recent failure: Lost task 6.0 in stage 79.0 (TID 673) (host.docker.internal executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:128)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2258)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2298)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:179)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor183.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:128)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 20 more


Sort

In [105]:
start_time = time.time()
rdd_counted_no_null = rdd_counted.filter(lambda x: x[1] is not None)
rdd_sorted = rdd_counted_no_null.sortBy(lambda x: x[1], ascending=False)
rdd_parquet_time_sort = time.time() - start_time
print("Sort time: ", str(rdd_parquet_time_sort))


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 77.0 failed 1 times, most recent failure: Lost task 4.0 in stage 77.0 (TID 663) (host.docker.internal executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:128)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2258)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2298)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2323)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1022)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1021)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:76)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:128)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 20 more


### 3. Pandas

In [None]:
# Lectura del archivo Parquet
start_time = time.time()
df = pd.read_parquet("data_parquet", engine="pyarrow")
pdParquet_read_time = time.time() - start_time
print("Tiempo de lectura (Pandas Parquet):", pdParquet_read_time)

In [None]:
# Filtrado: seleccionar registros donde 'Arrest' es True
start_time = time.time()
df_filtrado = df[df["Arrest"] == True]
pdParquet_filter_time = time.time() - start_time
print("Tiempo de filtrado (Pandas Parquet):", pdParquet_filter_time)

NameError: name 'time' is not defined

In [None]:
# Agrupación: agrupar por 'District' y contar registros en cada grupo
start_time = time.time()
# Aquí se realiza la agrupación y se crea una nueva columna 'count'
df_agrupado = df_filtrado.groupby("District").size().reset_index(name="count")
pdParquet_group_time = time.time() - start_time
print("Tiempo de agrupación (Pandas Parquet):", pdParquet_group_time)

In [None]:
# Conteo: también se puede medir el tiempo de obtener el conteo de cada grupo
start_time = time.time()
df_count = df_filtrado.groupby("District")["Arrest"].count()
pdParquet_count_time = time.time() - start_time
print("Tiempo de conteo (Pandas Parquet):", pdParquet_count_time)

In [None]:

# Ordenamiento: ordenar los grupos por la columna 'count' de forma descendente
start_time = time.time()
df_sorted = df_agrupado.sort_values(by="count", ascending=False)
pdParquet_sort_time = time.time() - start_time
print("Tiempo de ordenamiento (Pandas Parquet):", pdParquet_sort_time)

### 4. Polars

In [None]:
import polars as pl
import time

Reading

In [None]:
start_computing_time = time.time()
df=pl.read_parquet(path_parquet_file)
pl_parquet_reading_time= time.time()-start_computing_time
print("Reading time: ",pl_parquet_reading_time)

Filter

In [None]:
start_computing_time = time.time()
filter_df= df.filter(pl.col("Arrest") == True)
pl_parquet_filter_time= time.time()-start_computing_time
print("Filter time: ",pl_parquet_filter_time)

Group by

In [None]:
start_computing_time = time.time()
grouped_df=filter_df.group_by("District")
pl_parquet_group_time= time.time()-start_computing_time
print("Group by time: ",pl_parquet_group_time)

Count

In [None]:
start_computing_time = time.time()
count_df=grouped_df.agg(pl.len().alias("count"))
pl_parquet_count_time= time.time()-start_computing_time
print("Group by time: ",pl_parquet_count_time)

Sort

In [None]:
start_computing_time = time.time()
sort_df = count_df.sort("count", descending=True)
pl_parquet_sort_time= time.time()-start_computing_time
print("Sort by time: ",pl_parquet_sort_time)

Total Time

In [None]:
pl_total_time=pl_parquet_reading_time+pl_parquet_filter_time+pl_parquet_group_time+pl_parquet_sort_time+pl_parquet_count_time
print("Total time: ",pl_total_time)

## Time Comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np

Stacked bar visualization

In [None]:
import pandas as pd
import numpy as np

colors = ["red", "steelblue", "green", "purple", "orange", "brown", "magenta", "cyan"]

data = {
    "Method": [
        "RDD Parquet",
        "RDD CSV",
        "SparkDF CSV",
        "Spark DF Parquet",
        "Polars CSV",
        "Polars Parquet",
        "Pandas CSV",
        "Pandas Parquet"
    ],
    "Reading": [
        rdd_parquet_reading_time,
        rdd_reading_time,
        dataframe_reading_time,
        dataframe_parquet_reading_time,
        pl_reading_time,
        pl_parquet_reading_time,
        pd_read_time,
        pdParquet_read_time
    ],
    "Filtering": [
        rdd_parquet_filter_time,
        rdd_filter_time,
        dataframe_filter_time,
        dataframe_parquet_filter_time,
        pl_filter_time,
        pl_parquet_filter_time,
        pd_filter_time,
        pdParquet_filter_time
    ],
    "Grouping": [
        rdd_parquet_time_group_by,
        rdd_time_group_by,
        dataframe_time_group_by,
        dataframe_parquet_time_group_by,
        pl_group_time,
        pl_parquet_group_time,
        pd_group_time,
        pdParquet_group_time
    ],
    "Counting": [
        rdd_parquet_count,
        rdd_count_time,
        dataframe_count_time,
        dataframe_parquet_count,
        pl_count_time,
        pl_parquet_count_time,
        pd_count_time,
        pdParquet_count_time
    ],
    "Sorting": [
        rdd_parquet_time_sort,
        rdd_time_sort,
        dataframe_time_sort,
        dataframe_parquet_time_sort,
        pl_sort_time,
        pl_parquet_sort_time,
        pd_sort_time,
        pdParquet_sort_time
    ]
}

# Convertir el diccionario a un DataFrame de Pandas
df_tiempos = pd.DataFrame(data)

# Vector de ceros para, por ejemplo, usarse en gráficos apilados
bottom = np.zeros(len(df_tiempos["Method"]))

print(df_tiempos)


In [None]:
#Mostrar datos
df

In [None]:

# Crear gráfico de barras apiladas
fig, ax = plt.subplots(figsize=(8, 6))
# Graficar cada operación como una capa apilada
for i, operation in enumerate(["Reading", "Filtering", "Grouping", "Counting", "Sorting"]):
    ax.bar(df["Method"], df[operation], bottom=bottom, label=operation, color=colors[i])
    bottom += df[operation]

# Agregar etiquetas
ax.set_title("Performance Comparison ")
ax.set_ylabel("Total Time (s)")
ax.legend(title="Operations", loc="upper right")

# Rotar etiquetas en el eje X para mejor visualización
plt.xticks(rotation=15)

# Mostrar gráfico
plt.show()