## Libraries and UDFs

In [59]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
import re

spark = SparkSession \
        .builder \
        .appName("frequent_itemsets") \
        .getOrCreate()

In [None]:
def clean_tokens(tokens):
    if not tokens:
        return []
    valid_word_pattern = re.compile(r"^[a-zA-ZáéíóúñÁÉÍÓÚÑüÜ]+$")
    return [token for token in tokens if valid_word_pattern.match(token)]
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))

## Preprocessing

In [2]:
path = r"../data/interventions_sample.csv"
interventions_sample = spark.read.csv(path, header=True)
int_df = interventions_sample.select("session_id", "intervention_id", 
                                     "intervention_text", "intervention_words")

In [3]:
path = r"../data/sample_clusters.csv" 
df = spark.read.csv(path, header=True)

In [4]:
int_df = int_df.join(df.select("intervention_id", "cluster"), on="intervention_id", how="inner")
int_df.show()

                                                                                

+---------------+--------------+--------------------+--------------------+-------+
|intervention_id|    session_id|   intervention_text|  intervention_words|cluster|
+---------------+--------------+--------------------+--------------------+-------+
|         451824|gaceta_459 (7)|de acuerdo con el...|['acciones', 'cuá...|      0|
|         451621|    gaceta_852|albán urbano luis...|['carlos', 'casti...|      2|
|         451599|    gaceta_852|presidente el sig...|['artículos', 'av...|      4|
|         451438| gaceta_68 (7)|perdone un segund...|['comisión', 'deb...|      1|
|         451192|gaceta_456 (6)|antes de empezar ...|['asistir', 'comp...|      1|
|         451071|   gaceta_1536|celular 318862010...|['aceta', 'celula...|      3|
|         450768|gaceta_633 (1)|un segundo ya nos...|['activamos', 'al...|      0|
|         450662|gaceta_390 (6)|doctora lo que pa...|['afectados', 'ci...|      0|
|         450234| gaceta_49 (9)|continúa el señor...|['anglicana', 'ap...|      1|
|   

In [5]:
int_df.printSchema()

root
 |-- intervention_id: string (nullable = true)
 |-- session_id: string (nullable = true)
 |-- intervention_text: string (nullable = true)
 |-- intervention_words: string (nullable = true)
 |-- cluster: string (nullable = true)



In [6]:
esp_stopwords = [
    "a", "al", "algo", "algunas", "algunos", "ante", "antes", "aquel", "aquella",
    "aquellas", "aquellos", "aquí", "cada", "casi", "como", "con", "contra",
    "cual", "cuales", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "de",
    "del", "dentro", "donde", "dos", "el", "él", "ella", "ellas", "ellos", "en",
    "entre", "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este",
    "estos", "lo", "los", "la", "las", "me", "mi", "mí", "mis", "nos", "nosotras",
    "nosotros", "o", "otra", "otras", "otro", "otros", "para", "pero", "poco",
    "por", "que", "qué", "se", "sí", "sin", "sobre", "su", "sus", "tu", "tú",
    "tus", "un", "una", "unas", "uno", "unos", "vosotras", "vosotros", "vuestra",
    "vuestras", "vuestro", "vuestros", "y", "ya"
]

In [7]:
#tokenize to create word "baskets"
tokenizer = Tokenizer(inputCol="intervention_text", outputCol="tokens")

stopwords_remover = StopWordsRemover(inputCol="tokens",
                                     outputCol="filtered_tokens",
                                     stopWords=esp_stopwords)
pipeline = Pipeline(stages=[tokenizer, stopwords_remover])
pipeline_model = pipeline.fit(int_df)

result_df = pipeline_model.transform(int_df)
result_df.select('tokens', 'filtered_tokens').show(10)

                                                                                

+--------------------+--------------------+
|              tokens|     filtered_tokens|
+--------------------+--------------------+
|[de, acuerdo, con...|[acuerdo, observa...|
|[albán, urbano, l...|[albán, urbano, l...|
|[presidente, el, ...|[presidente, sigu...|
|[perdone, un, seg...|[perdone, segundi...|
|[antes, de, empez...|[empezar, orden, ...|
|[celular, 3188620...|[celular, 3188620...|
|[un, segundo, ya,...|[segundo, traslad...|
|[doctora, lo, que...|[doctora, pasa, e...|
|[continúa, el, se...|[continúa, señor,...|
|[señor, president...|[señor, president...|
+--------------------+--------------------+
only showing top 10 rows



In [8]:
cluster_df_dict = {}
cluster_labels = [row[0] for row in result_df.select("cluster").distinct().collect()]

for label in cluster_labels:
    cluster_df = result_df.filter(result_df['cluster'] == label)
    cluster_df = cluster_df.filter((F.col('filtered_tokens').isNotNull()))
    cluster_df_dict[label] = cluster_df

                                                                                

## FP Growth

In [58]:
test_df = result_df.filter(result_df['cluster'] == '1')
test_df.show(3)



+---------------+--------------+--------------------+--------------------+-------+--------------------+--------------------+
|intervention_id|    session_id|   intervention_text|  intervention_words|cluster|              tokens|     filtered_tokens|
+---------------+--------------+--------------------+--------------------+-------+--------------------+--------------------+
|         451438| gaceta_68 (7)|perdone un segund...|['comisión', 'deb...|      1|[perdone, un, seg...|[perdone, segundi...|
|         451192|gaceta_456 (6)|antes de empezar ...|['asistir', 'comp...|      1|[antes, de, empez...|[empezar, orden, ...|
|         450234| gaceta_49 (9)|continúa el señor...|['anglicana', 'ap...|      1|[continúa, el, se...|[continúa, señor,...|
+---------------+--------------+--------------------+--------------------+-------+--------------------+--------------------+
only showing top 3 rows



                                                                                

In [64]:
cleaned_df = test_df.withColumn("filtered_tokens", clean_tokens_udf(F.col("filtered_tokens")))
cleaned_df = cleaned_df.filter(F.size(F.col("filtered_tokens")) > 0)

cleaned_df.show()

24/12/05 22:47:29 ERROR PythonUDFRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/software/spark-3.3.2-el8-x86_64/python/lib/pyspark.zip/pyspark/worker.py", line 666, in main
    eval_type = read_int(infile)
  File "/software/spark-3.3.2-el8-x86_64/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.

Py4JJavaError: An error occurred while calling o3505.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 119.0 failed 1 times, most recent failure: Lost task 3.0 in stage 119.0 (TID 922) (midway3-0096.rcc.local executor driver): org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$3927/0x0000000801b0bb38: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:190)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$10(EvalPythonExec.scala:127)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1161)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1214)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:431)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:265)
Caused by: java.lang.NullPointerException

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$3927/0x0000000801b0bb38: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:190)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$10(EvalPythonExec.scala:127)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1161)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1214)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:431)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:265)
Caused by: java.lang.NullPointerException


In [63]:
cleaned_df.printSchema()
cleaned_df.show()

root
 |-- intervention_id: string (nullable = true)
 |-- session_id: string (nullable = true)
 |-- intervention_text: string (nullable = true)
 |-- intervention_words: string (nullable = true)
 |-- cluster: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)





+---------------+--------------+--------------------+--------------------+-------+--------------------+--------------------+
|intervention_id|    session_id|   intervention_text|  intervention_words|cluster|              tokens|     filtered_tokens|
+---------------+--------------+--------------------+--------------------+-------+--------------------+--------------------+
|         451438| gaceta_68 (7)|perdone un segund...|['comisión', 'deb...|      1|[perdone, un, seg...|[perdone, segundi...|
|         451192|gaceta_456 (6)|antes de empezar ...|['asistir', 'comp...|      1|[antes, de, empez...|[empezar, orden, ...|
|         450234| gaceta_49 (9)|continúa el señor...|['anglicana', 'ap...|      1|[continúa, el, se...|[continúa, señor,...|
|         450076|gaceta_134 (7)|señor presidente,...|['después', 'día'...|      1|[señor, president...|[señor, es, veo, ...|
|         449987|    gaceta_634|austed senador al...|['alirio', 'aquí'...|      1|[austed, senador,...|[austed, senador,...|


                                                                                