## Libraries and UDFs

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
import re

spark = SparkSession \
        .builder \
        .appName("frequent_itemsets") \
        .getOrCreate()

24/12/06 06:15:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
def clean_tokens(tokens):
    if not tokens:
        return []
    valid_word_pattern = re.compile(r"^[a-zA-ZáéíóúñÁÉÍÓÚÑüÜ]+$")
    return [token for token in tokens if valid_word_pattern.match(token)]
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))

def is_valid_string(token):
    return isinstance(token, str) and bool(re.match(r"^[a-zA-ZáéíóúñÁÉÍÓÚÑüÜ]+$", token))

## Preprocessing

In [3]:
path = r"../data/interventions_sample.csv"
interventions_sample = spark.read.csv(path, header=True)
int_df = interventions_sample.select("session_id", "intervention_id", 
                                     "intervention_text", "intervention_words")

In [4]:
path = r"../data/sample_clusters.csv" 
df = spark.read.csv(path, header=True)

In [5]:
int_df = int_df.join(df.select("intervention_id", "cluster"), on="intervention_id", how="inner")
int_df.show()

                                                                                

+---------------+--------------+--------------------+--------------------+-------+
|intervention_id|    session_id|   intervention_text|  intervention_words|cluster|
+---------------+--------------+--------------------+--------------------+-------+
|         451824|gaceta_459 (7)|de acuerdo con el...|['acciones', 'cuá...|      0|
|         451621|    gaceta_852|albán urbano luis...|['carlos', 'casti...|      2|
|         451599|    gaceta_852|presidente el sig...|['artículos', 'av...|      4|
|         451438| gaceta_68 (7)|perdone un segund...|['comisión', 'deb...|      1|
|         451192|gaceta_456 (6)|antes de empezar ...|['asistir', 'comp...|      1|
|         451071|   gaceta_1536|celular 318862010...|['aceta', 'celula...|      3|
|         450768|gaceta_633 (1)|un segundo ya nos...|['activamos', 'al...|      0|
|         450662|gaceta_390 (6)|doctora lo que pa...|['afectados', 'ci...|      0|
|         450234| gaceta_49 (9)|continúa el señor...|['anglicana', 'ap...|      1|
|   

In [6]:
int_df.printSchema()

root
 |-- intervention_id: string (nullable = true)
 |-- session_id: string (nullable = true)
 |-- intervention_text: string (nullable = true)
 |-- intervention_words: string (nullable = true)
 |-- cluster: string (nullable = true)



In [7]:
esp_stopwords = [
    "a", "al", "algo", "algunas", "algunos", "ante", "antes", "aquel", "aquella",
    "aquellas", "aquellos", "aquí", "cada", "casi", "como", "con", "contra",
    "cual", "cuales", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "de",
    "del", "dentro", "donde", "dos", "el", "él", "ella", "ellas", "ellos", "en",
    "entre", "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este",
    "estos", "lo", "los", "la", "las", "me", "mi", "mí", "mis", "nos", "nosotras",
    "nosotros", "o", "otra", "otras", "otro", "otros", "para", "pero", "poco",
    "por", "que", "qué", "se", "sí", "sin", "sobre", "su", "sus", "tu", "tú",
    "tus", "un", "una", "unas", "uno", "unos", "vosotras", "vosotros", "vuestra",
    "vuestras", "vuestro", "vuestros", "y", "ya", "senador", "presidente", "honorable",
    "secretario", "honorable", "cámara", "comisión", "representante", "gracias", "comisión",
    "palabra", "doctor", "tiene", "uso", "señor", "le", "usted", "doctora", "muchas", "es", "no",
    "vamos", "muy"
]

In [8]:
#tokenize to create word "baskets"
tokenizer = Tokenizer(inputCol="intervention_text", outputCol="tokens")

stopwords_remover = StopWordsRemover(inputCol="tokens",
                                     outputCol="filtered_tokens",
                                     stopWords=esp_stopwords)
pipeline = Pipeline(stages=[tokenizer, stopwords_remover])
pipeline_model = pipeline.fit(int_df)

result_df = pipeline_model.transform(int_df)
result_df.select('tokens', 'filtered_tokens').show(10)

                                                                                

+--------------------+--------------------+
|              tokens|     filtered_tokens|
+--------------------+--------------------+
|[de, acuerdo, con...|[acuerdo, observa...|
|[albán, urbano, l...|[albán, urbano, l...|
|[presidente, el, ...|[siguiente, bloqu...|
|[perdone, un, seg...|[perdone, segundi...|
|[antes, de, empez...|[empezar, orden, ...|
|[celular, 3188620...|[celular, 3188620...|
|[un, segundo, ya,...|[segundo, traslad...|
|[doctora, lo, que...|[pasa, discusione...|
|[continúa, el, se...|[continúa, marco,...|
|[señor, president...|[presidente,, veo...|
+--------------------+--------------------+
only showing top 10 rows



In [9]:
cluster_df_dict = {}
cluster_labels = [row[0] for row in result_df.select("cluster").distinct().collect()]

for label in cluster_labels:
    cluster_df = result_df.filter(result_df['cluster'] == label)
    cleaned_rows_for_fpgrowth = []
    for row in cluster_df.collect():
        filtered_tokens = row['filtered_tokens']
        cleaned_tokens = []
        for token in filtered_tokens:
            if is_valid_string(token) and token not in cleaned_tokens:
                cleaned_tokens.append(token)
        cleaned_rows_for_fpgrowth.append([cleaned_tokens])
    cluster_df_dict[label] = spark.createDataFrame(cleaned_rows_for_fpgrowth, ["items"])

                                                                                

## FP Growth

In [None]:
fp = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.1)
for label, df in cluster_df_dict.items():
    print(f"Showing results for cluster {label}, with {df.count()} interventions")
    fpm = fp.fit(df)
    fpm.freqItemsets.sort("freq", ascending=False).show()
    fpm.associationRules.sort("confidence", ascending=False).show()
    fpm.transform(cleaned_df_for_fpgrowth).show()
