## Libraries and UDFs

In [None]:
# General spark
# NOTE(review): SparkContext is listed twice below; SparkConf may have been intended - confirm.
from pyspark import SparkContext, SparkContext

#Working with dataframes
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.window import Window

#Dimension reduction and clustering
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.mllib.feature import StandardScaler as StandardScalerRDD
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.linalg.distributed import RowMatrix

#Regular python libraries
import ast
import pickle
import warnings

import numpy as np
import pandas as pd

# Get the active Spark session, or create one with this application name.
spark = (
    SparkSession.builder
    .appName("dimension reduction and clustering")
    .getOrCreate()
)

In [None]:
def string_to_vector(s):
    """Parse a stringified sequence of numbers into a dense ML vector.

    Args:
        s: Text holding a Python literal, e.g. ``"[0.1, 0.2]"``.

    Returns:
        A dense vector built with ``Vectors.dense``, or ``None`` when ``s``
        is not a valid literal, is not iterable, or holds a non-numeric item.
    """
    try:
        embedding = [float(x) for x in ast.literal_eval(s)]
    except (ValueError, SyntaxError, TypeError):
        # Only swallow parse/conversion failures; anything else should surface.
        return None
    return Vectors.dense(embedding)


# The function returns a DenseVector, so the UDF must be declared as VectorUDT;
# an ArrayType(FloatType()) declaration does not match the returned object and
# the ML estimators used later require a vector column.
string_to_vector_udf = F.udf(string_to_vector, VectorUDT())

def cluster_and_evaluate_features(data, clust_model, evaluator):
    """Fit a clustering estimator on ``data``, label it, and print its score.

    Args:
        data: Dataset handed to ``clust_model.fit`` and then to the fitted
            model's ``transform``.
        clust_model: Unfitted estimator exposing ``fit``. If it has ``getK``
            or ``getNumClusters``, that value is used in the printed message.
        evaluator: Object whose ``evaluate`` method scores the labelled data.

    Returns:
        tuple: ``(df_with_clusters, model)`` - the labelled data and the
        fitted model.
    """
    model = clust_model.fit(data)
    # Label the dataset that was passed in, not a notebook-level global.
    df_with_clusters = model.transform(data)
    score = evaluator.evaluate(df_with_clusters)

    if hasattr(clust_model, 'getK'):
        num_clusters = clust_model.getK()
    elif hasattr(clust_model, 'getNumClusters'):
        num_clusters = clust_model.getNumClusters()
    else:
        num_clusters = 'Unknown'

    algorithm_name = type(clust_model).__name__

    print(f"Score for {algorithm_name} with {num_clusters} clusters: {score}")
    return df_with_clusters, model

    


## Preprocessing

In [None]:
# Location of the pickled pandas DataFrame, relative to the working directory.
path = r"../data/interventions_sample.pkl"

# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
interventions_pd_df = pd.read_pickle(path)
# Convert the pandas frame to a Spark DataFrame; later cells read the
# session_id, intervention_id, embeddings_str and intervention_text columns from it.
interventions_df = spark.createDataFrame(interventions_pd_df)
interventions_df.show()

In [None]:
# Keep the identifiers and the raw embedding string, then parse the string
# into a `features` column with the UDF defined above.
df = (
    interventions_df
    .select('session_id', 'intervention_id', 'embeddings_str')
    .withColumn('features', string_to_vector_udf('embeddings_str'))
)
# Mark the parsed frame for caching; it is reused by every fit below.
df.persist()

## Round 1

### Benchmarks

In [None]:
# Baseline: cluster the raw embeddings with both algorithms at a fixed k.
n_clusters = 5

gmm = GaussianMixture(
    k=n_clusters,
    featuresCol='features',
    predictionCol='cluster',
    seed=0,
)
kmeans = KMeans(
    k=n_clusters,
    featuresCol='features',
    predictionCol='cluster',
    seed=0,
)
# Silhouette score over the `features` column, using squared Euclidean distance.
evaluator = ClusteringEvaluator(
    featuresCol='features',
    predictionCol='cluster',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

for alg in (gmm, kmeans):
    cluster_and_evaluate_features(df, alg, evaluator)


### PCA

In [None]:
# First pass: a PCA with as many components as the embedding has dimensions,
# fitted only to read the explained-variance spectrum.
embedding_dim = df.select('features').first()['features'].size
pca = PCA(k=embedding_dim, inputCol='features', outputCol='pca_features')
pca_model = pca.fit(df)

explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)
# Smallest number of leading components whose cumulative variance reaches 0.9
# (index of the first qualifying entry, converted to a 1-based count).
num_components_90 = np.flatnonzero(cumulative_variance >= 0.9)[0] + 1

# Second pass: refit with only that many components and project the data.
pca = PCA(k=num_components_90, inputCol='features', outputCol='pca_features')
pca_model = pca.fit(df)
df_pca = pca_model.transform(df)

In [None]:
# Candidate numbers of clusters to sweep over (2..50 inclusive).
clusters = list(range(2, 51))

# The PCA projection is stored in `pca_features`; clustering and scoring must
# read that column, otherwise the sweep just repeats the raw-embedding run.
pca_evaluator = ClusteringEvaluator(
    featuresCol='pca_features',
    predictionCol='cluster',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

for n in clusters:
    gmm = GaussianMixture(k=n, featuresCol='pca_features', predictionCol='cluster', seed=0)
    kmeans = KMeans(k=n, featuresCol='pca_features', predictionCol='cluster', seed=0)
    for alg in [gmm, kmeans]:
        cluster_and_evaluate_features(df_pca, alg, pca_evaluator)


### SVD

In [None]:
# RowMatrix belongs to the RDD-based pyspark.mllib API and does not accept
# pyspark.ml vectors, so each row is converted with Vectors.fromML first.
rdd_vectors = df.select('features').rdd.map(lambda row: MLlibVectors.fromML(row['features']))
mat = RowMatrix(rdd_vectors)
# num_components_90 is a NumPy integer; cast it before it crosses into the JVM.
# U is rebuilt below from V and s, so the distributed U is not requested here.
svd = mat.computeSVD(k=int(num_components_90), computeU=False)

# U = A . V . diag(1/s). Applying that projection row by row on `df` keeps
# session_id / intervention_id attached to every reduced vector, which the
# cluster-inspection cells need; an RDD of bare U rows would lose them.
projection = svd.V.toArray() / svd.s.toArray()
project_onto_svd_udf = F.udf(
    # `_m` binds the matrix at definition time so later rebinding of
    # `projection` cannot change what this UDF computes.
    lambda v, _m=projection: None if v is None else Vectors.dense(v.toArray().dot(_m)),
    VectorUDT(),
)

U_df = df.select(
    'session_id',
    'intervention_id',
    project_onto_svd_udf('features').alias('features'),
)

In [None]:
# Sweep every candidate k on the SVD features, scoring both algorithms.
for n in clusters:
    gmm = GaussianMixture(
        k=n,
        featuresCol='features',
        predictionCol='cluster',
        seed=0,
    )
    kmeans = KMeans(
        k=n,
        featuresCol='features',
        predictionCol='cluster',
        seed=0,
    )
    for alg in (gmm, kmeans):
        cluster_and_evaluate_features(U_df, alg, evaluator)

### Best model
SVD features, kmeans clustering, 4 clusters

In [None]:
# Refit the configuration chosen above (k-means, k=4) on the SVD features
# and keep the labelled frame; the fitted model itself is discarded.
kmeans = KMeans(
    k=4,
    featuresCol='features',
    predictionCol='cluster',
    seed=0,
)

df_labels, _ = cluster_and_evaluate_features(U_df, kmeans, evaluator)

### Evaluate clusters

In [None]:
# Draw up to 5 random rows per cluster: attach a random key, rank rows by it
# inside each cluster, and keep the first five ranks. The seed keeps the random
# column itself repeatable across re-runs.
df_with_random = df_labels.withColumn('random', F.rand(seed=0))
window_spec = Window.partitionBy('cluster').orderBy('random')
df_sampled = (
    df_with_random
    .withColumn('rank', F.row_number().over(window_spec))
    .filter(F.col('rank') <= 5)
)
sampled_data = df_sampled.select('cluster', 'intervention_id').collect()

# cluster label -> list of sampled intervention ids
cluster_dict = {}
for row in sampled_data:
    cluster_dict.setdefault(row['cluster'], []).append(row['intervention_id'])

# Fetch all sampled texts in a single Spark job instead of running one
# filter + collect per intervention id.
sampled_ids = [row['intervention_id'] for row in sampled_data]
text_rows = (
    interventions_df
    .filter(F.col('intervention_id').isin(sampled_ids))
    .select('intervention_id', 'intervention_text')
    .collect()
)
text_by_id = {}
for row in text_rows:
    # Keep only the first text seen per id, as the per-id lookup did.
    text_by_id.setdefault(row['intervention_id'], row['intervention_text'])

for cluster, intervention_ids in sorted(cluster_dict.items()):
    print(f"Cluster {cluster}")
    for intervention_id in intervention_ids:
        intervention_text = text_by_id.get(intervention_id)
        # Skip ids with no match or a null text rather than failing on the slice.
        if intervention_text is not None:
            print(intervention_text[:100])
    print("\n")

Cluster 2 contains interventions that were not properly filtered. Going to filter them out and re-cluster.

## Round 2

### Benchmarks

In [None]:
# `df` has no `cluster` column - the labels live on `df_labels` - so the rows
# assigned to cluster 2 are removed from `df` with an anti-join on the ids.
# Assumes df_labels carries session_id and intervention_id, and that the pair
# identifies a row - TODO confirm.
# NOTE(review): the label 2 comes from one specific k-means fit; re-check it
# whenever the Round 1 model or its input changes.
flagged_ids = df_labels.filter(F.col('cluster') == 2).select('session_id', 'intervention_id')
df_filtered = df.join(flagged_ids, on=['session_id', 'intervention_id'], how='left_anti')

In [None]:
# Round 2 baseline: same fixed-k comparison, now on the filtered data.
n_clusters = 5
gmm = GaussianMixture(k=n_clusters, featuresCol='features', predictionCol='cluster', seed=0)
kmeans = KMeans(k=n_clusters, featuresCol='features', predictionCol='cluster', seed=0)
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='cluster', metricName='silhouette', distanceMeasure='squaredEuclidean')

for alg in [gmm, kmeans]:
    # Benchmark on df_filtered; running on `df` would just repeat Round 1.
    cluster_and_evaluate_features(df_filtered, alg, evaluator)

### PCA

In [None]:
# First pass on the filtered data: a PCA with as many components as the
# embedding has dimensions, fitted only to read the explained-variance spectrum.
embedding_dim = df_filtered.select('features').first()['features'].size
pca = PCA(k=embedding_dim, inputCol='features', outputCol='pca_features')
pca_model = pca.fit(df_filtered)

explained_variance = pca_model.explainedVariance.toArray()
cumulative_variance = np.cumsum(explained_variance)
# Smallest number of leading components whose cumulative variance reaches 0.9
# (index of the first qualifying entry, converted to a 1-based count).
num_components_90 = np.flatnonzero(cumulative_variance >= 0.9)[0] + 1

# Second pass: refit with only that many components and project the data.
pca = PCA(k=num_components_90, inputCol='features', outputCol='pca_features')
pca_model = pca.fit(df_filtered)
df_pca = pca_model.transform(df_filtered)

In [None]:
# The PCA projection is stored in `pca_features`; clustering and scoring must
# read that column, otherwise the sweep ignores the dimensionality reduction.
pca_evaluator = ClusteringEvaluator(
    featuresCol='pca_features',
    predictionCol='cluster',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

for n in clusters:
    gmm = GaussianMixture(k=n, featuresCol='pca_features', predictionCol='cluster', seed=0)
    kmeans = KMeans(k=n, featuresCol='pca_features', predictionCol='cluster', seed=0)
    for alg in [gmm, kmeans]:
        cluster_and_evaluate_features(df_pca, alg, pca_evaluator)

### SVD

In [None]:
# RowMatrix belongs to the RDD-based pyspark.mllib API and does not accept
# pyspark.ml vectors, so each row is converted with Vectors.fromML first.
rdd_vectors = df_filtered.select('features').rdd.map(lambda row: MLlibVectors.fromML(row['features']))
mat = RowMatrix(rdd_vectors)
# num_components_90 is a NumPy integer; cast it before it crosses into the JVM.
# U is rebuilt below from V and s, so the distributed U is not requested here.
svd = mat.computeSVD(k=int(num_components_90), computeU=False)

# U = A . V . diag(1/s). Applying that projection row by row on `df_filtered`
# keeps session_id / intervention_id attached to every reduced vector, which
# the cluster-inspection cell and the final export need.
projection = svd.V.toArray() / svd.s.toArray()
project_onto_svd_udf = F.udf(
    # `_m` binds the matrix at definition time so later rebinding of
    # `projection` cannot change what this UDF computes.
    lambda v, _m=projection: None if v is None else Vectors.dense(v.toArray().dot(_m)),
    VectorUDT(),
)

U_df = df_filtered.select(
    'session_id',
    'intervention_id',
    project_onto_svd_udf('features').alias('features'),
)

In [None]:
# Sweep every candidate k on the Round 2 SVD features, scoring both algorithms.
for n in clusters:
    gmm = GaussianMixture(
        k=n,
        featuresCol='features',
        predictionCol='cluster',
        seed=0,
    )
    kmeans = KMeans(
        k=n,
        featuresCol='features',
        predictionCol='cluster',
        seed=0,
    )
    for alg in (gmm, kmeans):
        cluster_and_evaluate_features(U_df, alg, evaluator)

### Best model

SVD features, kmeans clustering, 5 clusters

In [None]:
# Refit the configuration chosen above (k-means, k=5) on the SVD features
# and keep the labelled frame; the fitted model itself is discarded.
kmeans = KMeans(
    k=5,
    featuresCol='features',
    predictionCol='cluster',
    seed=0,
)

df_labels, _ = cluster_and_evaluate_features(U_df, kmeans, evaluator)

### Evaluate clusters

In [None]:
# Draw up to 5 random rows per cluster: attach a random key, rank rows by it
# inside each cluster, and keep the first five ranks. The seed keeps the random
# column itself repeatable across re-runs.
df_with_random = df_labels.withColumn('random', F.rand(seed=0))
window_spec = Window.partitionBy('cluster').orderBy('random')
df_sampled = (
    df_with_random
    .withColumn('rank', F.row_number().over(window_spec))
    .filter(F.col('rank') <= 5)
)
sampled_data = df_sampled.select('cluster', 'intervention_id').collect()

# cluster label -> list of sampled intervention ids
cluster_dict = {}
for row in sampled_data:
    cluster_dict.setdefault(row['cluster'], []).append(row['intervention_id'])

# Fetch all sampled texts in a single Spark job instead of running one
# filter + collect per intervention id.
sampled_ids = [row['intervention_id'] for row in sampled_data]
text_rows = (
    interventions_df
    .filter(F.col('intervention_id').isin(sampled_ids))
    .select('intervention_id', 'intervention_text')
    .collect()
)
text_by_id = {}
for row in text_rows:
    # Keep only the first text seen per id, as the per-id lookup did.
    text_by_id.setdefault(row['intervention_id'], row['intervention_text'])

for cluster, intervention_ids in sorted(cluster_dict.items()):
    print(f"Cluster {cluster}")
    for intervention_id in intervention_ids:
        intervention_text = text_by_id.get(intervention_id)
        # Skip ids with no match or a null text rather than failing on the slice.
        if intervention_text is not None:
            print(intervention_text[:100])
    print("\n")

These seem more interpretable

In [None]:
# Collect the labelled Spark DataFrame to the driver as a pandas DataFrame.
df_labels_pandas = df_labels.toPandas()

# Serialise the pandas frame next to the input data.
output_path = "../data/dataset_with_clusters.pkl"
with open(output_path, "wb") as pickle_file:
    pickle.dump(df_labels_pandas, pickle_file)