In [23]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.mllib.feature import StandardScaler as StandardScalerRDD
from pyspark.mllib.linalg.distributed import RowMatrix
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BertEmbeddings
import tensorflow


# Create (or reuse) the Spark session with the Spark NLP jar on the classpath.
# NOTE(review): getOrCreate() returns an already-running session if one exists,
# in which case the "spark.jars" setting may not be applied -- presumably the
# cause of the "'JavaPackage' object is not callable" error recorded further
# down; restart the kernel and confirm the jar path/version.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .config("spark.jars", "/project/macs40123/spark-jars/spark-nlp_2.12-3.3.2.jar")
    .getOrCreate()
)

In [21]:
# Load the interventions CSV into a Spark DataFrame.
path = r"../data/interventions_sample.csv"

interventions = spark.read.csv(
    path,
    header=True,       # first row supplies the column names
    inferSchema=True,  # let Spark infer each column's type from the data
)

In [None]:
# This cell only applies when working with interventions_sample, not the full dataset.
#
# Turn the string-typed "embeddings" column into array<float>: split the string
# on commas (plus any trailing whitespace) and cast the pieces to floats.
# The functions are referenced through the `F` alias because the notebook only
# imports `pyspark.sql.functions as F`; bare `split`/`col` are undefined names.
# NOTE(review): pieces that are not plain numbers (e.g. a leading "tensor([["
# as seen in the sample output of this notebook) will presumably cast to null --
# confirm the stored string format before relying on the result.
interventions = interventions.withColumn(
    "embeddings",
    F.split(F.col("embeddings"), r",\s*").cast(ArrayType(FloatType())),
)

In [11]:
# Parse the stringified word list in "intervention_words" into an array column:
# first drop square brackets and single quotes, then split on commas.
# Whitespace around the individual items is left untouched.
interventions = interventions.withColumn(
    "intervention_words",
    F.split(
        F.regexp_replace(F.col("intervention_words"), r"[\[\]']", ""),
        ",",
    ),
)

# Show the resulting schema to confirm the new column type.
interventions.printSchema()

root
 |-- _c0: double (nullable = true)
 |-- session_id: string (nullable = true)
 |-- intervention_id: string (nullable = true)
 |-- speaker_text: string (nullable = true)
 |-- intervention_text: string (nullable = true)
 |-- intervention_words: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- embeddings: string (nullable = true)



In [42]:
# Keep only the identifier, text and embedding columns for feature work.
int_features = interventions.select(
    ['session_id', 'intervention_id', 'intervention_text', 'embeddings']
)

# Peek at the embeddings value of the first row.
int_features.select('embeddings').first()['embeddings']

'tensor([[ 2.7571e-01, -1.5672e-01,  5.2478e-02, -1.6506e-01, -2.2252e-01,'

In [43]:
# Embed each intervention's text with a locally stored BERT model
# (per the original note: base BERT in Spanish, BETO).

# Stage 1: wrap the raw "intervention_text" column as a Spark NLP document.
document_assembler = (
    DocumentAssembler()
    .setInputCol("intervention_text")
    .setOutputCol("document")
)

# Stage 2: tokenize the document. The output column is named "token" so that
# it matches the input columns declared by the embeddings stage below
# (it was previously "tokens", which no later stage consumed).
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: compute BERT embeddings from the document and its tokens,
# loading the model from the local "../data/bert_base/" directory.
# NOTE(review): the output column "embeddings" has the same name as a column
# already selected into int_features -- confirm whether this overwrite/clash
# is intended or whether the output should get a distinct name.
embeddings = (
    BertEmbeddings.load(r"../data/bert_base/")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Define the pipeline: stages run in the order listed.
pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    embeddings
])

# Fit the pipeline on the selected features and append the annotation columns.
model = pipeline.fit(int_features)
int_features = model.transform(int_features)

TypeError: 'JavaPackage' object is not callable

In [None]:
#pca

In [None]:
#svd