# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.2X
%number_of_workers 10

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.38.1 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.2X
Previous number of workers: 5
Setting new number of workers to: 10
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::102165494304:role/glueinteractive
Trying to create a Glue session for the kernel.
Worker Type: G.2X
Number of Workers: 10
Session ID: 2207338f-4b27-4bcd-b999-e8be81530897
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.38.1
--enable-glue-datacatalog true
Waiting for session 2207338f-4b27

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [2]:
df = spark.read.option("recursiveFileLookup", "true").text('s3://cdkstack-documentsbucket9ec9deb9-sbbf9n4wdhze/embeddingarchive/')




In [3]:
dfP = spark.read.option("recursiveFileLookup", "true").text('s3://cdkstack-documentsbucket9ec9deb9-sbbf9n4wdhze/promptarchive/')




In [4]:
df.count()

835


In [5]:
dfP.count()

4


In [6]:
from pyspark.sql.functions import split, col, regexp_replace, transform
df = df.withColumn("value", regexp_replace("value", r'(\[)', '')).withColumn("value", regexp_replace("value", r'(])', ''))
dfP = dfP.withColumn("value", regexp_replace("value", r'(\[)', '')).withColumn("value", regexp_replace("value", r'(])', ''))




In [7]:
from pyspark.sql.functions import split, col
df = df.select(split(col("value"),",").alias("EmbedArray")).drop("value")
dfP = dfP.select(split(col("value"),",").alias("EmbedArray")).drop("value")




In [8]:
df = df.withColumn("EmbedArray", transform(col("EmbedArray"), lambda x: x.cast("float")))
df = df.withColumn("EmbedArray", col("EmbedArray").cast("array<float>"))
dfP = dfP.withColumn("EmbedArray", transform(col("EmbedArray"), lambda x: x.cast("float")))
dfP = dfP.withColumn("EmbedArray", col("EmbedArray").cast("array<float>"))




In [9]:
from pyspark.ml.linalg import Vectors




In [10]:
from pyspark.ml.functions import array_to_vector




In [11]:
df = df.select(array_to_vector('EmbedArray').alias('EmbedArray'))
dfP = dfP.select(array_to_vector('EmbedArray').alias('EmbedArray'))




In [12]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
  inputCols=["EmbedArray"], outputCol="features"
)

dfTrain = assembler.transform(df).drop('EmbedArray')
dfTrainP = assembler.transform(dfP).drop('EmbedArray')




In [13]:
from pyspark.ml.feature import PCA
pca = PCA(k=100, inputCol="features")




In [14]:
pca.setOutputCol("pca_features")
pca_model = pca.fit(dfTrain)




In [15]:
pca_model.setOutputCol("output")
dfPca = pca_model.transform(dfTrain)
dfPcaP = pca_model.transform(dfTrainP)




In [16]:
import numpy as np




In [17]:
from pyspark.ml.clustering import KMeans




In [18]:
kmeans = KMeans(k=10)




In [19]:
kmeans_model = kmeans.fit(dfPca)




In [20]:
kmeans_model.setPredictionCol("newPrediction")

KMeansModel: uid=KMeans_5ed8cbc668c1, k=10, distanceMeasure=euclidean, numFeatures=4096


In [22]:
dfKmeans = kmeans_model.transform(dfPca).select("features", "newPrediction")




In [21]:
dfKmeansP = kmeans_model.transform(dfPcaP).select("features", "newPrediction")




In [23]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator(predictionCol='newPrediction', featuresCol='features', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')




In [24]:
score=evaluator.evaluate(dfKmeans)
score

0.1058243792342071


In [25]:
scoreP=evaluator.evaluate(dfKmeansP)
scoreP

-0.13371579342985535


In [26]:
dfKmeansP.printSchema()

root
 |-- features: vector (nullable = true)
 |-- newPrediction: integer (nullable = false)


In [28]:
dfKmeansP.head(1)[0]['newPrediction']

5


In [33]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
l_clusters = kmeans_model.clusterCenters()
# Let's convert the list of centers to a dict, each center is a list of float
d_clusters = {int(i):[float(l_clusters[i][j]) for j in range(len(l_clusters[i]))] for i in range(len(l_clusters))}

# Let's create a dataframe containing the centers and their coordinates
df_centers = spark.sparkContext.parallelize([(k,)+(v,) for k,v in d_clusters.items()]).toDF(['prediction','center'])

dfKmeansP = dfKmeansP.withColumn('prediction',F.col('newPrediction').cast(IntegerType()))
dfKmeansP = dfKmeansP.join(df_centers,on='prediction',how='left')




In [34]:
from pyspark.sql.types import FloatType
get_dist = F.udf(lambda features, center : float(features.squared_distance(center)),FloatType())
dfKmeansP = dfKmeansP.withColumn('dist',get_dist(F.col('features'),F.col('center')))




In [36]:
dfKmeansP.printSchema()

root
 |-- prediction: integer (nullable = false)
 |-- features: vector (nullable = true)
 |-- newPrediction: integer (nullable = false)
 |-- center: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dist: float (nullable = true)


In [37]:
dfKmeansP.head(1)[0]['dist']

551.3375854492188


In [38]:
from pyspark.sql.functions import mean as _mean, stddev as _stddev

df_stats = dfKmeansP.select(
    _mean(col('dist')).alias('mean'),
    _stddev(col('dist')).alias('std')
).collect()

mean = df_stats[0]['mean']
std = df_stats[0]['std']




In [39]:
mean

669.6846122741699


In [40]:
std

389.3161660099286
