# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.3 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::102165494304:role/glueinteractive
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: 1a4dcae2-e612-48c7-82e9-9395b5ca7938
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.3
--enable-glue-datacatalog true
Waiting for session 1a4dcae2-e612-4

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [2]:
df = spark.read.option("recursiveFileLookup", "true").text('s3://cdkstack-documentsbucket9ec9deb9-sbbf9n4wdhze/embeddingarchive/')




In [3]:
df.count()

91


In [4]:
df.take(1)

[Row(value='[0.09326172, 0.22167969, 0.0011672974, 0.40234375, -0.09765625, 0.17675781, 0.31640625, 0.096191406, -0.19042969, -0.022460938, 0.071777344, -0.2109375, -0.1640625, 0.05078125, -0.29296875, -0.16113281, -0.73828125, 0.047851562, 0.046142578, 0.36328125, -0.06298828, -0.24902344, 0.20214844, 0.73828125, 0.140625, -0.23242188, -0.05029297, 0.08691406, 0.033447266, 0.390625, -0.122558594, -0.40234375, 0.203125, -0.18359375, -0.14746094, 0.123535156, -0.36914062, 0.12402344, 0.06982422, -0.359375, 0.0703125, -0.25585938, -0.05493164, -0.5234375, -0.039794922, -0.18261719, 0.5234375, -0.1875, -0.22851562, -0.18164062, 0.21972656, 0.10888672, 0.061279297, 0.27929688, 0.00078582764, -0.36132812, 0.21289062, 0.09423828, 0.29296875, -0.022827148, -0.24609375, -0.24023438, 0.076660156, -0.3359375, -0.12109375, 0.66796875, 0.578125, -0.20605469, 0.28515625, 0.16601562, -0.09716797, -0.1015625, -0.15820312, -0.18066406, -0.029052734, -0.5, -0.06542969, -0.0008163452, -0.4921875, -0.155

In [4]:
df.printSchema()

root
 |-- value: string (nullable = true)


In [5]:
from pyspark.sql.functions import split, col, regexp_replace, transform
df2 = df.withColumn("value", regexp_replace("value", r'(\[)', '')).withColumn("value", regexp_replace("value", r'(])', ''))
df2.head(1)

[Row(value='0.09326172, 0.22167969, 0.0011672974, 0.40234375, -0.09765625, 0.17675781, 0.31640625, 0.096191406, -0.19042969, -0.022460938, 0.071777344, -0.2109375, -0.1640625, 0.05078125, -0.29296875, -0.16113281, -0.73828125, 0.047851562, 0.046142578, 0.36328125, -0.06298828, -0.24902344, 0.20214844, 0.73828125, 0.140625, -0.23242188, -0.05029297, 0.08691406, 0.033447266, 0.390625, -0.122558594, -0.40234375, 0.203125, -0.18359375, -0.14746094, 0.123535156, -0.36914062, 0.12402344, 0.06982422, -0.359375, 0.0703125, -0.25585938, -0.05493164, -0.5234375, -0.039794922, -0.18261719, 0.5234375, -0.1875, -0.22851562, -0.18164062, 0.21972656, 0.10888672, 0.061279297, 0.27929688, 0.00078582764, -0.36132812, 0.21289062, 0.09423828, 0.29296875, -0.022827148, -0.24609375, -0.24023438, 0.076660156, -0.3359375, -0.12109375, 0.66796875, 0.578125, -0.20605469, 0.28515625, 0.16601562, -0.09716797, -0.1015625, -0.15820312, -0.18066406, -0.029052734, -0.5, -0.06542969, -0.0008163452, -0.4921875, -0.1552

In [6]:
from pyspark.sql.functions import split, col
df3 = df2.select(split(col("value"),",").alias("EmbedArray")) \
    .drop("value")
df3.printSchema()

root
 |-- EmbedArray: array (nullable = true)
 |    |-- element: string (containsNull = true)


In [21]:
df3.take(1)

[Row(EmbedArray=['0.09326172', ' 0.22167969', ' 0.0011672974', ' 0.40234375', ' -0.09765625', ' 0.17675781', ' 0.31640625', ' 0.096191406', ' -0.19042969', ' -0.022460938', ' 0.071777344', ' -0.2109375', ' -0.1640625', ' 0.05078125', ' -0.29296875', ' -0.16113281', ' -0.73828125', ' 0.047851562', ' 0.046142578', ' 0.36328125', ' -0.06298828', ' -0.24902344', ' 0.20214844', ' 0.73828125', ' 0.140625', ' -0.23242188', ' -0.05029297', ' 0.08691406', ' 0.033447266', ' 0.390625', ' -0.122558594', ' -0.40234375', ' 0.203125', ' -0.18359375', ' -0.14746094', ' 0.123535156', ' -0.36914062', ' 0.12402344', ' 0.06982422', ' -0.359375', ' 0.0703125', ' -0.25585938', ' -0.05493164', ' -0.5234375', ' -0.039794922', ' -0.18261719', ' 0.5234375', ' -0.1875', ' -0.22851562', ' -0.18164062', ' 0.21972656', ' 0.10888672', ' 0.061279297', ' 0.27929688', ' 0.00078582764', ' -0.36132812', ' 0.21289062', ' 0.09423828', ' 0.29296875', ' -0.022827148', ' -0.24609375', ' -0.24023438', ' 0.076660156', ' -0.3359

In [7]:
df4 = df3.withColumn("EmbedArray", transform(col("EmbedArray"), lambda x: x.cast("float")))
df4 = df4.withColumn("EmbedArray", col("EmbedArray").cast("array<float>"))
df4.printSchema()

root
 |-- EmbedArray: array (nullable = true)
 |    |-- element: float (containsNull = true)


In [23]:
df4.head(1)

[Row(EmbedArray=[0.09326171875, 0.2216796875, 0.00116729736328125, 0.40234375, -0.09765625, 0.1767578125, 0.31640625, 0.09619140625, -0.1904296875, -0.0224609375, 0.07177734375, -0.2109375, -0.1640625, 0.05078125, -0.29296875, -0.1611328125, -0.73828125, 0.0478515625, 0.046142578125, 0.36328125, -0.06298828125, -0.2490234375, 0.2021484375, 0.73828125, 0.140625, -0.232421875, -0.05029296875, 0.0869140625, 0.033447265625, 0.390625, -0.12255859375, -0.40234375, 0.203125, -0.18359375, -0.1474609375, 0.12353515625, -0.369140625, 0.1240234375, 0.06982421875, -0.359375, 0.0703125, -0.255859375, -0.054931640625, -0.5234375, -0.039794921875, -0.1826171875, 0.5234375, -0.1875, -0.228515625, -0.181640625, 0.2197265625, 0.10888671875, 0.061279296875, 0.279296875, 0.00078582763671875, -0.361328125, 0.212890625, 0.09423828125, 0.29296875, -0.0228271484375, -0.24609375, -0.240234375, 0.07666015625, -0.3359375, -0.12109375, 0.66796875, 0.578125, -0.2060546875, 0.28515625, 0.166015625, -0.09716796875, 

In [8]:
from pyspark.ml.linalg import Vectors




In [9]:
from pyspark.ml.functions import array_to_vector




In [10]:
df5 = df4.select(array_to_vector('EmbedArray').alias('EmbedArray'))
df5.printSchema()

root
 |-- EmbedArray: vector (nullable = true)


In [11]:
df5.head(1)

[Row(EmbedArray=DenseVector([0.0933, 0.2217, 0.0012, 0.4023, -0.0977, 0.1768, 0.3164, 0.0962, -0.1904, -0.0225, 0.0718, -0.2109, -0.1641, 0.0508, -0.293, -0.1611, -0.7383, 0.0479, 0.0461, 0.3633, -0.063, -0.249, 0.2021, 0.7383, 0.1406, -0.2324, -0.0503, 0.0869, 0.0334, 0.3906, -0.1226, -0.4023, 0.2031, -0.1836, -0.1475, 0.1235, -0.3691, 0.124, 0.0698, -0.3594, 0.0703, -0.2559, -0.0549, -0.5234, -0.0398, -0.1826, 0.5234, -0.1875, -0.2285, -0.1816, 0.2197, 0.1089, 0.0613, 0.2793, 0.0008, -0.3613, 0.2129, 0.0942, 0.293, -0.0228, -0.2461, -0.2402, 0.0767, -0.3359, -0.1211, 0.668, 0.5781, -0.2061, 0.2852, 0.166, -0.0972, -0.1016, -0.1582, -0.1807, -0.0291, -0.5, -0.0654, -0.0008, -0.4922, -0.1553, 0.5039, -0.1699, 0.1226, -0.0806, -0.2324, 0.1582, -0.0593, 0.033, -0.0972, -0.084, 0.1113, -0.0215, -0.0894, 0.1572, 0.1123, -0.6367, -0.625, -0.3066, -0.1621, 0.1582, 0.125, 0.1846, 0.0635, 0.3613, 0.6406, 0.207, 0.1484, -0.165, 0.1289, 0.1289, -0.3887, -0.0008, -0.249, -0.4824, 0.4629, 0.3008, 

In [12]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
  inputCols=["EmbedArray"], outputCol="features"
)

dfTrain = assembler.transform(df5).drop('EmbedArray')
dfTrain.printSchema()

root
 |-- features: vector (nullable = true)


In [34]:
dfTrain.head(1)

[Row(features=DenseVector([0.0933, 0.2217, 0.0012, 0.4023, -0.0977, 0.1768, 0.3164, 0.0962, -0.1904, -0.0225, 0.0718, -0.2109, -0.1641, 0.0508, -0.293, -0.1611, -0.7383, 0.0479, 0.0461, 0.3633, -0.063, -0.249, 0.2021, 0.7383, 0.1406, -0.2324, -0.0503, 0.0869, 0.0334, 0.3906, -0.1226, -0.4023, 0.2031, -0.1836, -0.1475, 0.1235, -0.3691, 0.124, 0.0698, -0.3594, 0.0703, -0.2559, -0.0549, -0.5234, -0.0398, -0.1826, 0.5234, -0.1875, -0.2285, -0.1816, 0.2197, 0.1089, 0.0613, 0.2793, 0.0008, -0.3613, 0.2129, 0.0942, 0.293, -0.0228, -0.2461, -0.2402, 0.0767, -0.3359, -0.1211, 0.668, 0.5781, -0.2061, 0.2852, 0.166, -0.0972, -0.1016, -0.1582, -0.1807, -0.0291, -0.5, -0.0654, -0.0008, -0.4922, -0.1553, 0.5039, -0.1699, 0.1226, -0.0806, -0.2324, 0.1582, -0.0593, 0.033, -0.0972, -0.084, 0.1113, -0.0215, -0.0894, 0.1572, 0.1123, -0.6367, -0.625, -0.3066, -0.1621, 0.1582, 0.125, 0.1846, 0.0635, 0.3613, 0.6406, 0.207, 0.1484, -0.165, 0.1289, 0.1289, -0.3887, -0.0008, -0.249, -0.4824, 0.4629, 0.3008, -0

In [13]:
from pyspark.ml.feature import PCA
pca = PCA(k=100, inputCol="features")




In [14]:
pca.setOutputCol("pca_features")
pca_model = pca.fit(dfTrain)




In [15]:
pca_model.setOutputCol("output")
dfPca = pca_model.transform(dfTrain)




In [51]:
expl_var = pca_model.explainedVariance.cumsum()




In [52]:
import numpy as np




In [58]:
expl_95 = np.argwhere(expl_var > 0.95)[0][0]
expl_95

61


In [17]:
from pyspark.ml.clustering import KMeans




In [18]:
kmeans = KMeans(k=10)




In [19]:
kmeans_model = kmeans.fit(dfPca)




In [20]:
kmeans_model.setPredictionCol("newPrediction")

KMeansModel: uid=KMeans_e568fd3ff5e1, k=10, distanceMeasure=euclidean, numFeatures=4096


In [21]:
dfKmeans = kmeans_model.transform(dfPca).select("features", "newPrediction")




In [46]:
dfKmeans.printSchema()

root
 |-- features: vector (nullable = true)
 |-- newPrediction: integer (nullable = false)


In [23]:
centers = kmeans_model.clusterCenters()




In [34]:
len(centers)

10


In [31]:
kmeans_model.summary.clusterSizes

[8, 28, 2, 21, 10, 3, 1, 8, 3, 7]


In [37]:
kmeans_model.summary.cluster.head(1)

[Row(prediction=1)]


In [38]:
kmeans_model.summary.trainingCost

16570.71541485491


In [43]:
kmeans_model.summary.predictions

DataFrame[features: vector, output: vector, prediction: int]


In [45]:
kmeans_model.summary.predictions.head(1)[0]['output'].shape

(100,)


In [47]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator(predictionCol='newPrediction', featuresCol='features', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')




In [48]:
score=evaluator.evaluate(dfKmeans)
score

0.09547441911243312


### Consolidated output

In [None]:
# expl_95

In [None]:
# centers

In [5]:
# kmeans_model.summary.clusterSizes

In [6]:
# kmeans_model.summary.trainingCost

In [None]:
# score 