# Text Clustering

In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 4
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty placeholder here;
# presumably the calling pipeline overrides them at run time — TODO confirm.
batch_num = ''  # batch identifier: prefixes the output table name and is attached to log records
batch_root = ''  # location from which config.json is read
documents_contents_tbl_name = ''  # input parquet table name, appended to minted_tables_output_path
file_system = ''  # not referenced in the visible cells of this notebook
batch_description = ''  # free text copied into the Event Grid payload
text_file_count = 0  # reported as files_processed_count in the Event Grid payload
minted_tables_output_path = ''  # path prefix for both the input and the output parquet tables

In [ ]:
# Connection settings for the SQL endpoint; they are consumed later in this
# notebook when the external table over the clustering output is created.
import pyodbc
database = 'minted'   
driver= '{ODBC Driver 17 for SQL Server}'
# User name, password and endpoint are fetched as secrets from "keyvault"
# rather than being hard-coded in the notebook.
sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseServerlessSQLEndpoint")

In [ ]:
# Initiate logging
# Sets up the logger and tracer that the rest of the notebook uses, both
# exporting through the Azure (Application Insights) OpenCensus exporters.
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Enable the OpenCensus integration for the standard `logging` module.
config_integration.trace_integrations(['logging'])

# Connection string for the exporters, fetched as a secret from "keyvault".
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")

# Logger for this notebook; records are also handed to the Azure log handler.
# The level starts at INFO and may be changed once config.json has been read.
logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used for the `with tracer.span(...)` blocks below; it is configured
# with AlwaysOnSampler and the same connection string as the logger.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# The run-time parameters are passed as `extra` so they travel with the log
# record under the 'custom_dimensions' key.
run_time_parameters = {'custom_dimensions': {
    'documents_contents_tbl_name': documents_contents_tbl_name,
    'batch_root': batch_root,
    'batch_num': batch_num,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }
  
logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
import json
import ntpath
import numpy as np
from types import SimpleNamespace

from pyspark.ml.feature import HashingTF, IDF, CountVectorizer, StopWordsRemover, PCA, RegexTokenizer
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark import SparkContext, SparkConf
from sklearn.manifold import TSNE
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Initialise session and config
# Obtain the session first and derive the SparkContext from it, so that
# `spark` is never read before this cell has bound it. getOrCreate() hands
# back the already-running session when one exists.
spark = SparkSession.builder.appName(f"TextProcessing {mssparkutils.runtime.context}").getOrCreate()
sc = spark.sparkContext

# config.json is read as text lines, re-joined, and parsed into nested
# SimpleNamespace objects so settings can be accessed as attributes.
config = json.loads(''.join(sc.textFile(f'{batch_root}/config.json').collect()), object_hook=lambda dictionary: SimpleNamespace(**dictionary))
job_config = config.clustering

# Set log level
# Only "INFO" is honoured; any other configured value falls back to ERROR,
# and the config object is updated to reflect the level actually in effect.
if config.log_level == "INFO":
    logger.setLevel(logging.INFO)
else:
    logger.setLevel(logging.ERROR)
    config.log_level = "ERROR"

# Clustering settings attached to the JOB_CONFIG log record through `extra`.
job_config_parameters = {'custom_dimensions': {
    'batch_num': batch_num,
    'minTokenLength': job_config.min_token_length,
    'minDF': job_config.min_DF,
    'maxDF': job_config.max_DF,
    'numFeatures': job_config.num_features,
    'minDocFreq': job_config.min_doc_freq,
    'k': job_config.k,
    'pca1': job_config.pca1,
    'perplexity': job_config.perplexity,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: JOB_CONFIG", extra=job_config_parameters)

In [ ]:
def get_x(row):
    """Return the first entry of ``row.values`` converted to a string."""
    first_value = row.values[0]
    return str(first_value)

def get_y(row):
    """Return the second entry of ``row.values`` converted to a string."""
    second_value = row.values[1]
    return str(second_value)

def get_joined_text(row):
    """Concatenate the strings in ``row``, separated by single spaces."""
    separator = " "
    return separator.join(row)

In [ ]:
from pyspark.sql.functions import regexp_replace, length, col

# Output table name for this batch's clustering results.
clustered_text_tbl_name = f'{batch_num}_clustered_text'

# Read the documents table and keep the text column plus the three file
# descriptor columns.
with tracer.span(name='Load documents table'):
    df = (
        spark.read
        .parquet(minted_tables_output_path + documents_contents_tbl_name)
        .select('text_content_target_lang', 'file_name', 'file_type', 'file_path')
    )

def run_clustering(df):
    """Cluster documents by their text and compute a 2-D embedding for each.

    Rows are first reduced to those whose ``text_content_target_lang`` keeps
    more than 100 characters once everything except lowercase a-z is removed.
    The remaining rows are run through a tokenise -> stop-word removal ->
    count vectoriser -> hashing TF -> IDF -> PCA -> k-means pipeline, and their
    IDF-weighted feature vectors are embedded in two dimensions with t-SNE.

    Args:
        df: Spark DataFrame containing at least the ``text_content_target_lang``,
            ``file_name`` and ``file_type`` columns. For the table loaded in
            this notebook the only other column is ``file_path``.

    Returns:
        Spark DataFrame with ``processed_text`` (stop-word-free tokens joined
        by spaces), ``cluster``, ``X`` and ``Y`` alongside whatever input
        columns were not dropped (``file_path`` here). When fewer than two rows
        survive the length filter, an empty DataFrame with the columns
        ``file_path``, ``processed_text``, ``cluster``, ``X``, ``Y`` is
        returned instead.
    """
    # use a regex to clear out other characters, and if there are less than 100 chars left, filter out the row
    # The filter runs BEFORE the size guard so the guard sees exactly the rows
    # that would reach the pipeline; a dataset that shrinks to 0-1 rows here
    # must not be fitted.
    df = df.filter(length(regexp_replace(col('text_content_target_lang'), '[^a-z]', '')) > 100)

    # Counted once and reused for both the guard and the distance choice.
    doc_count = df.count()

    if doc_count < 2:
        columns = StructType([
            StructField('file_path', StringType()),
            StructField('processed_text', StringType()),
            StructField('cluster', IntegerType()),
            StructField('X', FloatType()),
            StructField('Y', FloatType())
        ])
        return spark.createDataFrame(data=spark.sparkContext.emptyRDD(), schema=columns)

    # Now we will determine the distance measure based on the dataset
    # For small functional tests we want to use the euclidean distance else cosine
    # Threshold until we find a better value
    distance_measure = "euclidean" if doc_count < 100 else "cosine"

    with tracer.span(name='Create clustering pipeline'):
        k = int(job_config.k)  # We will need to calculate k to determine the optimum cluster number for any new dataset
        # gaps=False: the pattern describes the tokens themselves, not the
        # separators. Raw string so that `\-` reaches the regex engine without
        # Python treating it as an (invalid) string escape.
        tokenizer = RegexTokenizer(
            inputCol="text_content_target_lang",
            outputCol="tokens",
            gaps=False,
            minTokenLength=int(job_config.min_token_length),
            toLowercase=True,
            pattern=r"[a-zA-Z\-][a-zA-Z\-]{2,}",
        )
        remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
        # The count vector is not consumed by any later stage; its column is
        # dropped again during clean-up below.
        vectorizer = CountVectorizer(
            inputCol="stopWordsRemovedTokens",
            outputCol="word_count_vector",
            minDF=float(job_config.min_DF),
            maxDF=float(job_config.max_DF),
        )
        hashingTF = HashingTF(
            inputCol="stopWordsRemovedTokens",
            outputCol="rawFeatures",
            numFeatures=int(job_config.num_features),
        )
        idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=int(job_config.min_doc_freq))
        pca_1 = PCA(k=int(job_config.pca1), inputCol="features")
        pca_1.setOutputCol("pca_features")
        kmeans = KMeans(k=k, seed=42, initMode="k-means||", distanceMeasure=distance_measure)
        pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, hashingTF, idf, pca_1, kmeans])

    with tracer.span(name='Fit pipeline'):
        model = pipeline.fit(df)

    with tracer.span(name='Transform pipeline'):
        results = model.transform(df)

    with tracer.span(name='Convert to pandas dataframe'):
        # Collects the full result to the driver; t-SNE below runs locally.
        pandas_df = results.toPandas()

    with tracer.span(name='T-SNE dimensionality reduction'):
        # Stack the per-row Spark vectors into one dense (rows x features) matrix.
        features = np.stack([vector.toArray() for vector in pandas_df['features']])

        tsne = TSNE(verbose=1, perplexity=int(job_config.perplexity))
        X_embedded = tsne.fit_transform(features)

    with tracer.span(name='Clean up dataframe'):
        # pandas ignores rename keys that are not present, so the 'file'
        # mapping only takes effect if the input carries such a column.
        pandas_df.rename(columns={'file': 'file_name', 'prediction': 'cluster', 'stopWordsRemovedTokens': 'processed_text'}, inplace=True)
        pandas_df['X'] = X_embedded[:, 0]
        pandas_df['Y'] = X_embedded[:, 1]
        # Turn each token list into a single space-separated string.
        pandas_df['processed_text'] = pandas_df['processed_text'].apply(get_joined_text)
        pandas_df.drop(columns=["tokens", "word_count_vector", "pca_features", "features", "rawFeatures", "file_type", "file_name", "text_content_target_lang"], inplace=True)

    with tracer.span(name='Move back to spark RDD'):
        df = spark.createDataFrame(pandas_df)

    return df

# Cluster the loaded documents; `df` is rebound to the clustering result.
with tracer.span(name='Run clustering'):
    df = run_clustering(df)

with tracer.span(name='Perist to clustered text table'):
    # DDL that registers an external table over the parquet output, created
    # only when a table of that name does not exist yet.
    df_output_sql_command = f"IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{clustered_text_tbl_name}') CREATE EXTERNAL TABLE [{clustered_text_tbl_name}] ([file_path] nvarchar(4000), [processed_text] nvarchar(4000), [cluster] bigint, [X] float, [Y] float) WITH (LOCATION = 'minted_tables/{clustered_text_tbl_name}/**', DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"

    # Replace any previous output for this batch, then echo rows and schema.
    df.write.mode("overwrite").parquet(f'{minted_tables_output_path}{clustered_text_tbl_name}')
    df.show()
    df.printSchema()

    # Run the DDL against the SQL endpoint using the credentials fetched earlier.
    with pyodbc.connect("".join((
        'DRIVER=', driver,
        ';SERVER=tcp:', serverless_sql_endpoint,
        ';PORT=1433;DATABASE=', database,
        ';UID=', sql_user_name,
        ';PWD=', sql_user_pwd,
    ))) as conn, conn.cursor() as cursor:
        cursor.execute(df_output_sql_command)

# return name of new table
output = {
    'custom_dimensions': {
        'batch_num': batch_num,
        'clustered_text_tbl_name': clustered_text_tbl_name,
        'notebook_name': mssparkutils.runtime.context['notebookname'],
    },
}

## Raise clustering complete event

In [ ]:
# Prepare the event contents
# Builds the Event Grid payload as a Python object. The payload is assembled
# as a dict rather than by interpolating values into JSON text, so quotes,
# backslashes or newlines in batch_description or the configured URIs cannot
# produce invalid JSON.
with tracer.span(name='preparing contents to send to event grid'):
    from datetime import datetime
    # NOTE: datetime.now() is naive, so %Z renders as an empty string here.
    now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
    web_app_uri = config.rule_sets.webapp_uri
    subscriber_uri = config.rule_sets.teams_webhook_uri
    alert_email = config.rule_sets.alert_email

    # One row per cluster with its document count, ordered by cluster id.
    df_cluster_count = df.groupBy("cluster").count()
    df_cluster_count = df_cluster_count.orderBy('cluster', ascending=True)
    cluster_json_list = df_cluster_count.toJSON().collect()
    # groupBy yields exactly one row per cluster, so the collected row count
    # is the number of clusters; no further Spark job is needed.
    num_of_clusters = len(cluster_json_list)
    cluster_output = ', '.join(cluster_json_list)
    cluster_output_str = cluster_output

    # generate the Event Grid json
    # Values that were quoted strings in the payload stay strings (f-string
    # conversion); num_of_clusters stays a number.
    event_data_obj = {
        "batch_id": f"{batch_num}",
        "batch_description": f"{batch_description}",
        "eventDate": f"{now}",
        "eventMetrics": {
            "event_type": "text",
            "files_processed_count": f"{text_file_count}",
            "event_detail_uri": f"https://{web_app_uri}/reports",
            "num_of_clusters": num_of_clusters,
            "clusters": [json.loads(cluster_json) for cluster_json in cluster_json_list],
        },
        "teams_webhook_endpoint": f"{subscriber_uri}",
        "alert_email": f"{alert_email}",
    }

    # Serialised form kept under its original name alongside the object.
    event_data = json.dumps(event_data_obj)

In [ ]:
# Raise the event
# Publishes the prepared payload to an Event Grid topic, then logs the output
# and exits the notebook with it.
with tracer.span(name='sending message to event grid'):    
    from azure.identity import ClientSecretCredential
    from azure.eventgrid import EventGridPublisherClient, EventGridEvent    

    # Get value from keyvault to build Event Grid Topic event
    # NOTE(review): TokenLibrary is not imported in the visible cells, while
    # earlier cells fetch secrets through mssparkutils.credentials — presumably
    # the runtime provides it; confirm.
    subscription_id = TokenLibrary.getSecretWithLS("keyvault", 'SubscriptionId')
    resource_group_name = TokenLibrary.getSecretWithLS("keyvault", 'ResourceGroupName')
    event_grid_topic_name = TokenLibrary.getSecretWithLS("keyvault", 'EventGridTopicName')
    event_grid_topic_endpoint = TokenLibrary.getSecretWithLS("keyvault", 'EventGridTopicEndpointUri')
    tenant_id = TokenLibrary.getSecretWithLS("keyvault", 'TenantID')
    client_id = TokenLibrary.getSecretWithLS("keyvault", 'ADAppRegClientId')
    client_secret = TokenLibrary.getSecretWithLS("keyvault", 'ADAppRegClientSecret')
    # Resource path of the topic, passed as the event's `topic` field.
    event_grid_topic = f'/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.EventGrid/topics/{event_grid_topic_name}'
    # The publisher client authenticates with the client-secret credential.
    credential = ClientSecretCredential(tenant_id, client_id, client_secret)
    client = EventGridPublisherClient(event_grid_topic_endpoint, credential)

    try:
        # queue event grid message
        event = EventGridEvent(data=event_data_obj, subject="MINTED/ClusterAlert", event_type="MINTED.ruleTriggered", data_version="1.0", topic=event_grid_topic)
        client.send(event)
        print("done")
    except Exception as e:
        # Log the failure with its traceback, then re-raise so the run fails.
        logger.exception(e)
        raise e


# Return the object to the pipeline
# The same dict is attached to the OUTPUT log record and passed to notebook.exit.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
mssparkutils.notebook.exit(output['custom_dimensions'])        