In [None]:
# Name of the Azure Key Vault that holds the Azure OpenAI secrets read in the next cell.
# NOTE(review): the value looks like a placeholder to be substituted before running — confirm.
key_vault_name = 'kv_to-be-replaced'

In [None]:
from trident_token_library_wrapper import PyTridentTokenLibrary as tl

def get_secrets_from_kv(kv_name, secret_name):
    """Read a single secret from an Azure Key Vault.

    Args:
        kv_name: Vault name; it is inserted into ``https://<kv_name>.vault.azure.net/``.
        secret_name: Name of the secret to fetch.

    Returns:
        The value returned by ``tl.get_secret_with_token`` for that secret.
    """
    # Token for the "keyvault" audience, obtained from the notebook runtime utilities.
    token = mssparkutils.credentials.getToken("keyvault")
    vault_url = f'https://{kv_name}.vault.azure.net/'
    secret_value = tl.get_secret_with_token(vault_url, secret_name, token)
    return secret_value

# Azure OpenAI connection settings: the API type is fixed, the rest is read
# from Key Vault (version, endpoint, key — fetched in that order).
openai_api_type = "azure"
openai_api_version, openai_api_base, openai_api_key = (
    get_secrets_from_kv(key_vault_name, secret_name)
    for secret_name in (
        "AZURE-OPENAI-VERSION",
        "AZURE-OPENAI-ENDPOINT",
        "AZURE-OPENAI-KEY",
    )
)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, LongType, TimestampType
import os

# Input folder holding the conversation JSON documents
folder_path = 'Files/data/conversation_input/'

# Schema of one entry of Conversation.Messages: nullable string fields plus a
# nullable string-to-string map of custom properties
message_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "Id",
            "ReferenceId",
            "EventType",
            "EventTime",
            "ConversationId",
            "Value",
            "UserId",
        )
    ]
    + [StructField("CustomProperties", MapType(StringType(), StringType()), True)]
)

# Schema of the nested Conversation object (all fields nullable)
conversation_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ConversationId", StringType()),
        ("Messages", ArrayType(message_schema)),
        ("StartTime", TimestampType()),
        ("EndTime", TimestampType()),
        ("Merged_content", StringType()),
        ("Merged_content_user", StringType()),
        ("Merged_content_agent", StringType()),
        ("Full_conversation", StringType()),
        ("Duration", LongType()),  # field for duration
    )
])

# Schema of the complete JSON document: agent/call attributes as nullable
# strings, followed by the nested Conversation
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "AgentName",
            "AgentId",
            "Team",
            "ResolutionStatus",
            "CallReason",
            "CallerID",
        )
    ]
    + [StructField("Conversation", conversation_schema, True)]
)

# Use the legacy time parser policy. It is set before the read is defined so
# the setting is already in place whenever Spark evaluates the (lazy) JSON scan.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Read the JSON documents in the input folder with the explicit schema.
# multiLine lets one JSON document span several lines; FAILFAST makes the read
# raise on a malformed record instead of silently producing nulls.
df = spark.read.option("multiLine", True).schema(schema).option("mode", "FAILFAST").json(folder_path)

# Replace Conversation.Duration with the time elapsed between StartTime and
# EndTime in MINUTES: unix_timestamp() yields seconds, hence the division by 60.
# The division produces a fractional (double) value.
df = df.withColumn("Conversation", df["Conversation"].withField("Duration",
                                                (F.unix_timestamp(df["Conversation"]["EndTime"], "yyyy-MM-dd'T'HH:mm:ss") -
                                                 F.unix_timestamp(df["Conversation"]["StartTime"], "yyyy-MM-dd'T'HH:mm:ss")) / 60))


# Add Conversation.ConversationDate: StartTime truncated to the beginning of its day
df = df.withColumn("Conversation", df["Conversation"].withField("ConversationDate",
                                                F.date_trunc('day', df["Conversation"]["StartTime"])))


In [None]:
# Flatten the frame: keep the top-level agent/call columns and lift the needed
# Conversation sub-fields to top-level columns
selected_df = df.select(
    # top-level columns, passed through unchanged
    *("AgentName", "AgentId", "Team", "ResolutionStatus", "CallReason", "CallerID"),
    # nested columns, addressed as "Conversation.<field>"
    *(
        f"Conversation.{field}"
        for field in (
            "ConversationId",
            "StartTime",
            "EndTime",
            "ConversationDate",
            "Merged_content",
            "Merged_content_user",
            "Merged_content_agent",
            "Full_conversation",
            "Duration",
        )
    ),
)

In [None]:
# display(selected_df)

In [None]:
import os
import openai
import json
import time
import ast
import traceback

# Function to get details from a conversation
def get_details(input_text):
    """Extract summary, sentiment and related fields from one conversation.

    Sends ``input_text`` as the user message, together with a fixed system
    prompt that asks for a JSON answer, to the chat-completion API and parses
    the reply into a dict. The call is attempted up to ``max_retries`` times;
    API and parsing errors are printed and retried, never raised.

    Args:
        input_text: Text of a single conversation.

    Returns:
        dict: The parsed reply when it is a JSON object with a non-blank string
        ``summary``. Otherwise, after the last attempt, a dict with the keys
        ``summary``, ``satisfied``, ``avgSentiment``, ``Complaint``,
        ``Compliment``, ``keyPhrases``, ``topic`` and ``lang`` all set to ``''``.
    """
    # Fixed pause before every conversation (presumably to throttle API usage).
    time.sleep(4)

    # Connection settings come from the notebook-level variables read from Key Vault.
    openai.api_type = openai_api_type
    openai.api_version = openai_api_version
    openai.api_base = openai_api_base
    openai.api_key = openai_api_key

    # Construct the prompt for the OpenAI API

    # Reference: For further details and guidance on how to effectively write metaprompt or system prompts, please refer to https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/system-message . Last Updated: 05/28/2024
    prompt = '''You are a JSON formatter for extracting information out of a single chat conversation. 
            Summarize the conversation in 20 words, key: summary .
            Is the customer satisfied with the agent interaction (Yes or No), key: satisfied . 
            Identify the sentiment of the customer as (Positive, Negative or Neutral),key : avgSentiment . 
            Normalize the conversation text by converting it to lowercase and trimming whitespace. Identify the single primary complaint of the conversation in 3 words or less. The complaint must always start with a noun and be a noun phrase. Key: Complaint.
            Identify the single primary compliment of the conversation in 6 words or less,key: Compliment . 
            Identify the top 10 key phrases as comma separated string excluding people names , key: keyPhrases .
            Identify the main topic, key: topic .
            Identify the language of the text using ISO 639 two letter language identifier, key: lang .
            Answer in JSON machine-readable format, using the keys from above. 
            Pretty print the JSON and make sure that it is properly closed at the end and do not generate any other content.'''

    # Alternative prompt (travel domain), kept for reference:
    #  prompt = '''You are a JSON formatter for extracting information out of a single chat conversation.
    #         Summarize the conversation in 20 words, key: summary .
    #         Is the customer satisfied with the agent interaction (Yes or No), key: satisfied .
    #         Identify the sentiment of the customer as (Positive, Negative or Neutral),key : avgSentiment .
    #         Identify the origin city of travel,key: OriginCity .
    #         Identify the destination city of travel,key : DestinationCity .
    #         Identify the single primary complaint of the conversation in 3 words or less,key: Complaint .
    #         Identify the single primary compliment of the conversation in 6 words or less,key: Compliment .
    #         Identify the name of hotel that was mentioned,key: Hotel .
    #         Identify the name of airline if mentioned,key: Airline .
    #         Identify the name of the agent,key: AgentName .
    #         Identify the top 10 key phrases as comma separated string excluding people names , key: keyPhrases .
    #         Identify the main topic, key: topic .
    #         Identify the language of the text using ISO 639 two letter language identifier, key: lang .
    #         Answer in JSON machine-readable format, using the keys from above.
    #         Pretty print the JSON and make sure that it is properly closed at the end and do not generate any other content.'''

    # Add to prompt if desired:
    # Identify input_text translated to english, return the same text if already in english, key: translated_text .

    # Retry policy: number of attempts and pause between two attempts
    max_retries = 4
    retry_wait_seconds = 40
    attempts = 0

    # Loop until maximum retries are reached
    while attempts < max_retries:
        # Count the attempt up front so both failure messages use the same 1-based number
        attempts += 1
        try:
            print('in get_details')
            response = openai.ChatCompletion.create(
                engine="gpt-4",
                messages=[{"role": "system", "content": prompt}, {"role": "user", "content": input_text}],
                response_format={"type": "json_object"})

            # response = openai.ChatCompletion.create(
            # engine= "gpt-35-turbo-16k",
            # messages=[{"role": "system", "content": prompt},{"role": "user", "content": input_text}])

            # Parse the response from the API. The reply is JSON, so parse it as
            # JSON first: a Python-literal parser rejects true/false/null. The
            # literal parser is kept as a fallback for replies that are not
            # strict JSON (e.g. single-quoted keys).
            content = response['choices'][0]['message']['content']
            try:
                result = json.loads(content)
            except ValueError:
                result = ast.literal_eval(content)

            # Accept the reply only if it is an object with a non-blank string summary
            summary = result.get('summary') if isinstance(result, dict) else None
            if isinstance(summary, str) and summary.strip() != '':
                return result
            print(f"Attempt {attempts} failed. 'summary' not found in result. Trying again.")
        except Exception as e:
            # Best effort by design: report the error and retry instead of aborting the run
            print(f"Attempt {attempts} failed with error: {e}. Trying again. Full exception: {traceback.format_exc()}")

        # Wait before the next attempt; there is nothing to wait for after the last one
        if attempts < max_retries:
            time.sleep(retry_wait_seconds)

    print("Maximum number of retries reached. Exiting.")
    # Empty placeholder result so the caller still gets every expected key
    return {
        'summary': '',
        'satisfied': '',
        'avgSentiment': '',
        'Complaint': '',
        'Compliment': '',
        'keyPhrases': '',
        'topic': '',
        'lang': ''
    }

    # print("Maximum number of retries reached. Exiting.")
    # return {
    #     'summary': '',
    #     'satisfied': '',
    #     'avgSentiment': '',
    #     'OriginCity': '',
    #     'DestinationCity': '',
    #     'Complaint': '',
    #     'Compliment': "",
    #     'Hotel': '',
    #     'Airline': '',
    #     'AgentName': '',
    #     'keyPhrases': '',
    #     'topic': '',
    #     'lang': ''
    # }
    #,
    #     'translated_text': ''
    # }

In [None]:
# Collect the selected Spark DataFrame into a pandas DataFrame so its rows can be
# processed one by one in Python (toPandas() brings all rows to the driver).
selected_df_pandas = selected_df.toPandas()

In [None]:
# display(selected_df_pandas)

In [None]:
from pyspark.sql.types import *

# Schema of the processed output: conversation metadata followed by the fields
# extracted by get_details(). Every column is nullable.
# Note: this rebinds `schema`, which earlier held the schema of the input JSON.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('ConversationId', StringType()),
        ('ConversationDate', TimestampType()),
        ('EndTime', TimestampType()),
        ('StartTime', TimestampType()),
        ('Duration', DoubleType()),
        ('AgentId', StringType()),
        ('AgentName', StringType()),
        ('Team', StringType()),
        ('ResolutionStatus', StringType()),
        ('CallReason', StringType()),
        ('CallerID', StringType()),
        ('Merged_content', StringType()),
        ('Merged_content_agent', StringType()),
        ('Merged_content_user', StringType()),
        ('summary', StringType()),
        ('satisfied', StringType()),
        ('avgSentiment', StringType()),
        # ('OriginCity', StringType()),
        # ('DestinationCity', StringType()),
        ('Complaint', StringType()),
        ('Compliment', StringType()),
        # ('Hotel', StringType()),
        # ('Airline', StringType()),
        ('keyPhrases', StringType()),
        ('topic', StringType()),
        ('lang', StringType()),
    ]
])

In [None]:
import pandas as pd

# Initialize an empty list to store the results
res_list = []

# Iterate over each row in the selected pandas DataFrame
for i, row in selected_df_pandas.iterrows():
    print(f"processing row {i}, ConversationID: {row.ConversationId}")
    # Merge the row's columns with the details extracted from 'Merged_content'
    # (on a key clash the value returned by get_details wins)
    result = row.to_dict() | get_details(row.Merged_content)
    # Spark's TimestampType accepts plain datetime objects (or None), not pandas
    # objects: convert Timestamps, and map NaT (a null timestamp) to None so a
    # missing StartTime/EndTime does not make createDataFrame fail.
    for key in ['ConversationDate', 'EndTime', 'StartTime']:
        if key not in result:
            continue
        value = result[key]
        if value is pd.NaT:
            result[key] = None
        elif isinstance(value, pd.Timestamp):
            result[key] = value.to_pydatetime()
    # Append the result to the list
    res_list.append(result)

# Create a Spark DataFrame from the list of results
df_processed = spark.createDataFrame(res_list, schema=schema)

# Display the processed DataFrame
# display(df_processed)

In [None]:
# Put the columns in the desired order: conversation metadata first, then the
# fields extracted from the conversation text
df_processed = df_processed.select([
    # conversation metadata
    "ConversationId", "ConversationDate", "EndTime", "StartTime", "Duration",
    "AgentId", "AgentName", "Team", "ResolutionStatus", "CallReason", "CallerID",
    "Merged_content", "Merged_content_agent", "Merged_content_user",
    # extracted fields
    "summary",
    "satisfied",
    "avgSentiment",
    # "OriginCity",
    # "DestinationCity",
    "Complaint",
    "Compliment",
    # "Hotel",
    # "Airline",
    "keyPhrases",
    "topic",
    "lang",
])

# Display the DataFrame
# display(df_processed)


In [None]:
#This code can be used for debugging

# for i, row in selected_df_pandas.iterrows():
#     print("")
#     print(f"row {i}")
#     print(f"ConversationID: {row.ConversationId}")
#     print(get_details(row.Merged_content))
#     # break

In [None]:
# Drop the table if it exists
# NOTE(review): this runs on every execution, so rows written by an earlier run are
# discarded before the next write — confirm the table is meant to hold only the latest batch.
spark.sql('drop table if exists ckm_conv_processed_raw')

In [None]:
# Save processed records to ckm_conv_processed_raw table (Delta format, append mode)
# NOTE(review): Delta Lake documents "overwriteSchema" for overwrite mode; with
# mode('append') it presumably has no effect — confirm.
df_processed.write.format('delta').mode('append').option("overwriteSchema", "true").saveAsTable('ckm_conv_processed_raw')

In [None]:
# Move input files to processed directory

import os
import shutil

# Directory paths
input_dir = '/lakehouse/default/Files/data/conversation_input/'
processed_dir = '/lakehouse/default/Files/data/conversation_processed/'

# Make sure the destination directory exists; without it the first move would fail
os.makedirs(processed_dir, exist_ok=True)

# Get a list of all .json files in the input directory
json_files = [f for f in os.listdir(input_dir) if f.endswith('.json')]

# Move each .json file to the processed directory
for file_name in json_files:
    shutil.move(os.path.join(input_dir, file_name), os.path.join(processed_dir, file_name))

In [None]:
# display(df_processed)

In [None]:
# Query a single row from the saved table (presumably a quick check that the write worked).
# Note: this rebinds `df`, which earlier referred to the parsed input DataFrame.
df = spark.sql("SELECT * FROM ckm_conv_processed_raw LIMIT 1")
# display(df)

In [None]:
# df = spark.sql("SELECT ConversationId,AgentId,CallerID,avgSentiment,lang,summary  FROM ckm_conv_processed_raw LIMIT 1000")
# display(df)