In [1]:
# !pip install scikit-learn
# !pip install pandas
# !pip install matplotlib
# !pip install langchain
# !pip install gpt4all

In [1]:
import pandas as pd
import numpy as np
import os
import re

from gpt4all import GPT4All
# File name of the quantised model; GPT4All downloads / loads this 2GB LLM.
MODEL_FILE = "orca-mini-3b-gguf2-q4_0.gguf"
model = GPT4All(MODEL_FILE)

In [3]:
# Smoke test: ask the freshly loaded model one question and show its reply.
with model.chat_session():
    smoke_test_reply = model.generate(
        "How can I run LLMs efficiently on my laptop?",
        max_tokens=256,
    )
    print(smoke_test_reply)

 There are a few things you can do to optimize the performance of your laptop for running LLMs:

1. Upgrade your hardware: Consider upgrading your processor, RAM, and hard drive space. These components play a significant role in how well your laptop can handle demanding applications like LLMs.

2. Reduce screen resolution: Lowering the screen resolution can help reduce the amount of resources your laptop uses, especially if you're using an integrated graphics card.

3. Install an SSD: An SSD can improve the speed at which your operating system and applications load, as compared to a traditional hard drive. This can significantly improve the overall performance of your laptop for running LLMs.

4. Use battery power: If you're using your laptop on battery power, consider reducing the brightness of your screen and turning off any non-essential apps or services that are consuming battery life.

5. Optimize startup processes: Some applications may take a long time to start up, especially if

### 1. Setting up the parameters for running the file 

In [2]:

# Path to directory containing transcript text files.
# Kept relative to the notebook (same layout the parser test cell and the
# ../test outputs already rely on) instead of a machine-specific absolute path.
transcripts_dir = '../data/transcripts_v3'

### 2.  Utility Functions

In [3]:
def parse_transcript(file_path):
    """Read a transcript text file and split it into member and agent utterances.

    Each non-blank line is expected to look like ``<speaker>: <text>``.
    Lines starting with "Member" contribute their text to the member list;
    every other speaker contributes to the agent list, except the
    "Call duration" line, which is dropped. Lines without a colon
    (for example "(pause)") are reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded transcript file.

    Returns:
        dict with three keys:
            "Original Convesation": every non-blank line, right-stripped, in
                file order (the key spelling is kept as-is because callers
                look it up verbatim).
            "Agent": list of agent utterances, without the speaker label.
            "Member": list of member utterances, without the speaker label.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.rstrip() for line in file]
    # Blank (or whitespace-only) lines are dropped once, up front.
    lines = [line for line in lines if line]

    member_conversation_data = []
    agent_conversation_data = []

    for raw_line in lines:
        line = raw_line.strip()
        # Split on the first colon only, so colons inside the utterance survive.
        speaker, separator, text = line.partition(":")

        if not separator:
            # No "speaker:" prefix at all (stage directions such as "(pause)").
            print("Skipping Lines - ", line)
            continue

        text = text.strip()
        if line.startswith("Member"):
            member_conversation_data.append(text)
        elif speaker.strip() != "Call duration":
            # Everything that is not the member or the call-duration footer
            # is treated as the agent side of the conversation.
            agent_conversation_data.append(text)

    return {"Original Convesation" : lines,"Agent": agent_conversation_data,"Member":member_conversation_data}


def create_dataframe_from_transcripts(directory):
    """Build one dataframe row per ``.txt`` transcript found in ``directory``.

    The numeric File_ID is the first run of digits in the file name. Files
    whose name contains no digits, or that cannot be read/parsed, are
    reported on stdout and skipped instead of aborting the whole run.

    Args:
        directory: Folder that holds the transcript text files.

    Returns:
        pandas.DataFrame with the columns 'File_ID' (digits as a string),
        'File', 'Original Conversation', 'Agent' and 'Member'. The columns are
        present even when no transcript was loaded.
    """
    columns = ['File_ID', 'File', 'Original Conversation', 'Agent', 'Member']
    all_conversations = []

    # os.listdir gives no ordering guarantee; sort for reproducible row order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory, file_name)

        # re.search returns None when the name has no digits.
        id_match = re.search('[0-9]+', file_name)
        if id_match is None:
            print("Skipping File : ", file_path)
            continue

        try:
            conversation_data = parse_transcript(file_path)
            all_conversations.append({
                'File_ID': id_match.group(),
                'File': file_name,
                # The parser spells this key "Convesation"; read it verbatim.
                'Original Conversation': conversation_data['Original Convesation'],
                'Agent': conversation_data['Agent'],
                'Member': conversation_data['Member'],
            })
        except (OSError, UnicodeError, KeyError) as exc:
            print("Skipping File : ", file_path, "-", exc)

    # Create a dataframe (explicit columns keep the schema stable when empty)
    df = pd.DataFrame(all_conversations, columns=columns)
    return df


In [4]:
## Testing the Parser on a single sample transcript
sample_transcript_path = "../data/transcripts_v3/transcript_1.txt"
parse_transcript(file_path=sample_transcript_path)

Skipping Lines -  (pause)


{'Original Convesation': ["Member: Hi, I'm calling about a denied claim I received for my recent medical service. I was told that my policy doesn't cover it, but I'm certain it should be covered under my new policy. My member ID is MEM123456.",
  'Customer Support: I apologize for the inconvenience, MEM123456. Can you please provide me with more information about the denied claim, such as the claim number and the date of service?',
  'Member: The claim number is CLM789012, and the date of service was February 10th.',
  "Customer Support: Thank you for providing that information. I've located your claim in our system. Can you tell me more about the new policy you're referring to? When did you switch policies?",
  "Member: I switched policies on January 1st. I was told that the new policy would cover the service I received, but the denial letter says it's not covered.",
  'Customer Support: I understand your concern. Let me check on the status of your policy update in our system. (pause)

### 3. Parsing the Text Files and Splitting the Conversation into Member and Agent

In [10]:
# Create dataframe from transcripts: one row per file, ordered and indexed by
# the numeric File_ID. Built as a single chain so re-running the cell always
# starts from the freshly parsed data instead of mutating a frame in place.
df_transcripts = (
    create_dataframe_from_transcripts(transcripts_dir)
    .astype({"File_ID": "int"})
    .sort_values("File_ID")
    .set_index("File_ID", drop=True)
)
# Save to CSV (create the output folder first so the write cannot fail on a
# fresh checkout).
# NOTE(review): index=False means the File_ID index is not written to the CSV;
# the "File" column still identifies each transcript - confirm this is intended.
os.makedirs('../test', exist_ok=True)
df_transcripts.to_csv('../test/parsed_transcripts.csv', index=False)

Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (Call ended at 9 minutes)
Skipping Lines -  (pause)
Skipping Lines -  (re-connected)
Skipping Lines -  (pause)
Skipping Lines -  (Minute 9)
Skipping Lines -  (Minute 6)
Skipping Lines -  (Minute 8)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (at 2 minutes)
Skipping Lines -  (at 4 minutes)
Skipping Lines -  (at 6 minutes)
Skipping Lines -  (at 7 minutes)
Skipping Lines -  (at 8 minutes)
Skipping Lines -  (pause)
Skipping Lines -  (12 minutes)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (Pause)
Skipping Lines -  (Call ended)
Skipping Lines -  (pause)
Skipping Lines -  (pause for 2 minutes)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  (pause)
Skipping Lines -  pause)
Skipping Lines -  (pause)
Skipping Lines -  (Member holds for 2 minutes)
Skipping Lines -  (The conversation ends after 8 minute

In [11]:
# Preview the first rows of the parsed transcripts (indexed by File_ID).
df_transcripts.head()

Unnamed: 0_level_0,File,Original Conversation,Agent,Member
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,transcript_0.txt,"[Member: Hi, I'm calling to get a case pre-aut...","[Hi Emily, thank you for calling PA customer c...","[Hi, I'm calling to get a case pre-authorized...."
1,transcript_1.txt,"[Member: Hi, I'm calling about a denied claim ...","[I apologize for the inconvenience, MEM123456....","[Hi, I'm calling about a denied claim I receiv..."
2,transcript_2.txt,"[Member: Hi, I'm calling about my recent docto...","[I apologize for the inconvenience, Mr. Johnso...","[Hi, I'm calling about my recent doctor's visi..."
3,transcript_3.txt,"[Member: Hi, I'm calling about my recent visit...","[I apologize for the inconvenience, can you pl...","[Hi, I'm calling about my recent visit to the ..."
4,transcript_4.txt,"[Member: Hi, I'd like to schedule an appointme...","[Thank you for calling us, MEM123456. Can you ...","[Hi, I'd like to schedule an appointment with ..."


### Question 1: Use a large language model of your choice to analyse the customer side of the transcript only:

* Identify the sentiment (positive, negative, neutral) of the call
* Determine call outcome (issue resolved, follow-up action needed)

In [12]:

def get_prompt(transcript) -> str:
    """
    Function to return the prompt with transcript

    The transcript is interpolated as-is (via the f-string) between double
    quotes, followed by directives asking for a "Sentiment:" line and a
    "Call Outcome:" line, and one few-shot example with its expected response.

    Args:
        transcript: Customer side of the conversation. The visible caller
            passes the list of member utterances of one transcript, so the
            list's string representation ends up in the prompt.

    Returns:
        The complete prompt text to send to the LLM.
    """

    # NOTE(review): the prompt text contains the typo "provive" and only asks
    # for Sentiment and Call Outcome (no Explanation line). It is left
    # untouched here because every character is sent to the model.
    prompt = f"""
    You are a sentiment analysis and call outcome expert. You have to analyse the customer support conversation, but you would only be provided with the customer side of conversation. Kindly analyze it and provive the results. 
    Customer has said/replied with the following statements :

    "{transcript}"

    ### Directives
    1. Identify the sentiment of this conversation (positive, negative, or neutral).
    2. Determine the outcome of the call: was the issue resolved, or does the customer need follow-up action?
    3. Provide your output in the following format:
        Sentiment: <sentiment>
        Call Outcome: <issue resolved/follow-up action needed>
    
    ### Fewshot Example :
    'Example 1': ["Hi, I'm calling about a denied claim I received for my recent medical service. I was told that my policy doesn't cover it, but I'm certain it should be covered under my new policy. My member ID is MEM123456.",
                'The claim number is CLM789012, and the date of service was February 10th.',
                "I switched policies on January 1st. I was told that the new policy would cover the service I received, but the denial letter says it's not covered.",
                "That's frustrating. How can we get this resolved?",
                "No, that's all. Thank you for your help.",
                'Thank you.'],
    'Response 1': "Sentiment: Positive \n Call Outcome: Follow-up action needed"
    """
    
    return prompt


In [16]:
# Analyse the member side of each transcript with the LLM and store the output.

# Half-open range of df_transcripts row positions handled by this run.
ROW_START, ROW_STOP = 160, 200
# Results are flushed to disk whenever the row position is a multiple of this.
CHECKPOINT_EVERY = 3
RESULT_COLUMNS = ["File", "LLM_Output", "Sentiment", "Call_Outcome", "Explanation"]
RESULT_CSV = f"../test/LLM_output_{ROW_START}_{ROW_STOP - 1}.csv"


def _extract_field(lines, label, fallback_index):
    """Return the text following ``label:`` on the first line that carries it.

    When no line is labelled, fall back to the line at ``fallback_index`` (the
    position the reply format is expected to use); return "" if that line does
    not exist either.
    """
    pattern = re.compile(rf"{re.escape(label)}\s*:\s*(.*)", re.IGNORECASE)
    for line in lines:
        match = pattern.search(line)
        if match:
            return match.group(1).strip()
    if fallback_index < len(lines):
        return lines[fallback_index].strip()
    return ""


def _parse_llm_output(text):
    """Split a raw LLM reply into (sentiment, call_outcome, explanation)."""
    lines = text.split('\n')
    return (
        _extract_field(lines, "Sentiment", 0),
        _extract_field(lines, "Call Outcome", 1),
        _extract_field(lines, "Explanation", 3),
    )


def _save_results(rows):
    """Write the rows collected so far to RESULT_CSV and return them as a dataframe."""
    results = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    results.to_csv(RESULT_CSV, index=False)
    return results


output = None
output_list = []

# Clamp the upper bound so a shorter dataframe does not raise on .iloc.
for i in range(ROW_START, min(ROW_STOP, len(df_transcripts))):
    file_name = df_transcripts['File'].iloc[i]
    member_conv = df_transcripts['Member'].iloc[i]
    prompt = get_prompt(member_conv)
    try:
        # A fresh chat session per transcript: inside one session the earlier
        # prompts and replies stay in the chat history, so they would leak
        # into the analysis of later transcripts.
        with model.chat_session():
            output = model.generate(prompt, max_tokens=256, temp=0.5)
    except Exception as exc:
        # Exception (not a bare except) so a kernel interrupt still stops the run.
        print("LLM call unsuccessful", "-", file_name, "-", exc)
        output = None

    if output is None:
        output_list.append([file_name, "LLM call unsuccessful", "", "", ""])
    else:
        # Always record exactly one row per transcript, even for an empty reply.
        sentiment, call_outcome, explanation = _parse_llm_output(output)
        output_list.append([file_name, output, sentiment, call_outcome, explanation])

    # Periodic checkpoint so a long run can be resumed or inspected midway.
    if i % CHECKPOINT_EVERY == 0:
        result_df = _save_results(output_list)

# Store output again once complete
result_df = _save_results(output_list)
