### This is just some example code showcasing how this can be done

In [None]:
# Importing necessary libraries
import difflib
import json
import os
import random
from collections import Counter

import openai
import pandas as pd


In [None]:
# Set up the OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into (and shared with) the notebook; the placeholder below is only a
# fallback for when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_API_KEY')

# Example survey records (swap in your real survey data).
# Every record holds the answer, the question it belongs to, and two free-text
# context fields.
survey_data = [
    dict(
        response="The service was great, but the wait time was too long.",
        question="Why did you give the above rating?",
        context="Customer service feedback",
        business_context="Survey about customer satisfaction",
    ),
    dict(
        response="The product was okay, but it could have been more affordable.",
        question="What was your overall experience?",
        context="Product experience feedback",
        business_context="Survey about product quality",
    ),
    # Add more responses here as needed
]

# Load the records into a pandas DataFrame, one row per response
df = pd.DataFrame.from_records(survey_data)

# Define a function to create the LLM prompt for assigning categories
def create_llm_prompt(response, question, context, business_context):
    """Build the prompt that asks the LLM to label one survey response.

    Args:
        response: The respondent's free-text answer.
        question: The survey question the answer belongs to.
        context: Short description of what the feedback is about.
        business_context: Description of the survey's purpose.

    Returns:
        The prompt text with the four values interpolated verbatim.
    """
    # The example categories are part of what the model sees, so they must be
    # spelled correctly -- a typo here can be echoed back as a category label.
    return f"""
    You are an AI assistant helping with a survey analysis. Below is a response to a survey question:
    
    Question: {question}
    Response: {response}
    Context: {context}
    Business Context: {business_context}

    Please assign a category that best describes the response. Provide the category as a single word or short phrase.
    Some examples of categories are: Good Customer Service, Long Wait Time, Low Stock, Bad Product Quality, etc.
    """

# Function to get category from LLM
def get_category_from_llm(response, question, context, business_context):
    """Ask the LLM for a category label for a single survey response.

    The four arguments are forwarded unchanged to ``create_llm_prompt``. The
    text of the first returned choice is returned with surrounding whitespace
    removed.
    """
    # NOTE(review): `openai.Completion` with "text-davinci-003" looks like the
    # legacy (pre-1.0) SDK interface and a retired model -- confirm against the
    # installed openai version before running.
    completion = openai.Completion.create(
        prompt=create_llm_prompt(response, question, context, business_context),
        model="text-davinci-003",  # or use any LLM of your choice
        temperature=0.5,  # Control randomness
        max_tokens=60,  # Adjust based on your needs
    )

    # Only the first choice is used
    first_choice = completion.choices[0]
    return first_choice.text.strip()

# Apply the categorization to a random sample of at most 1000 responses.
# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling without replacement), so the sample size is capped at len(df).
df = df.sample(n=min(1000, len(df)))

# One LLM call per sampled row; the returned label is stored in 'category'
df['category'] = df.apply(
    lambda row: get_category_from_llm(
        row['response'], row['question'], row['context'], row['business_context']
    ),
    axis=1
)

# Display the results
print(df[['response', 'category']])


## This shows one way to combine categories; choose whatever approach works best for your use case

In [None]:
# Example category labels such as the LLM might return (swap in your real ones)
categories = [
    "excellent",
    "great",
    "good",
    "poor",
    "bad",
    "adequate",
    "good service",
    "poor customer service",
    "great product",
    "excellent experience",
]

# Hold the raw labels in a one-column DataFrame
df_categories = pd.DataFrame({'category': categories})

# Step 1: normalise each label -- lowercase first, then trim surrounding whitespace
df_categories['cleaned_category'] = (
    df_categories['category']
    .str.lower()
    .str.strip()
)

# Step 2: Use string similarity to merge similar categories
# difflib's get_close_matches is used to find near-duplicate category labels
def merge_similar_categories(df, cutoff=0.7, max_matches=5):
    """Collapse near-duplicate labels in ``cleaned_category`` onto one label.

    Categories are visited in order of first appearance. A category that has
    not yet been assigned becomes the representative of a new group, and every
    still-unassigned close match is folded into that group. Assignments are
    never overwritten, so two similar labels always end up on the same
    representative (the one seen first) instead of being swapped with each
    other.

    Args:
        df: DataFrame with a ``cleaned_category`` column. It is modified in
            place: a ``merged_category`` column is added.
        cutoff: Minimum difflib similarity ratio (0-1) for two labels to be
            treated as the same category.
        max_matches: Maximum number of close matches considered per label.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Missing values cannot be compared by difflib; leave them out of the
    # matching (they pass through unchanged below).
    unique_categories = df['cleaned_category'].dropna().unique().tolist()

    # Maps every category to its group representative (representatives map
    # to themselves).
    category_map = {}

    for cat in unique_categories:
        if cat in category_map:
            # Already folded into an earlier group
            continue
        category_map[cat] = cat

        close_matches = difflib.get_close_matches(
            cat, unique_categories, n=max_matches, cutoff=cutoff
        )
        for match in close_matches:
            # setdefault keeps earlier assignments, so a label is never
            # re-pointed once it belongs to a group
            category_map.setdefault(match, cat)

    # Apply the category mappings to the cleaned categories
    df['merged_category'] = df['cleaned_category'].apply(lambda x: category_map.get(x, x))

    return df

# Merge similar categories based on similarity.
# merge_similar_categories adds the 'merged_category' column to the frame it is
# given and returns that same frame, so df_merged and df_categories refer to
# one object.
df_merged = merge_similar_categories(df_categories)

# Step 3: Apply custom rules to combine categories (e.g., excellent & great => good, poor & bad => bad)
def apply_custom_rules(df):
    """Map each value of ``merged_category`` to a final category.

    A fixed rule table lists, for every final category, the merged categories
    that belong to it. Values that appear in no rule are kept as they are.
    The result is written to a new ``final_category`` column on ``df`` itself,
    and that same DataFrame is returned.
    """
    # final category -> merged categories folded into it
    rules = {
        'good': ['excellent', 'great', 'good service'],
        'bad': ['poor', 'bad', 'poor customer service'],
        'adequate': ['adequate'],
        'great product': ['great product'],
        'excellent experience': ['excellent experience']
    }

    # Flip the table into a member -> final category lookup
    lookup = {}
    for final_name, members in rules.items():
        for member in members:
            lookup[member] = final_name

    def to_final(label):
        # Labels without a rule pass through unchanged
        return lookup.get(label, label)

    df['final_category'] = df['merged_category'].apply(to_final)
    return df

# Apply custom rules to the merged categories.
# apply_custom_rules writes 'final_category' onto the frame it receives and
# returns that same frame, so df_final is not a separate copy of df_merged.
df_final = apply_custom_rules(df_merged)

# Step 4: Get unique categories after merging and applying rules
unique_final_categories = df_final['final_category'].unique()

# Show the original labels alongside the merged and final results
print("Original Categories:")
print(df_categories['category'].tolist())

print("\nCleaned and Merged Categories:")
# drop_duplicates leaves one row per distinct (category, merged, final) combination
print(df_final[['category', 'merged_category', 'final_category']].drop_duplicates())

print("\nFinal Unique Categories:")
print(unique_final_categories)


## Assuming we now have a unique list of categories

In [None]:
# NOTE: `df_final` from the previous steps is assumed to hold the cleaned, final categories

# Stand-in free-text responses (use your actual responses in a real use case).
# The five samples are repeated 100 times to imitate a larger dataset.
responses = 100 * [
    "The product was fantastic, I had a great experience!",
    "The service was poor, I would not recommend this to anyone.",
    "I had an adequate experience, nothing special but it was fine.",
    "Great customer service, the staff was really friendly and helpful.",
    "The product was bad, I was really disappointed.",
]

# Final category labels the responses will be assigned to
final_categories = [
    'good',
    'bad',
    'adequate',
    'great product',
    'excellent experience',
]

# Create prompt for LLM using cleaned categories
def create_prompt(responses, final_categories):
    """Build one prompt asking the LLM to categorise a batch of responses.

    The prompt has three parts: a task description, a bulleted list of the
    allowed categories, and one numbered entry per response (numbering starts
    at 1) followed by a blank ``Category:`` line for the model to fill in.

    Args:
        responses: Iterable of response texts.
        final_categories: Iterable of category labels to choose from.

    Returns:
        The assembled prompt string.
    """
    header = "We have a list of responses and a list of categories. Your task is to assign each response to a category from the list provided.\n\n"

    # Bulleted category list
    bullet_lines = "\n".join(f"- {category}" for category in final_categories)
    categories_section = f"Categories:\n{bullet_lines}\n\n"

    # One numbered entry per response
    response_sections = [
        f"Response {number}: {text}\nCategory: _______\n\n"
        for number, text in enumerate(responses, start=1)
    ]

    return header + categories_section + "".join(response_sections)

# Example prompt creation for the first 5 responses
# (only this five-response slice is formatted into the prompt, not the full list)
prompt = create_prompt(responses[:5], final_categories)
print(prompt)

# Simulate running the prompt through an LLM (this is just a simulation for illustration)
def simulate_llm_response(prompt, n_responses=None, categories=None):
    """Return randomly chosen categories in place of a real LLM call.

    Args:
        prompt: The prompt that would be sent to the model. It is accepted for
            interface parity with a real call but is not inspected.
        n_responses: How many categories to generate. Defaults to the length
            of the module-level ``responses`` list.
        categories: Labels to draw from. Defaults to the module-level
            ``final_categories`` list.

    Returns:
        A list of ``n_responses`` labels, each picked uniformly at random
        from ``categories``.
    """
    # The globals are only read when the caller did not supply a value, so the
    # function can also be used (and tested) on its own.
    if categories is None:
        categories = final_categories
    if n_responses is None:
        n_responses = len(responses)

    # In practice, call the API here and parse one category per response
    return [random.choice(categories) for _ in range(n_responses)]

# Simulate LLM response for the full dataset.
# The result is stored further down as one category per row of `responses`.
llm_responses = simulate_llm_response(prompt)

# Combine categories based on their count
def combine_categories_by_count(df, threshold=5, column=None):
    """Fold rarely used categories into a single 'other' bucket.

    Args:
        df: DataFrame holding a category column. It is modified in place.
        threshold: Categories that occur fewer than this many times are
            replaced by 'other'.
        column: Name of the category column. When omitted, 'final_category'
            is used if the frame has it, otherwise 'assigned_category', so
            both the cleaned-category frame and the frame of LLM-assigned
            categories can be passed without extra arguments.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If ``column`` is omitted and the frame has neither
            'final_category' nor 'assigned_category', or if the named column
            does not exist.
    """
    if column is None:
        candidates = ('final_category', 'assigned_category')
        column = next((name for name in candidates if name in df.columns), None)
        if column is None:
            raise KeyError(
                "expected a 'final_category' or 'assigned_category' column; "
                "pass column=... to name the category column explicitly"
            )

    # Get the counts of each category
    category_counts = df[column].value_counts()

    # Categories below the threshold; a set keeps the per-row membership test cheap
    rare_categories = set(category_counts[category_counts < threshold].index)

    # Map those categories to a broader category ('other')
    df[column] = df[column].apply(lambda x: 'other' if x in rare_categories else x)

    return df

# Adding LLM responses to the dataframe:
# one row per entry of `responses`, with the simulated label stored next to it
# in 'assigned_category' (llm_responses must therefore have one entry per row)
df_full_responses = pd.DataFrame(responses, columns=['response'])
df_full_responses['assigned_category'] = llm_responses

# Combine categories with low counts (for example, categories with less than 5 occurrences)
df_combined = combine_categories_by_count(df_full_responses)

# Final unique categories after combining
final_combined_categories = df_combined['assigned_category'].unique()

print("\nFinal Combined Categories (after applying count-based merging):")
print(final_combined_categories)
