# Code to help compare the labels for all the various options for each question

In [None]:
import os, glob
import pandas as pd

from transformers import pipeline
import torch

In [None]:
# Root name of the cleaned response set to process.  It selects the input
# workbooks (via the glob pattern below) and names the output workbook.
froot = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M0_IF_Reflection_Questions_cleaned"
# froot = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Reflection_Question_cleaned"
# froot = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Reflection_Question_cleaned"

embedders = ['UAE', 'bge', 'jina']
labellers = ['zephyr','tinyllama']

# Flat list of every label read below (all embedders and labellers), in read order.
labels_list = []

# Read everything first: one comparison table per embedder, one column per labeller.
comparison_tables = {}
for embedder in embedders:
    labels = pd.DataFrame()
    for labeller in labellers:
        pattern = os.path.join('tables', '*' + froot + '*' + embedder + '*clusters*' + labeller + '*MC.xlsx')
        # glob returns matches in arbitrary order; sort so that, if several files
        # match, the one that ends up in the column is the same on every run.
        for filename in sorted(glob.glob(pattern)):
            print(filename)
            # read_excel opens the workbook itself; no separate open() is needed
            df = pd.read_excel(filename, sheet_name = 'labels_map')
            cluster_labels = df['label'].to_list()
            labels[labeller] = cluster_labels
            labels_list.extend(cluster_labels)
    # only embedders with at least one matching file get a sheet
    if len(labels.columns) > 0:
        comparison_tables[embedder] = labels

if not comparison_tables:
    # A workbook with no sheets cannot be saved, so fail with a clear message
    # instead of an opaque error when the writer closes.
    raise FileNotFoundError(
        "No label workbooks matching '" + froot + "' were found in 'tables'"
    )

# create an Excel file and add one sheet per embedder (each sheet written once)
with pd.ExcelWriter(os.path.join('tables', froot + '_label_comparison_MC.xlsx'), engine='openpyxl') as writer:
    for embedder, labels in comparison_tables.items():
        labels.to_excel(writer, sheet_name = embedder, index = False)

## Try to use an LLM to combine/summarize all these labels?

In [None]:
# adapted from latent-scope
class transformers_chat_provider():
    """Small wrapper around a Hugging Face ``text-generation`` pipeline for chat use.

    Attributes:
        name: Model identifier passed to ``pipeline`` as ``model``.
        params: Caller-supplied settings; stored as-is and not read by this class.
        pipe: The ``text-generation`` pipeline built for ``name``.
        encoder: The pipeline's tokenizer.
    """

    # Marker used as a fallback when the prompt cannot be stripped from the output.
    _ASSISTANT_MARKER = "<|assistant|>"

    def __init__(self, name, params):
        """Build the pipeline for model ``name`` and keep ``params`` for reference."""
        self.name = name
        self.params = params
        self.pipe = pipeline("text-generation", model=self.name, torch_dtype=torch.float16, device_map="auto")
        self.encoder = self.pipe.tokenizer

    def chat(self, messages, max_new_tokens=24):
        """Generate the assistant reply for a list of chat messages.

        Args:
            messages: List of ``{"role": ..., "content": ...}`` dicts, rendered
                into a prompt with the tokenizer's chat template.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated reply with surrounding whitespace stripped.
        """
        prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.pipe(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated_text = outputs[0]["generated_text"]
        print("GENERATED TEXT", generated_text)

        # The pipeline output echoes the prompt followed by the completion.
        # Dropping the prompt prefix works for any chat template, and is not
        # confused by assistant turns already present in the message history.
        if generated_text.startswith(prompt):
            return generated_text[len(prompt):].strip()

        # Fallback: split on the literal assistant marker when it is present.
        marker = self._ASSISTANT_MARKER
        if marker in generated_text:
            return generated_text.split(marker)[1].strip()
        return generated_text.strip()

In [None]:
# Prompt text that introduces the "[item]"-prefixed list of labels.
instructions_before = (
    "Below is a list of items each starting with [item].  "
    "Some items may be very similar, while others are different."
)

# Prompt text that follows the list and asks for a summary of up to 10 themes.
instructions_after = (
    "That was the last item in the list.  "
    "Now summarize these items with a new list of up to 10 themes.  "
    "Your themes should describe all the unique ideas from the items above.  "
    "Each theme should be only one sentence.  "
    "You can return less than 10 themes if there are less than 10 unique ideas.  "
    "Only return the themes."
)

In [None]:

# currently this seems to at least be providing output using TinyLlama.  Didn't wait long enough for zephyr.

# Alternative model (swap with the two lines below).  The response is written as
# plain text, so the output path uses a .txt extension here as well.
# model = transformers_chat_provider('HuggingFaceH4/zephyr-7b-beta', {"max_tokens": 2048})
# ofile = os.path.join('tables', froot + '_label_summary_zephyr_MC.txt')

model = transformers_chat_provider('TinyLlama/TinyLlama-1.1B-Chat-v1.0', {"max_tokens": 2048})
ofile = os.path.join('tables', froot + '_label_summary_tinyllama_MC.txt')

# One "[item] <label>" line per collected label, built in a single join.
input_list = ''.join('[item] ' + label + '\n' for label in labels_list)

# The item list is sandwiched between the two instruction messages.
messages = [
    {"role": "system", "content": instructions_before},
    {"role": "user", "content": input_list},
    {"role": "system", "content": instructions_after},
]
response = model.chat(messages, max_new_tokens = 1000)

# Write as UTF-8 explicitly: generated text may contain non-ASCII characters,
# which the platform default encoding cannot always represent.
with open(ofile, 'w', encoding='utf-8') as f:
    f.write(response)