# Comparing the labels for all the various options for each question

In [None]:
import os, glob
import pandas as pd

from transformers import pipeline
import torch

In [None]:
# File roots of the cleaned reflection-question response files, one for each
# of M0 through M5.  Select a single one by indexing, e.g. froots[0].
# Only the M0 root is spelled with the plural "Questions".
froots = [
    "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M" + str(number)
    + "_IF_Reflection_" + ("Questions" if number == 0 else "Question")
    + "_cleaned"
    for number in range(6)
]

## Create an Excel file that compares the labels for each embedder

In [None]:
embedders = ['UAE', 'bge', 'jina']
labellers = ['zephyr','tinyllama']

# For every file root, write one comparison workbook with one sheet per
# embedder; each sheet has one column per labeller, filled from the 'label'
# column of that labeller's 'labels_map' sheet.
for froot in froots:
    workbook_path = os.path.join('tables', froot + '_label_comparison_MC.xlsx')
    with pd.ExcelWriter(workbook_path, engine='openpyxl') as writer:
        for embedder in embedders:
            labels = pd.DataFrame()
            for labeller in labellers:
                pattern = os.path.join('tables', '*' + froot + '*' + embedder + '*clusters*' + labeller + '*MC.xlsx')
                for filename in glob.glob(pattern):
                    print(filename)
                    # read_excel opens the workbook from its path itself, so no
                    # separate (text-mode) file handle is needed here.
                    df = pd.read_excel(filename, sheet_name='labels_map')
                    labels[labeller] = df['label'].to_list()
            # Write the sheet once per embedder, and only when at least one
            # labeller file was found (as before, no match means no sheet).
            if len(labels.columns) > 0:
                labels.to_excel(writer, sheet_name=embedder, index=False)

## Try to use an LLM to combine/summarize all these labels?

In [None]:
# adapted from latent-scope
class transformers_chat_provider():
    """Small wrapper around a Hugging Face ``text-generation`` pipeline for chat prompts.

    Attributes:
        name: model identifier passed to ``pipeline(model=...)``.
        params: caller-supplied settings; stored but not read by this class.
        pipe: the text-generation pipeline built in ``__init__``.
        encoder: the pipeline's tokenizer (same object as ``pipe.tokenizer``).
    """

    # Literal marker that ``chat`` looks for to find the start of the reply.
    ASSISTANT_MARKER = "<|assistant|>"

    def __init__(self, name, params):
        """Build the pipeline for model ``name`` (float16, ``device_map="auto"``)."""
        self.name = name
        self.params = params
        self.pipe = pipeline("text-generation", model=self.name, torch_dtype=torch.float16, device_map="auto")
        self.encoder = self.pipe.tokenizer

    def chat(self, messages, max_new_tokens=24):
        """Generate a reply to ``messages`` and return it with surrounding whitespace stripped.

        Args:
            messages: list of ``{"role": ..., "content": ...}`` dicts, rendered
                with the tokenizer's chat template.
            max_new_tokens: generation budget forwarded to the pipeline.

        Returns:
            The text that follows the first ``<|assistant|>`` marker (up to the
            next marker, if any).  When the rendered text contains no such
            marker, the text after the prompt is returned instead of raising
            ``IndexError``.
        """
        prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Sampling parameters (temperature=0.5, top_p=0.95, top_k=50) are left
        # commented out, so the pipeline's defaults apply.
        outputs = self.pipe(prompt, max_new_tokens=max_new_tokens, do_sample=True)
        generated_text = outputs[0]["generated_text"]
        print("GENERATED TEXT", generated_text)

        if self.ASSISTANT_MARKER in generated_text:
            return generated_text.split(self.ASSISTANT_MARKER)[1].strip()
        # Chat templates without the marker: drop the echoed prompt, if present.
        if generated_text.startswith(prompt):
            return generated_text[len(prompt):].strip()
        return generated_text.strip()

In [None]:
froot = froots[5]

# embedders = ['UAE', 'bge', 'jina']
embedders = ['UAE', 'bge'] #remove jina
labellers = ['zephyr','tinyllama']

# Gather the 'label' column of every matching workbook (all embedders and
# labellers for this file root) into one flat list.
labels_list = []
for embedder in embedders:
    for labeller in labellers:
        pattern = os.path.join('tables', '*' + froot + '*' + embedder + '*clusters*' + labeller + '*MC.xlsx')
        for filename in glob.glob(pattern):
            print(filename)
            # read_excel opens the workbook from its path itself; no separate
            # file handle is needed.
            df = pd.read_excel(filename, sheet_name='labels_map')
            labels_list.extend(df['label'].to_list())


In [None]:
# Prompt text sent ahead of the item list (sentences separated by one space).
instructions_before = " ".join([
    "Below is a list of items each starting with [item].",
    "I will want you to summarize this list.",
])

# Prompt text sent after the item list (sentences separated by two spaces).
instructions_after = "  ".join([
    "That was the last item in the list.",
    "Now summarize these items with a new list of up to 10 themes.",
    "Your themes should describe all the unique ideas from the items above.",
    "Do not repeat any item from above verbatim in your themes.",
    "Each theme should be only one short sentence.",
    "Only return the short one-sentence themes.",
])

In [None]:
# define the model and output file (pick one)

model = transformers_chat_provider('HuggingFaceH4/zephyr-7b-beta', {"max_tokens": 2048})
ofile = os.path.join('tables', froot + '_label_summary_zephyr_MC.txt')

# model = transformers_chat_provider('TinyLlama/TinyLlama-1.1B-Chat-v1.0', {"max_tokens": 2048})
# ofile = os.path.join('tables', froot + '_label_summary_tinyllama_MC.txt')

# Build the message for the LLM: every label goes on its own line, prefixed
# with "[item] ", and is sandwiched between the two instruction texts.
input_list = ''.join('[item] ' + label + '\n' for label in labels_list)
messages = [
    dict(role="system", content=instructions_before),
    dict(role="user", content=input_list),
    dict(role="system", content=instructions_after),
]

# Ask the model for the summary and store its reply as plain text.
response = model.chat(messages, max_new_tokens=1000)
with open(ofile, 'w') as f:
    f.write(response)

## Compare the summary themes

In [None]:
froot = froots[3]

# Print the saved theme summary of each labeller, one after the other, with
# an empty line between them.
summaries = [('tinyLlama:', 'tinyllama'), ('zephyr:', 'zephyr')]
for position, (heading, tag) in enumerate(summaries):
    if position > 0:
        print('')
    print(heading)
    summary_path = os.path.join('tables', froot + '_label_summary_' + tag + '_MC.txt')
    with open(summary_path, 'r') as f:
        print(f.read())

## (Later) Re-label the clusters ?

I considered updating the latent-scope code to remove the temperature, top_p, and top_k settings and re-running all the LLM labels.  I did this for tinyllama, and it did not improve the labels dramatically.  I started doing the same for zephyr (which takes MUCH longer) and also did not see much improvement, so I will not do this.  (The file created below with the cluster parameters is still helpful, though.)

### Create a file with the cluster parameters

(only need to run this once)

In [None]:
# Embedder tag (looked up in the suffix of dataset_id) ->
# (latent-scope embedding model id, embedding size).
# NOTE(review): the jina entry used to be recorded as 1024, which looked like
# a copy-paste from the two large models; the jina-embeddings-v2-small-en
# model card lists 512-dimensional embeddings.  Confirm nothing downstream
# relied on the old value.
embedding_models = {
    'UAE': ("transformers-WhereIsAI___UAE-Large-V1", 1024),
    'bge': ("transformers-BAAI___bge-large-en-v1.5", 1024),
    'jina': ("transformers-jinaai___jina-embeddings-v2-small-en", 512),
}

# Column name -> list of values; becomes the rows of best_clusters.csv.
out = {'fname':[], 'dataset_id':[], 'embedding_model_id':[], 'embedding_n_dimensions':[], 'umap_number':[], 'cluster_number':[]}
with open('tables/tmp_best_clusters.txt','r') as f:
    for line in f:
        x = line.split()
        if not x:
            # blank line: nothing to parse
            continue
        # presumably each line reads "worker.<key> = '<value>'": the first
        # token is the key and the third token the quoted value
        key = x[0].replace('worker.','')
        val = x[2].replace("'",'')
        out[key].append(val)
        if key != 'dataset_id':
            continue

        # dataset_id is "<file stem>_<embedder>": derive the csv name and
        # the embedding model from its two halves
        stem, sep, embedder = val.rpartition('_')
        out['fname'].append(stem + '.csv')

        for tag, (model_id, n_dimensions) in embedding_models.items():
            if tag in embedder:
                out['embedding_model_id'].append(model_id)
                out['embedding_n_dimensions'].append(n_dimensions)
                break
        else:
            # fail here, with context, instead of later with a column-length
            # mismatch when the DataFrame is built
            raise ValueError("unrecognized embedder '" + embedder + "' in dataset_id '" + val + "'")

df = pd.DataFrame(out)
df['embedding_number'] = '00001'
df.to_csv('tables/best_clusters.csv', index=False)

### Iterate through and re-label each dataset

In [None]:
from latentscope_helper import latentscope_helper
import os
import pandas as pd

In [None]:
# Keyword arguments for the helper: data location, text column, and the two
# prompt texts passed as chat_model_instructions_before / _after.
helper_settings = {
    "latent_scope_dir": "../../latent-scope_data",
    "text_column": "student_responses",
    "chat_model_instructions_before": "Below is a list of items each starting with [item].  Each item is a response from a different person to a survey. These items all have a similar theme.  The list begins below.",
    "chat_model_instructions_after": "That was the last item in the list.  Now return a concise label for the items in this list that describes the theme.  This label should not be fully verbatim text from any individual item.  Your label should contain no more than 10 words.",
}
worker = latentscope_helper(**helper_settings)

# Flags set to False: output suppression, old-file removal, and the
# embedding / umap / cluster run switches.
for flag_off in (
    'suppress_latentscope_output',
    'suppress_helper_output',
    'remove_old_files',
    'run_embedding',
    'run_umap',
    'run_cluster',
):
    setattr(worker, flag_off, False)

# Flags set to True: run the labelling step and save the scope.
for flag_on in ('run_label', 'save_scope'):
    setattr(worker, flag_on, True)

# choose one of the following

worker.chat_model_id = 'transformers-HuggingFaceH4___zephyr-7b-beta'
worker.chat_file_label = 'zephyr'
worker.scope_number = '00003'
worker.label_number = '00003'

# worker.chat_model_id = 'transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0'
# worker.chat_file_label = 'tinyllama'
# worker.scope_number = '00004'
# worker.label_number = '00004'

In [None]:
df = pd.read_csv('tables/best_clusters.csv', dtype=str)
for index, row in df.iterrows():
    # Only the entry at row index 2 is processed; every other row is skipped.
    if index != 2:
        continue
    print(row['dataset_id'])

    data = pd.read_csv("../../data/" + row['fname'])

    # Point the worker at this dataset and its stored numbering.
    worker.data = data
    for setting in ('dataset_id', 'embedding_number', 'umap_number', 'cluster_number'):
        setattr(worker, setting, row[setting])

    worker.initialize_files_and_numbering()
    worker.initialize_latentscope()
    worker.run_latentscope()

    # Save the bar chart under plots/ ...
    chart_name = worker.dataset_id + '_' + worker.chat_file_label + '_bar_MC.png'
    f, ax = worker.create_bar_chart(filename=os.path.join('plots', chart_name))

    # ... and the workbook under tables/, built from a copy of the raw data.
    data_raw = data.copy()
    workbook_name = worker.dataset_id + '_clusters_' + worker.chat_file_label + '_MC.xlsx'
    worker.create_excel_workbook(data_raw, os.path.join('tables', workbook_name))


### Checking difference between default and updated parameters

(For the ones I checked, the labels don't look different enough to justify this change or to spend the time running zephyr on all of the datasets.  Aborting.)

In [None]:
embedder = 'UAE'
labeller = 'tinyllama' #zephyr
froot = froots[5]

# Same workbook name pattern in both folders: 'tables' holds the re-labelled
# version, 'tables/latentscope_defaults' the one made with default settings.
pattern = '*' + froot + '*' + embedder + '*clusters*' + labeller + '*MC.xlsx'

# Start from empty lists so that a missing workbook cannot silently reuse
# labels left over from an earlier run of this cell (or raise NameError).
labels_new = []
labels_default = []

# again, this does not need to be a for loop since each glob should only pick up one file
for filename in glob.glob(os.path.join('tables', pattern)):
    print(filename)
    # read_excel opens the workbook from its path itself; no separate file
    # handle is needed.
    df = pd.read_excel(filename, sheet_name='labels_map')
    labels_new = df['label'].to_list()

for filename in glob.glob(os.path.join('tables', 'latentscope_defaults', pattern)):
    print(filename)
    df = pd.read_excel(filename, sheet_name='labels_map')
    labels_default = df['label'].to_list()

if not labels_new or not labels_default:
    print('no labels to compare for pattern:', pattern)

# Print the two label sets side by side, pair by pair.
for x,y in zip(labels_new, labels_default):
    print()
    print('new:',x)
    print('old:',y)
