# Testing Bennett's Data

`INCLU1x IF Responses - ALL RUNS 041924.xlsx`

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
# nltk.download('punkt')  # Download the punkt tokenizer if you haven't already

from sklearn.preprocessing import minmax_scale

In [None]:
%load_ext autoreload
%autoreload 2

from latentscope_helper import latentscope_helper

## Read in and clean the data

The cells in this section only need to be run once.  (If rerunning this notebook, you can skip ahead to the next section.)

In [None]:
# Set this to True to regenerate the cleaned data files from the original Excel workbook.
# (Splitting the responses into sentences takes some time, so it is done once and the
# saved files are reused later.)
read_in_original_data_file = False

In [None]:
# Function to split text into sentences
def split_into_sentences(text):
    """Return the sentences found in ``text``.

    Sentence boundaries are detected with ``nltk.sent_tokenize``; its result is
    returned unchanged.
    """
    return nltk.sent_tokenize(text)

In [None]:
if read_in_original_data_file:
    # Prefix shared by every output file written by this cell
    out_prefix = "../../data/INCLU1x_IF_Responses_-_ALL_RUNS_041924_"

    # Responses (and, later, sentences) with n_min words or fewer are dropped
    # (otherwise the sentence finder might choke -- not sure why)
    n_min = 5

    # read in the data
    df = pd.read_excel("../../data/INCLU1x IF Responses - ALL RUNS 041924.xlsx")
    print(f"length of original DataFrame = {len(df)}")

    # only take the columns we need and rename them to remove spaces and special characters
    data_tmp = df[['ID#', 'Course Run', 'Student Response', 'Problem Code']].rename(
        columns={
            'ID#': 'ID',
            'Course Run': 'course_run',
            'Student Response': 'student_responses',
            'Problem Code': 'question_code',
        }
    )

    # Replace newlines with spaces and strip a boilerplate phrase that appears a lot.
    # Both patterns are literal text, so regex matching is switched off explicitly.
    data_tmp['student_responses'] = (
        data_tmp['student_responses']
        .str.replace('\n', ' ', regex=False)
        .str.replace('Inclusive teaching is important to me because', '', regex=False)
    )

    # remove rows with short answers (non-text responses evaluate to False here and are dropped too)
    data_tmp = data_tmp[data_tmp['student_responses'].str.split().str.len().gt(n_min)]

    # get the unique questions, and save these to individual files
    # (rows without a question code cannot be assigned to a file, so they are skipped)
    question_codes = data_tmp['question_code'].dropna().unique()

    for qc in question_codes:

        print(qc)

        # file-name-safe version of the question code
        qc_tag = str(qc).replace(' ', '_')

        data_use = data_tmp.loc[data_tmp['question_code'] == qc]

        # save to .csv file
        data_use.to_csv(out_prefix + qc_tag + "_cleaned.csv", index=False)

        # Split every response into sentences and give each sentence its own row.
        # explode() copies the other columns onto each new row in a single pass,
        # instead of growing a DataFrame one row at a time (quadratic, and it relied
        # on the private DataFrame._append method).
        data = data_use.assign(
            student_responses=data_use['student_responses'].map(split_into_sentences)
        ).explode('student_responses', ignore_index=True)

        # remove rows with short answers (again); a response that produced no sentences
        # shows up as a missing value after explode() and is removed by this filter as well
        data = data[data['student_responses'].str.split().str.len().gt(n_min)]

        print(f"length of new DataFrame (after cleaning and sentence splitting) = {len(data)}")

        # Save the new DataFrame to a new file so it can be reused without re-splitting
        data.to_csv(out_prefix + qc_tag + "_cleaned_split_into_sentences.csv", index=False)


## Initialize and run `latent-scope` using my Class

In [None]:
# Instructions given to the LLM before and after the list of cluster items
instructions_before = "Below is a list of items each starting with [item].  Each item is a response from a different person to a survey. These items all have a similar theme.  The list begins below."
instructions_after = "That was the last item in the list.  Now return a concise label for the items in this list that describes the theme.  This label should not be fully verbatim text from any individual item.  Your label should contain no more than 10 words."

# helper object holding the parameters that are common across all runs below
worker = latentscope_helper(
    # directory where the latentscope files are stored
    latent_scope_dir = "../../latent-scope_data",
    # name of the response column in the data
    text_column = "student_responses",
    # True = clean the latent-scope directories and start fresh
    remove_old_files = True,
    # True = remove every file in the directories, regardless of imin and imax
    quick_clean = True,
    # lower and upper bounds of the file/scope numbers searched through when removing files
    imin = 0,
    imax = 50,
    # max length to tell the LLM to use when generating a label (not always respected by the LLM!)
    label_length = 10,
    # text provided to the LLM before / after the list of cluster items
    chat_model_instructions_before = instructions_before,
    chat_model_instructions_after = instructions_after,
)

In [None]:
# First pass: compute the embeddings only.  This cell (and the next) is meant to be re-run
# once per combination of data file and embedding model.

# enable the embedding stage and switch every other stage off
worker.run_embedding = True
worker.run_umap = worker.run_label = worker.run_cluster = worker.save_scope = False


#############
# choose one data file (leave exactly one fname line uncommented)
#############
# I am not going to split these into sentences here
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M0_IF_Reflection_Questions_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Reflection_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Reflection_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M3_IF_Reflection_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M4_IF_Reflection_Question_cleaned.csv"
fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M5_IF_Reflection_Question_cleaned.csv"

# these may need to be split into sentences, since they ask for multiple responses (check with Bennett whether he wants to analyze these at all)
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M0_IF_Application_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Application_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Application_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M3_IF_Application_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M4_IF_Application_Question_cleaned.csv"
# fname = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_M5_IF_Application_Question_cleaned.csv"

#############

# load the chosen file (written by the cleaning cell above)
data = pd.read_csv("../../data/" + fname)


#############
# choose one embedding model (leave exactly one group of four lines uncommented)
#############
# dataset_id = file name without ".csv" plus a suffix identifying the embedding model
worker.data = data
worker.dataset_id = fname.replace('.csv','') + "_UAE1024"
worker.embedding_model_id = "transformers-WhereIsAI___UAE-Large-V1"
worker.embedding_n_dimensions = 1024

# worker.data = data
# worker.dataset_id = fname.replace('.csv','') + "_bge1024"
# worker.embedding_model_id = "transformers-BAAI___bge-large-en-v1.5"
# worker.embedding_n_dimensions = 1024

# For some reason the kernel died for me on M0 (always at 51%, I also tried 256 dimensions with no luck)
# worker.data = data
# worker.dataset_id = fname.replace('.csv','') + "_jina512"
# worker.embedding_model_id = "transformers-jinaai___jina-embeddings-v2-small-en"
# worker.embedding_n_dimensions = 512  

#############



In [None]:
# Run the pipeline with the stage flags set in the previous cell (only run_embedding is
# True there), i.e. compute the embeddings.
# NOTE(review): remove_old_files and quick_clean were set to True when the worker was created,
# so initialize_files_and_numbering() presumably clears existing latent-scope files first -- confirm.
worker.initialize_files_and_numbering()
worker.initialize_latentscope()
worker.run_latentscope()

## Calculate metrics to assess the quality of this analysis

Ideally, I would want to do this for a number of runs, each changing some parameter and returning a different number of clusters.

In [None]:
# Optional cleanup, in case I need to remove files (but keep the embeddings):
# only the umap, cluster and scope directories are passed for removal.
worker.remove_old_files = True
worker.quick_clean = True
worker.initialize_files_and_numbering(dirs_to_remove = ['umaps', 'clusters', 'scopes'])

In [None]:
# Choose how the UMAP / clustering parameters are explored in the cells below:
# True runs every combination on a fixed grid; False draws random parameters (between defined limits).
rungrid = False

In [None]:
#%%capture
# %%capture magic to suppress output

if rungrid:

    # Grid sweep: every UMAP setting is run once, and each one is followed by every
    # clustering setting.  Output is silenced and the files already on disk are kept.
    worker.suppress_latentscope_output = True
    worker.suppress_helper_output = True
    worker.remove_old_files = False
    worker.initialize_files_and_numbering()

    # work from the first embedding; start with every pipeline stage switched off
    worker.embedding_number = '00001'
    for stage_flag in ('run_embedding', 'run_umap', 'run_label', 'run_cluster', 'save_scope'):
        setattr(worker, stage_flag, False)

    # values to scan
    umap_n_components = [2, 3]
    umap_n_neighbors = list(range(5, 31, 5))
    umap_min_dist = [0, 0.05, 0.1, 0.15]

    cluster_min_samples = list(range(5, 31, 5))
    cluster_selection_epsilon = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

    # all combinations, in the same order as nested loops over the lists above
    umap_grid = [
        (nc, nn, md)
        for nc in umap_n_components
        for nn in umap_n_neighbors
        for md in umap_min_dist
    ]
    cluster_grid = [
        (ms, eps)
        for ms in cluster_min_samples
        for eps in cluster_selection_epsilon
    ]

    un = 1 # starting number for umap files
    cn = 1 # starting number for the cluster files

    # one entry per cluster run: its own file number and the umap file it was built on
    cluster_file_numbers = []
    umap_file_numbers = []

    for u_nc, u_nn, u_md in umap_grid:
        print(f'umap_file_number = {un}, n_components = {u_nc}, n_neighbors = {u_nn}, min_dist = {u_md}')

        # UMAP stage only
        worker.umap_n_components = u_nc
        worker.umap_n_neighbors = u_nn
        worker.umap_min_dist = u_md
        worker.umap_number = f'{un:05d}'
        worker.run_umap, worker.run_cluster = True, False
        worker.initialize_latentscope_filenames()
        worker.run_latentscope()

        for c_ms, c_eps in cluster_grid:
            print(f'cluster_file_number = {cn}, min_samples = {c_ms}, selection_epsilon = {c_eps}')

            # clustering stage only, on the UMAP result produced just above
            worker.cluster_min_samples = c_ms
            worker.cluster_selection_epsilon = c_eps
            worker.cluster_number = f'{cn:05d}'
            worker.run_umap, worker.run_cluster = False, True
            worker.initialize_latentscope_filenames()
            worker.run_latentscope()

            cluster_file_numbers.append(cn)
            umap_file_numbers.append(un)
            cn += 1

        un += 1
        print('')

In [None]:
if not rungrid:

    # Random sweep: Ndraw_u random UMAP settings, each followed by Ndraw_c random
    # HDBSCAN settings (Ndraw_u*Ndraw_c cluster runs in total).
    # NOTE(review): no random seed is set in this cell, so the drawn parameters differ on every run.

    Ndraw_u = 10 # number of different umap parameters
    Ndraw_c = 50 # number of different HDBSCAN parameters (for each umap parameter set)

    # silence output and keep the files already on disk
    worker.suppress_latentscope_output = True
    worker.suppress_helper_output = True
    worker.remove_old_files = False
    worker.initialize_files_and_numbering()

    # work from the first embedding; start with every pipeline stage switched off
    worker.embedding_number = '00001'
    for stage_flag in ('run_embedding', 'run_umap', 'run_label', 'run_cluster', 'save_scope'):
        setattr(worker, stage_flag, False)

    # [lower, upper] limits for each parameter
    umap_n_components_limits = [2, 5]
    umap_n_neighbors_limits = [10, 100]
    umap_min_dist_limits = [0, 0.15]

    cluster_min_samples_limits = [5, 30]
    cluster_selection_epsilon_limits = [0.01, 0.1]

    def draw_int(limits):
        """Draw a random integer between limits[0] and limits[1], both inclusive."""
        return np.random.randint(low = limits[0], high = limits[1] + 1)

    def draw_float(limits):
        """Draw a random float between limits[0] and limits[1]."""
        return np.random.random()*(limits[1] - limits[0]) + limits[0]

    un = 1 # starting number for umap files
    cn = 1 # starting number for the cluster files

    # one entry per cluster run: its own file number and the umap file it was built on
    cluster_file_numbers = []
    umap_file_numbers = []

    for i in range(Ndraw_u):
        u_nc = draw_int(umap_n_components_limits)
        u_nn = draw_int(umap_n_neighbors_limits)
        u_md = draw_float(umap_min_dist_limits)

        print(f'umap_file_number = {un}, n_components = {u_nc}, n_neighbors = {u_nn}, min_dist = {u_md}')

        # UMAP stage only
        worker.umap_n_components = u_nc
        worker.umap_n_neighbors = u_nn
        worker.umap_min_dist = u_md
        worker.umap_number = f'{un:05d}'
        worker.run_umap, worker.run_cluster = True, False
        worker.initialize_latentscope_filenames()
        worker.run_latentscope()

        for j in range(Ndraw_c):
            c_ms = draw_int(cluster_min_samples_limits)
            c_eps = draw_float(cluster_selection_epsilon_limits)

            print(f'cluster_file_number = {cn}, min_samples = {c_ms}, selection_epsilon = {c_eps}')

            # clustering stage only, on the UMAP result produced just above
            worker.cluster_min_samples = c_ms
            worker.cluster_selection_epsilon = c_eps
            worker.cluster_number = f'{cn:05d}'
            worker.run_umap, worker.run_cluster = False, True
            worker.initialize_latentscope_filenames()
            worker.run_latentscope()

            cluster_file_numbers.append(cn)
            umap_file_numbers.append(un)
            cn += 1

        un += 1
        print('')

In [None]:
# Collect the quality metrics of every (umap, cluster) run from the sweep above into one
# table, save it to the 'tables' directory and display it.

metrics = []  # raw dictionaries returned by worker.calculate_metrics, one per run
records = []  # one flat row per run for the summary table

for (un, cn) in zip(umap_file_numbers, cluster_file_numbers):
    print(un, cn)
    m = worker.calculate_metrics(embedding_number = '00001', umap_number = str(un).zfill(5), cluster_number = str(cn).zfill(5))
    metrics.append(m)
    records.append({
        'umap_file_number': un,
        'cluster_file_number': cn,
        'umap_n_components': m['umap_info']['n_components'],
        'umap_n_neighbors': m['umap_info']['neighbors'],
        'umap_min_dist': m['umap_info']['min_dist'],
        'cluster_min_samples': m['cluster_info']['min_samples'],
        'cluster_selection_epsilon': m['cluster_info']['cluster_selection_epsilon'],
        'n_clusters': m['cluster_info']['n_clusters'],
        'inertia': m['inertia'],
        'silhouette_coefficient': m['silhouette_coefficient'],
        'calinski_harabasz_index': m['calinski_harabasz_index'],
        'davies_bouldin_index': m['davies_bouldin_index'],
    })

# The column list is given explicitly so the table has the same layout even when there are no runs
df = pd.DataFrame(records, columns = [
    'umap_file_number',
    'cluster_file_number',
    'umap_n_components',
    'umap_n_neighbors',
    'umap_min_dist',
    'cluster_min_samples',
    'cluster_selection_epsilon',
    'n_clusters',
    'inertia',
    'silhouette_coefficient',
    'calinski_harabasz_index',
    'davies_bouldin_index',
])

# Make sure the output directory exists, so the results of a long sweep are not lost to a missing folder
os.makedirs('tables', exist_ok = True)

if rungrid:
    df.to_csv(os.path.join('tables', worker.dataset_id + '_metrics_grid.csv'), index = False)
else:
    df.to_csv(os.path.join('tables', worker.dataset_id + '_metrics_MC.csv'), index = False)

df



In [None]:
# I'm still having trouble resetting the plotting backend given the suppression above 
import matplotlib
%matplotlib inline

In [None]:
# Read back the metrics table saved for this dataset; the file suffix depends on the sweep type
metrics_suffix = '_metrics_grid.csv' if rungrid else '_metrics_MC.csv'
dfm = pd.read_csv(os.path.join('tables', worker.dataset_id + metrics_suffix))

dfm

In [None]:
# Pair plot of every column after the first two (the umap and cluster file numbers)
cols_to_plot = list(dfm.columns[2:])
g = sns.pairplot(dfm.loc[:, cols_to_plot], corner = True, diag_kind='kde')

In [None]:
# Shortlist the runs whose metrics are all within a fraction f of the best value seen.
f = 0.15

# Single score used to rank the shortlist: each metric is rescaled to [0, 1] and summed,
# with the Davies-Bouldin index inverted because it is treated as lower-is-better.
# NOTE(review): inertia is added with a positive sign; if worker.calculate_metrics returns a
# within-cluster sum of squares (lower is better) it should be inverted like Davies-Bouldin -- confirm.
dfm['combined_metric'] = minmax_scale(dfm['inertia']) + minmax_scale(dfm['silhouette_coefficient']) + minmax_scale(dfm['calinski_harabasz_index']) + (1. - minmax_scale(dfm['davies_bouldin_index']))

# Silhouette can be negative.  (1 - f)*max then lies ABOVE the maximum and no run could pass,
# so the tolerance band is widened in the correct direction for a negative maximum.
# For a non-negative maximum this is exactly the original (1 - f)*max threshold.
best_silhouette = dfm['silhouette_coefficient'].max()
silhouette_floor = (1. - f)*best_silhouette if best_silhouette >= 0 else (1. + f)*best_silhouette

# thresholds for the other two metrics
calinski_floor = (1. - f)*dfm['calinski_harabasz_index'].max()
davies_ceiling = (1. + f)*dfm['davies_bouldin_index'].min()

dfm_best = dfm.loc[
    (dfm['n_clusters'] > 2)
    & (dfm['n_clusters'] < 30)
    & (dfm['silhouette_coefficient'] > silhouette_floor)
    & (dfm['calinski_harabasz_index'] > calinski_floor)
    & (dfm['davies_bouldin_index'] < davies_ceiling)
]

# fewest clusters first; within the same number of clusters, highest combined score first
dfm_best.sort_values(by=["n_clusters", "combined_metric"], ascending = [True, False])

In [None]:
# Grid of scatter plots: each quality metric (rows) against each parameter and the number of
# clusters (columns), coloured by n_clusters, with the shortlisted runs in dfm_best circled.
x_vars = ['umap_n_components','umap_n_neighbors','umap_min_dist','cluster_min_samples','cluster_selection_epsilon','n_clusters']
y_vars = ['silhouette_coefficient','calinski_harabasz_index','davies_bouldin_index', 'inertia']
g = sns.PairGrid(dfm, x_vars = x_vars, y_vars = y_vars, hue = 'n_clusters', palette = 'viridis')
# NOTE(review): custom_scatter (mapped below) draws the same scatter again, so the points
# appear to be plotted twice -- confirm whether this first call is still needed.
g.map(sns.scatterplot)
#g.map_diag(sns.histplot)

# I played around with the loc statement to isolate what appears to be the best clusters
# (and asked ChatGPT how to create this overlay)
# dfm_best = df.loc[(dfm['n_clusters'] > 5) & (dfm['n_clusters'] < 500) & (dfm['silhouette_coefficient'] > -0.2) & (dfm['calinski_harabasz_index'] > 6000) & (dfm['davies_bouldin_index'] < 5)]
def custom_scatter(x, y, **kwargs):
    """Scatter x against y, then overlay the rows of dfm_best as open black circles.

    x and y must carry a ``.name`` attribute; it is used to pick the matching
    columns from the dfm_best DataFrame defined in an earlier cell.
    """
    sns.scatterplot(x=x, y=y, **kwargs)
    sns.scatterplot(data=dfm_best, x=x.name, y=y.name, marker='o', color='black', edgecolor='black', facecolors='none', s=100)
g.map(custom_scatter)

# save the figure; the file name records whether the grid or the random sweep was used
if rungrid:
    g.savefig(os.path.join('plots', worker.dataset_id + '_metric_grid.png'), bbox_inches = 'tight') 
else:
    g.savefig(os.path.join('plots', worker.dataset_id + '_metric_MC.png'), bbox_inches = 'tight') 


In [None]:
# Show the shortlisted runs again: fewest clusters first, then highest combined_metric
dfm_best.sort_values(by=["n_clusters", "combined_metric"], ascending = [True, False])

In [None]:
# Display the dataset_id currently set on the worker
worker.dataset_id

## Pick one of these to create labels and save the scope

In [None]:
# turn the output back on, keep the existing files and work from the first embedding
worker.suppress_latentscope_output = False
worker.suppress_helper_output = False
worker.remove_old_files = False
worker.embedding_number = '00001'

# only the labelling and scope-saving stages run in this pass
worker.run_embedding = worker.run_umap = worker.run_cluster = False
worker.run_label = worker.save_scope = True

# I need to run the combinations (after deciding on bge and UMAP params)

###################
# choose between data sets
###################

###################
# M0
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M0_IF_Reflection_Questions_cleaned_UAE1024'
# worker.umap_number = '00001'
# worker.cluster_number = '00017'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M0_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00010'
# worker.cluster_number = '00462'

###################
# M1
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Reflection_Questions_cleaned_UAE1024'
# worker.umap_number = '00008'
# worker.cluster_number = '00355'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00001'
# worker.cluster_number = '00032'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M1_IF_Reflection_Questions_cleaned_jina512'
# worker.umap_number = '00005'
# worker.cluster_number = '00219'


###################
# M2
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Reflection_Questions_cleaned_UAE1024'
# worker.umap_number = '00006'
# worker.cluster_number = '00282'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00005'
# worker.cluster_number = '00209'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M2_IF_Reflection_Questions_cleaned_jina512'
# worker.umap_number = '00001'
# worker.cluster_number = '00008'


###################
# M3
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M3_IF_Reflection_Questions_cleaned_UAE1024'
# worker.umap_number = '00001'
# worker.cluster_number = '00038'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M3_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00001'
# worker.cluster_number = '00003'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M3_IF_Reflection_Questions_cleaned_jina512'
# worker.umap_number = '00001'
# worker.cluster_number = '00017'


###################
# M4
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M4_IF_Reflection_Questions_cleaned_UAE1024'
# worker.umap_number = '00006'
# worker.cluster_number = '00251'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M4_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00001'
# worker.cluster_number = '00018'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M4_IF_Reflection_Questions_cleaned_jina512'
# worker.umap_number = '00001'
# worker.cluster_number = '00006'


###################
# M5
# NOTE(review): the dataset_id line is commented out, so worker.dataset_id keeps whatever value
# was set earlier in the notebook; only the umap and cluster file numbers are chosen here.
# NOTE(review): the commented dataset_id values in this cell say "Reflection_Questions" (plural),
# while the M1-M5 file names offered in the embedding cell say "Reflection_Question" -- confirm
# the spelling before uncommenting any of them.
# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M5_IF_Reflection_Questions_cleaned_UAE1024'
worker.umap_number = '00006'
worker.cluster_number = '00277'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M5_IF_Reflection_Questions_cleaned_bge1024'
# worker.umap_number = '00009'
# worker.cluster_number = '00403'

# worker.dataset_id = 'INCLU1x_IF_Responses_-_ALL_RUNS_041924_M5_IF_Reflection_Questions_cleaned_jina512'
# worker.umap_number = '00001'
# worker.cluster_number = '00001'

###################
# choose the LLM (leave exactly one group of four lines uncommented)
###################

# chat_file_label is also used in the names of the plot and table files written later on;
# each LLM option has its own scope and label number
worker.chat_model_id = 'transformers-HuggingFaceH4___zephyr-7b-beta'
worker.chat_file_label = 'zephyr'
worker.scope_number = '00001'
worker.label_number = '00001'

# worker.chat_model_id = 'transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0'
# worker.chat_file_label = 'tinyllama'
# worker.scope_number = '00002'
# worker.label_number = '00002'

# re-initialize the helper with the selections made in this cell (the run itself is in the next cell)
worker.initialize_files_and_numbering()
worker.initialize_latentscope()

In [None]:

# run the stages enabled in the previous cell (run_label and save_scope)
worker.run_latentscope()

## Print and save the output

In [None]:
# if you want to plot a specific scope number, you can define it here (and you don't need to actually run latentscope in the previous cell)
# worker.remove_old_files = False
# worker.scope_number = '00001'
# worker.initialize_files_and_numbering()
# worker.initialize_latentscope()
# NOTE(review): this overrides the value of rungrid chosen earlier in the notebook; in the cells
# below it only selects the "_grid" / "_MC" suffix of the output file names.
rungrid = False

In [None]:
# print the labels (via the helper's print_labels method)
worker.print_labels()

In [None]:
# create a bar chart of the labels and save it in the 'plots' directory;
# the file name records the dataset, the LLM label and the sweep type
bar_suffix = '_bar_grid.png' if rungrid else '_bar_MC.png'
f, ax = worker.create_bar_chart(filename = os.path.join('plots', worker.dataset_id + '_' + worker.chat_file_label + bar_suffix))

In [None]:
# Create an Excel workbook to review the results:
#   - the first sheet will have the raw data,
#   - the second sheet will have a map between cluster label and sheet name,
#   - subsequent sheets will be one per cluster, containing the cluster data.
data_raw = data.copy()

# the file name records the dataset, the LLM label and the sweep type
workbook_suffix = '_grid.xlsx' if rungrid else '_MC.xlsx'
worker.create_excel_workbook(data_raw, os.path.join('tables', worker.dataset_id + '_clusters_' + worker.chat_file_label + workbook_suffix))

## Run the server to investigate and visualize these results

In [None]:
# import latentscope as ls

In [None]:
# ls.serve()

## Available models can be printed below

In [None]:
# print a list of possible embedding models
# [m["id"] for m in ls.models.get_embedding_model_list()]

In [None]:
# print a list of available LLMS for labelling the clusters
# [m["id"] for m in ls.models.get_chat_model_list()]