# Testing Bennett's Data

`INCLU1x IF Responses - ALL RUNS 041924.xlsx`

In [None]:
import os
import pandas as pd
import nltk
# nltk.download('punkt')  # Download the punkt tokenizer if you haven't already

In [None]:
%load_ext autoreload
%autoreload 2

from latentscope_helper import latentscope_helper

## Read in and clean the data

These next two cells only need to be run once.  (If rerunning this notebook, you can start after the next markdown cell.)

In [None]:
# Set this to True if you want to change the data that is used.
# (It takes some time to split the responses into sentences, so I only do this once and then use the saved file later.)
read_in_original_data_file = False

In [None]:
def split_into_sentences(text):
    """Split a block of text into a list of sentences.

    Args:
        text: The text to split.  Anything that is not a string (for example
            the ``NaN``/``None`` that stands in for a missing value) is
            treated as containing no sentences.

    Returns:
        list: The sentences returned by ``nltk.sent_tokenize``, or an empty
        list when ``text`` is not a string.
    """
    # The tokenizer expects a string; a missing value would otherwise raise,
    # so report "no sentences" for anything that is not text.
    if not isinstance(text, str):
        return []
    return nltk.sent_tokenize(text)

In [None]:
if read_in_original_data_file:
    # Rebuild both cleaned .csv files from the original spreadsheet:
    #   1. one row per (cleaned) student response
    #   2. one row per sentence of each response

    # Text is kept only when it has MORE than this many whitespace-separated words.
    # The same threshold is applied to whole responses and, later, to individual sentences.
    n_min = 5

    # read in the data
    df = pd.read_excel("../../data/INCLU1x IF Responses - ALL RUNS 041924.xlsx")
    print(f"length of original DataFrame = {len(df)}")

    # only take the columns we need and rename them to remove spaces and special characters
    data_tmp = df[['ID#', 'Course Run', 'Student Response']].rename(
        columns={
            'ID#': 'ID',
            'Course Run': 'course_run',
            'Student Response': 'student_responses',
        }
    )

    # remove extra newlines, etc. (literal replacement, not a regular expression)
    data_tmp['student_responses'] = data_tmp['student_responses'].str.replace('\n', ' ', regex=False)

    # this phrase appears a lot; we should remove it
    data_tmp['student_responses'] = data_tmp['student_responses'].str.replace(
        'Inclusive teaching is important to me because', '', regex=False
    )

    # remove rows with short answers; rows with a missing response are dropped here too,
    # because the word count of a missing value is never greater than n_min
    data_tmp = data_tmp[data_tmp['student_responses'].str.split().str.len().gt(n_min)]

    # save to .csv file
    data_tmp.to_csv("../../data/INCLU1x_IF_Responses_-_ALL_RUNS_041924_cleaned.csv", index=False)

    # split into sentences: replace each response with its list of sentences, then expand
    # every list so that each sentence gets its own row (the other columns are repeated).
    # This avoids appending to a DataFrame one row at a time, which copies the whole
    # DataFrame on every append.
    data = data_tmp.assign(
        student_responses=data_tmp['student_responses'].map(split_into_sentences)
    ).explode('student_responses', ignore_index=True)

    # remove rows with short answers (again), this time at the sentence level
    data = data[data['student_responses'].str.split().str.len().gt(n_min)]

    print(f"length of new DataFrame (after cleaning and sentence splitting) = {len(data)}")

    # Save the new DataFrame to a new file (since this takes a while to run)
    data.to_csv("../../data/INCLU1x_IF_Responses_-_ALL_RUNS_041924_cleaned_split_into_sentences.csv", index=False)


## Initialize and run `latent-scope` using my Class

In [None]:
# Load the cleaned responses that were split into sentences.
# Caveat: splitting into sentences may lose some of the meaning of an answer, but without
# splitting, a single response might combine different themes.
# To analyze whole responses instead, read the other file:
#data = pd.read_csv("../../data/INCLU1x_IF_Responses_-_ALL_RUNS_041924_cleaned.csv")
data = pd.read_csv("../../data/INCLU1x_IF_Responses_-_ALL_RUNS_041924_cleaned_split_into_sentences.csv")

# initialize my helper object
worker = latentscope_helper(
    # --- where things live ---
    # directory where the latentscope files are stored
    latent_scope_dir="../../latent-scope_data",
    # data set name, for the sub-directory name within latent_scope_dir for this project
    dataset_id="INCLU1x_IF_Responses_-_ALL_RUNS_041924",
    # pandas DataFrame that contains the data to analyze
    data=data,
    # response column name from data_file
    text_column="student_responses",
    # number that will be appended to all latentscope files
    scope_number='new',
    # set this to True if you want to clean the latent-scope directories and start fresh
    remove_old_files=True,
    # maximum number of scopes that it should search through
    imax=50,

    # --- which steps to run (each may also remove previous files) ---
    run_embedding=True,
    run_umap=True,
    run_cluster=True,
    run_label=True,

    # --- embedding ---
    # embeddings model name
    embedding_model_id="transformers-jinaai___jina-embeddings-v2-small-en",
    # number of dimensions for embedding.  The jina docs often use this number as an example
    # (not sure this is a recommendation though)
    embedding_n_dimensions=1000,

    # --- UMAP ---
    # number of UMAP dimensions
    umap_n_components=3,
    # "controls how UMAP balances local versus global structure in the data."
    # Larger values mean UMAP will look at larger distances for neighbors (15 is default)
    umap_n_neighbors=100,
    # "controls how tightly UMAP is allowed to pack points together" (default is 0.1)
    umap_min_dist=0,

    # --- HDBSCAN clustering ---
    # min_cluster_size : "the smallest size grouping that you wish to consider a cluster"
    cluster_samples=5,
    # min_samples : "provide a measure of how conservative you want your clustering to be.
    # The larger the value of min_samples you provide, the more conservative the clustering –
    # more points will be declared as noise, and clusters will be restricted to progressively
    # more dense areas."
    cluster_min_samples=10,
    # cluster_selection_epsilon : distance measure between clusters that
    # "ensures that clusters below the given threshold are not split up any further"
    cluster_selection_epsilon=0.05,

    # --- labeling ---
    # LLM to use for labeling the clusters
    chat_model_id="transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
    # max length to tell the LLM to use when generating a given label (not always respected by the LLM!)
    label_length=10,
    # string of text to provide the LLM as instructions before the list of cluster items is given
    chat_model_instructions_before="Below is a list of items each starting with [item].  Each item is a response from a different person to a survey. These items all have a similar theme.  The list begins below.",
    # string of text to provide the LLM as instructions after the list of cluster items is given
    chat_model_instructions_after="That was the last item in the list.  Now return a concise label for the items in this list that describes the theme.  This label should not be fully verbatim text from any individual item.  Your label should contain no more than 10 words.",
    # label to give the scope (when using the latentscope server to view the data)
    scope_description="First full test with responses separated into sentences",
)

In [None]:
# run latent-scope (using the inputs from above)

# Override two of the flags that were passed to the constructor: the embedding and UMAP
# flags are set to False for this run; the cluster and label flags keep their constructor values.
# NOTE(review): remove_old_files = True was passed to the constructor -- confirm that skipping
# these two steps does not rely on files that get removed.
worker.run_embedding = False
worker.run_umap = False

# NOTE(review): presumably these must be called in this order (files/numbering, then
# initialize, then run) -- confirm against latentscope_helper.
worker.initialize_files_and_numbering()
worker.initialize_latentscope()
worker.run_latentscope()

## Print and save the output

In [None]:
# If you want to plot a specific scope number, you can define it here
# (and then you don't need to actually run latentscope in the previous cell).
# The cells below read worker.scope_number when building their output file names.
worker.scope_number = '001'
# NOTE(review): presumably re-initializing makes the worker pick up the scope number set above -- confirm in latentscope_helper
worker.initialize_latentscope()

In [None]:
# print the labels
# NOTE(review): presumably these are the cluster labels for the scope in worker.scope_number -- confirm
worker.print_labels()

In [None]:
# Create a bar chart of the labels.
# The file name passed to the helper is built from the dataset id and the scope number,
# inside the 'plots' directory.
f, ax = worker.create_bar_chart(
    filename=os.path.join(
        'plots',
        worker.dataset_id + '_scope' + worker.scope_number + '.png',
    )
)

In [None]:
# Build an Excel workbook for reviewing the results.  Intended layout:
#   - first sheet: the raw data
#   - second sheet: a map between cluster label and sheet name
#   - remaining sheets: one per cluster, containing that cluster's data
data_raw = pd.read_excel("../../data/INCLU1x IF Responses - ALL RUNS 041924.xlsx")

# Keep only the three columns needed for the raw-data sheet, under cleaner names.
data_raw_sheet = pd.DataFrame(
    {
        'ID': data_raw['ID#'],
        'course_run': data_raw['Course Run'],
        # TODO: replace 'unknown_question' with the actual question text
        'unknown_question': data_raw['Student Response'],
    }
)

# The workbook file name is built from the dataset id and scope number, inside the 'tables' directory.
worker.create_excel_workbook(
    data_raw_sheet,
    os.path.join(
        'tables',
        worker.dataset_id + '_clusters_scope' + worker.scope_number + '.xlsx',
    ),
)

## Calculate metrics to assess the quality of this analysis

Ideally, I would do this for a number of runs, each changing some parameter and returning a different number of clusters.

In [None]:
# Calculate metrics for the scope currently selected on the worker, so that this cell
# always describes the same scope as the bar chart and Excel workbook cells
# (which also use worker.scope_number).
# returns inertia, Silhouette Coefficient, Calinski-Harabasz Index, Davies-Bouldin Index
# - a lower inertia value is generally better
# - a higher Silhouette Coefficient score relates to a model with better defined clusters.
# - a higher Calinski-Harabasz score relates to a model with better defined clusters.
# - a lower Davies-Bouldin index relates to a model with better separation between the clusters.
worker.calculate_metrics(worker.scope_number)

## Run the server to investigate and visualize these results

In [None]:
# ls.serve()

## Available models can be printed below

In [None]:
# print a list of possible embedding models
# [m["id"] for m in ls.models.get_embedding_model_list()]

In [None]:
# print a list of available LLMs for labeling the clusters
# [m["id"] for m in ls.models.get_chat_model_list()]