# Testing Bennett's Data

Data source: `ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx`, using the `Course Meta App` sheet.

In [None]:
import os
import glob
import re
import pandas as pd
import numpy as np
import nltk
# nltk.download('punkt')  # Download the punkt tokenizer if you haven't already

import matplotlib.pyplot as plt

In [None]:
%load_ext autoreload
%autoreload 2

import latentscope as ls

## Read in and clean the data

The next three cells (the flag, the sentence-splitting helper, and the cleaning step) only need to be run once. If you are rerunning this notebook, you can start after the next markdown cell.

In [None]:
# Set this to True if you want to change the data that is used, i.e. to re-read the
# original .xlsx file and regenerate the cleaned .csv files.
# When False, the cleaning cell below is skipped.
read_in_original_data_file = False

In [None]:
def split_into_sentences(text):
    """Return the list of sentences that ``nltk.sent_tokenize`` finds in ``text``."""
    return nltk.sent_tokenize(text)

In [None]:
if read_in_original_data_file:
    # One-time preparation: read the raw spreadsheet, clean the free-text answers,
    # split every answer into sentences, and cache both stages as .csv files.

    # read in the data
    df = pd.read_excel("../../data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx", sheet_name = "Course Meta App")
    print(f"length of original DataFrame = {len(df)}")

    # save the question (it is the header of the second column)
    question = df.columns[1]
    # rename the question column to remove spaces and special characters
    data_tmp = df.rename(columns = {question:'student_responses'})

    # Replace newlines with spaces and strip non-ASCII characters.
    # `regex` is passed explicitly because the default of Series.str.replace differs
    # between pandas versions; with regex=False the character class below would be
    # treated as a literal string and nothing would be removed.
    # The earlier `Series.replace(r'[^\w\s]', '')` step is intentionally not kept: without
    # regex=True it only matched cells equal to that literal text (a no-op), and the
    # punctuation is presumably needed by the sentence tokenizer below.
    # NOTE(review): characters that were already converted to literal escape text in the
    # xlsx file (e.g. \u2019) are plain ASCII and are NOT removed by this step.
    responses = data_tmp['student_responses'].str.replace('\n', ' ', regex = False)
    data_tmp['student_responses'] = responses.str.replace(r'[^\x00-\x7F]', '', regex = True)

    # keep only answers with more than n_min words
    # (otherwise the sentence finder might choke -- not sure why; missing answers are dropped here too)
    n_min = 5
    data_tmp = data_tmp[data_tmp['student_responses'].str.split().str.len().gt(n_min)]

    # save to .csv file
    data_tmp.to_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_App_cleaned.csv", index=False)

    # split into sentences: one output row per sentence, with every other column of the
    # original row repeated (explode replaces the slow row-by-row append)
    sentences = data_tmp['student_responses'].map(split_into_sentences)
    data = data_tmp.assign(student_responses = sentences).explode('student_responses', ignore_index = True)

    # remove rows with short sentences (again); this also drops the empty row that
    # explode produces for an answer with no sentences
    data = data[data['student_responses'].str.split().str.len().gt(n_min)]

    print(f"length of new DataFrame (after cleaning and sentence splitting) = {len(data)}")

    # Save the new DataFrame to a new file (so that this cell does not have to be rerun)
    data.to_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_App_cleaned_split_into_sentences.csv", index=False)


## Read in the cleaned data

In [None]:
# Read the sentence-level .csv file that the cleaning cell above writes.
# This question has multiple components, so it is definitely best to take the version with split sentences.
data = pd.read_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_App_cleaned_split_into_sentences.csv")


# display the DataFrame (uncomment the column selector to show only the response text)
data#['student_responses']

## Initialize `latent-scope`

In [None]:
# define the directory where all the latent-scope results are stored (this can be the same directory for multiple projects)
latent_scope_dir = "../../latent-scope_data"

# data set name, used as the sub-directory name within latent_scope_dir for this project
dataset_id = "ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_App"

# initialize latent-scope with that directory
# (presumably this is what ls.get_data_dir() returns in later cells -- confirm)
ls.init(latent_scope_dir)

In [None]:
# define the scope number to use (can keep as 'new', unless you want to rerun something)
# this string is appended to the end of each file saved for each step in latent-scope;
# use 'new' if you want to create a new scope
# NOTE: this value only matters when remove_old_files is False; when it is True the next cell resets scope_number to '001'
scope_number = 'new' 

########################
# REMOVE PREVIOUS FILES?... BEWARE
remove_old_files = True # set this to True if you want to clean the latent-scope directories and start fresh
imax = 50 # maximum number of scopes that it should search through (the next cell loops over range(imax), i.e. numbers 000 to 049)
########################

In [None]:
# REMOVE PREVIOUS FILES?... BEWARE
if remove_old_files:
    # Start over at the first scope, and delete every file in the latent-scope output
    # directories whose name contains one of the zero-padded numbers 000 .. imax-1.
    scope_number = '001'
    for i in range(imax):
        for d in ['clusters', 'embeddings', 'umaps', 'scopes']:
            for f in glob.glob(os.path.join(latent_scope_dir, dataset_id, d, '*' + str(i).zfill(3) + '*')):
                print("removing : ",f)
                os.remove(f)
elif scope_number == 'new':
    # Choose the next free scope number by looking at the existing embedding .json files.
    nums = []
    fls = []
    for f in glob.glob(os.path.join(latent_scope_dir, dataset_id, 'embeddings', '*.json')):
        fls.append(f)
        # The number is the last '-' or '.' separated token before the extension
        # (e.g. <name>-001.json).  Only the file name is split, so that '-' or '.' in the
        # directory path cannot interfere, and files without a numeric token are skipped
        # instead of raising a ValueError.
        tokens = re.split(r'-|\.', os.path.basename(f))
        if len(tokens) >= 2 and tokens[-2].isdecimal():
            nums.append(int(tokens[-2]))
    # one more than the largest number found, or the first scope if there are none yet
    scope_number = str(max(nums) + 1).zfill(3) if nums else '001'
    print('list of files :', fls)
    print('list of numbers :', nums)
    print('new scope number = ', scope_number)


In [None]:
# print a list of possible embedding models

# [m["id"] for m in ls.models.get_embedding_model_list()]

In [None]:
# print a list of available LLMS for labelling the clusters

# [m["id"] for m in ls.models.get_chat_model_list()]

In [None]:
# define various parameters to use for the latent-scope analysis
# (every *_id below is built from scope_number, so the scope-number cell must be run first)

# response column name from DataFrame
text_column = "student_responses"

# settings for embeddings
embedding_model_id = "transformers-jinaai___jina-embeddings-v2-small-en"
# Ritika says that people usually choose 1000 or 3000 dimensions, but she is not sure there is a quantitative criterion for that.
# She heard two rules of thumb: (1) use the fourth root of the total number of unique categorical elements;
# (2) the embedding dimension should be approximately 1.6 times the square root of the number of unique
# elements in the category, and no less than 600.
embedding_n_dimensions = 1000

# settings for umap
# Ritika says 2 components is good, but we could try 3 with 10k data points if necessary.
# Efren suggests that we cluster on more than 2 dimensions...
# Ritika suggests keeping min_dist = 0, and then slowly increasing it to see if the clusters change.
# Ritika also suggests setting n_neighbors to 100 (w/ 10k points). 30 is the default.
umap_n_components = 3
umap_n_neighbors = 30
umap_min_dist = 0
umap_embedding_id = "embedding-" + scope_number

# settings for clustering
# I should look into the best settings here.
# For now I will use the same default values included for the dadabase example that ran through flask.
cluster_samples = 5
cluster_min_samples = 10
cluster_selection_epsilon =  0.05
cluster_umap_id = "umap-" + scope_number

# settings for LLM labeller
# (alternative model, currently disabled)
#chat_model_id = "transformers-HuggingFaceH4___zephyr-7b-beta"
chat_model_id = "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0"
# maximum number of words requested for each label (inserted into the instructions below and passed to ls.label)
label_length = 10
chat_model_instructions_before = "Below is a list of items each starting with [item].  Each item is a response from a different person to a survey. These items all have a similar theme.  The list begins below."
chat_model_instructions_after = f"That was the last item in the list.  Now return a concise label for the items in this list that describes the theme.  This label should not be fully verbatim text from any individual item.  Your label should contain no more than {label_length} words."
label_cluster_id = "cluster-" + scope_number

# settings for scope file
# scope_labels_id is also the base name of the labels .parquet file that is read back in later cells
scope_labels_id = label_cluster_id + "-labels-" + scope_number
scope_label = "Scope" + scope_number
scope_description = "First full test with responses separated into sentences"



## Run `latent-scope`

In [None]:
# ingest the data into latent-scope
# arguments: the project name, the sentence-level DataFrame, and the name of the column holding the text
ls.ingest(dataset_id, data, text_column = text_column)

In [None]:
# calculate the embeddings

# positional arguments: dataset_id, text_column, model_id, prefix, rerun, dimensions
# NOTE: the example notebook online did not have rerun or dimensions.  I looked at the code, and I think rerun should be None.
#       The flask server uses dimensions = 384 (not sure where this number comes from!)

ls.embed(dataset_id, text_column, embedding_model_id, "", None, embedding_n_dimensions)

In [None]:
# run UMAP dimension reduction

# positional arguments: dataset_id, embedding_id, n_neighbors, min_dist
# NOTE: I added the n_components keyword argument

ls.umap(dataset_id, umap_embedding_id, umap_n_neighbors, umap_min_dist, n_components = umap_n_components)

In [None]:
# run HDBSCAN to cluster (on UMAP vectors)

# positional arguments: dataset_id, umap_id, samples, min_samples, cluster_selection_epsilon
# NOTE: the example from latent-scope's GitHub repo is missing the "cluster_selection_epsilon" argument
ls.cluster(dataset_id, cluster_umap_id, cluster_samples, cluster_min_samples, cluster_selection_epsilon)

In [None]:
# label each cluster with the chat model
# positional arguments: dataset_id, text_column, cluster_id, model_id, unused, rerun, instructions_before, instructions_after, label_length
# NOTE: the code from GitHub was outdated and needed the rerun argument (None, or a value that points to a label); I added label_length
ls.label(dataset_id, text_column, label_cluster_id, chat_model_id, "", None, chat_model_instructions_before, chat_model_instructions_after,  label_length)

In [None]:
# print out the labels: read the labels .parquet file from this dataset's "clusters" directory
# and show its 'label' column as a list
labels = pd.read_parquet(os.path.join(ls.get_data_dir(), dataset_id, "clusters", scope_labels_id + ".parquet"))
labels['label'].to_list()

## Save these results as a 'scope'

In [None]:
# save the embedding, umap, cluster and label ids together as one scope
# positional arguments: dataset_id, embedding_id, umap_id, cluster_id, labels_id, label, description
ls.scope(dataset_id, umap_embedding_id, cluster_umap_id, label_cluster_id, scope_labels_id, scope_label, scope_description)

## Create a bar chart showing the number of responses in each theme

In [None]:
# get the labels and read in the original (cleaned, sentence-level) data
# both are re-read from disk, so this section can be run without rerunning the cells above that load them
labels = pd.read_parquet(os.path.join(ls.get_data_dir(), dataset_id, "clusters", scope_labels_id + ".parquet"))
data = pd.read_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_App_cleaned_split_into_sentences.csv")

In [None]:

# match the indices from labels to the original data IDs and count the number of unique entries
# (assumes 'ID' identifies the original response that each sentence row was split from -- confirm)
labels_list = []
labels_num = []
for index, row in labels.iterrows():
    labels_list.append(row['label'])
    # row['indices'] holds row positions in data; count the distinct IDs among those rows
    labels_num.append(len(data.iloc[row['indices']]['ID'].unique()))
    # label, number of sentences in the cluster, number of distinct IDs in the cluster
    print(labels_list[-1], len(row['indices']), labels_num[-1])
# Normalize by the number of distinct IDs, not by len(data): the counts above are per
# unique ID, whereas len(data) is the number of sentence rows, so dividing by it would
# understate the fraction of responses that include each theme.
labels_frac = np.array(labels_num)/len(data['ID'].unique())

In [None]:
# collect the per-theme results in a DataFrame and order it from the most to the fewest responses
df = pd.DataFrame({'label': labels_list, 'frac': labels_frac, 'num': labels_num})
df = df.sort_values(by = 'num', ascending = False)
df

In [None]:
# horizontal bar chart with one bar per theme, in the order of the sorted DataFrame
f, ax = plt.subplots(figsize = (10,10))

y_pos = np.arange(len(df['label']))
ax.barh(y_pos, df['frac'], align = 'center')
ax.set_yticks(y_pos, labels = df['label'])
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Fraction of responses including the given theme')
ax.set_title('Themes from survey responses')

# make sure the output directory exists, since savefig does not create missing directories
os.makedirs('plots', exist_ok = True)
f.savefig('plots/' + dataset_id + '.png', bbox_inches = 'tight')

## Run the server to investigate and visualize these results

In [None]:
#ls.serve()