# Testing Bennett's Data

This notebook analyzes `ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx`, using the `Course Meta SelfEff` sheet.

In [None]:
import os
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%load_ext autoreload
%autoreload 2

from latentscope_helper import latentscope_helper

## Read in and clean the data

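The cells in this section only need to be run once, to create the cleaned `.csv` files (set `read_in_original_data_file = True` to regenerate them).  (If rerunning this notebook, you can start after the next markdown cell.)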

In [None]:
# Set this to True to re-read the original Excel file and regenerate the cleaned .csv files
# (only needed when the data that is used changes).
read_in_original_data_file = False

In [None]:
# Function to split text into sentences
def split_into_sentences(text):
    """Split a block of text into a list of sentences using NLTK.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        The sentences returned by ``nltk.sent_tokenize(text)``.  An empty list is
        returned when ``text`` is not a string (for example ``None``, or the float
        ``NaN`` that pandas uses for an empty cell), since there is nothing to split.
    """
    # Empty spreadsheet cells arrive as None/NaN rather than as strings; handing those
    # to the tokenizer raises, so treat any non-string input as "no sentences".
    if not isinstance(text, str):
        return []
    return nltk.sent_tokenize(text)

In [None]:
if read_in_original_data_file:
    # A response (and, after splitting, a sentence) is kept only when it has MORE than
    # this many whitespace-separated words.
    n_min = 5

    # read in the data
    df = pd.read_excel("../../data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx", sheet_name = "Course Meta SelfEff")
    print(f"length of original DataFrame = {len(df)}")

    # save the question (it is the header of the second column)
    question = df.columns[1]
    # rename the response column to remove spaces and special characters
    # (all other columns are kept as they are)
    data_tmp = df.rename(columns = {question:'student_responses'})

    # remove extra newlines
    data_tmp['student_responses'] = data_tmp['student_responses'].str.replace('\n', ' ', regex = False)

    # remove rows with short answers.  Missing answers are removed here as well: their
    # word count is NaN, and NaN > n_min is False.  (Presumably these are the rows that
    # made the sentence splitter choke.)
    has_enough_words = data_tmp['student_responses'].str.split().str.len().gt(n_min)
    data_tmp = data_tmp[has_enough_words]

    # save to .csv file
    data_tmp.to_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_SelfEff_cleaned.csv", index=False)

    # split into sentences: replace each response by its list of sentences, then expand
    # every list into one row per sentence (the other columns are repeated on each row).
    # This avoids growing a DataFrame one row at a time inside a Python loop.
    data = (
        data_tmp
        .assign(student_responses = data_tmp['student_responses'].map(split_into_sentences))
        .explode('student_responses', ignore_index = True)
    )

    # remove rows with short answers (again), now at the sentence level.  A response
    # that produced no sentences shows up as NaN after explode and is dropped here too.
    data = data[data['student_responses'].str.split().str.len().gt(n_min)]

    print(f"length of new DataFrame (after cleaning and sentence splitting) = {len(data)}")

    # Save the new DataFrame to a new file (so that this step does not need to be rerun)
    data.to_csv("../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_SelfEff_cleaned_split_into_sentences.csv", index=False)


## Initialize and run `latent-scope` using my Class

In [None]:
# This question has multiple components, so the version of the data that was split
# into sentences (one row per sentence) is the one to analyze.
data = pd.read_csv(
    "../../data/ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_SelfEff_cleaned_split_into_sentences.csv"
)

# initialize my helper object
worker = latentscope_helper(
    # ---- project and input data ----
    # directory where the latentscope files are stored
    latent_scope_dir="../../latent-scope_data",
    # data set name, for the sub-directory name within latent_scope_dir for this project
    dataset_id="ITP_CourseArtifacts_June_2021_END_of_Course_DeIDENTIFIED_Course_Meta_SelfEff",
    # pandas DataFrame that contains the data to analyze
    data=data,
    # response column name from data_file
    text_column="student_responses",

    # ---- file numbering and clean-up ----
    # number that will be appended to all latentscope files
    scope_number='new',
    # set this to True if you want to clean the latent-scope directories and start fresh
    remove_old_files=True,
    # maximum number of scopes that it should search through
    imax=999,

    # ---- which steps to run (each may also remove previous files) ----
    run_embedding=True,  # embedding step
    run_umap=True,  # umap step
    run_cluster=True,  # clustering step
    run_label=True,  # labeling step

    # ---- embedding ----
    # embeddings model name
    embedding_model_id="transformers-jinaai___jina-embeddings-v2-small-en",
    # number of dimensions for embedding.  The jina docs often use this number as an
    # example for the number of dimensions (not sure this is a recommendation though)
    embedding_n_dimensions=512,

    # ---- UMAP ----
    # number of UMAP dimensions
    umap_n_components=2,
    # "controls how UMAP balances local versus global structure in the data."
    # Larger values mean UMAP will look at larger distances for neighbors (15 is default)
    umap_n_neighbors=10,
    # "controls how tightly UMAP is allowed to pack points together" (default is 0.1)
    umap_min_dist=0,

    # ---- HDBSCAN clustering ----
    # min_cluster_size in HDBSCAN : "the smallest size grouping that you wish to
    # consider a cluster"
    cluster_samples=5,
    # min_samples in HDBSCAN : "provide a measure of how conservative you want your
    # clustering to be. The larger the value of min_samples you provide, the more
    # conservative the clustering – more points will be declared as noise, and clusters
    # will be restricted to progressively more dense areas."
    cluster_min_samples=12,
    # cluster_selection_epsilon in HDBSCAN : distance measure between clusters that
    # "ensures that clusters below the given threshold are not split up any further"
    cluster_selection_epsilon=0.05,

    # ---- LLM labeling of the clusters ----
    # LLM to use for labeling the clusters
    chat_model_id="transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
    # max length to tell the LLM to use when generating a given label
    # (not always respected by the LLM!)
    label_length=10,
    # string of text to provide the LLM as instructions before the list of cluster items is given
    chat_model_instructions_before="Below is a list of items each starting with [item].  Each item is a response from a different person to a survey. These items all have a similar theme.  The list begins below.",
    # string of text to provide the LLM as instructions after the list of cluster items is given
    # (note that the word limit is written out in this text, separately from label_length)
    chat_model_instructions_after="That was the last item in the list.  Now return a concise label for the items in this list that describes the theme.  This label should not be fully verbatim text from any individual item.  Your label should contain no more than 10 words.",

    # ---- scope ----
    # label to give the scope (when using the latentscope server to view the data)
    scope_description="First full test with responses separated into sentences",
)

In [None]:
# run latent-scope (using the inputs from above)

# Skip the embedding step for this run; this overrides the run_embedding = True that was
# passed when the helper was created.
# NOTE(review): presumably this relies on embedding files from an earlier run already
# existing on disk -- confirm, especially because remove_old_files was set to True.
worker.run_embedding = False

# Set up the helper's files/numbering and its latentscope state, then run the steps that are
# still switched on.  The three calls are kept in this order.
worker.initialize_files_and_numbering()
worker.initialize_latentscope()
worker.run_latentscope()

## Print and save the output

In [None]:
# if you want to plot a specific scope number, you can define it here (and you don't need to actually run latentscope in the previous cell)
# Do not clean the latent-scope directories: this cell is meant to work with existing results.
worker.remove_old_files = False
worker.scope_number = '001'
# Re-initialize so that the helper picks up the scope number chosen above.
# NOTE(review): assumes these two calls only set up file names/state and do not rerun any
# pipeline step -- confirm in latentscope_helper.
worker.initialize_files_and_numbering()
worker.initialize_latentscope()

In [None]:
# print the labels
# (an earlier version of this cell had a commented-out `scope_labels_id = '001'` setting;
#  NOTE(review): if a different set of labels should be printed, check in latentscope_helper
#  whether `worker.scope_labels_id` is the attribute to set before this call)
worker.print_labels()

In [None]:
# create a bar chart of the labels and save it under plots/
# Make sure the output directory exists first, so that saving does not fail on a fresh checkout.
# NOTE(review): create_bar_chart may already create the directory; this call is harmless if so.
os.makedirs('plots', exist_ok = True)
f, ax = worker.create_bar_chart(filename = os.path.join('plots', worker.dataset_id + '_scope' + worker.scope_number + '.png'))

In [None]:
# create an Excel workbook to review the results
# The first sheet will have the raw data.
# The second sheet will have a map between cluster label and sheet name.
# Subsequent sheets will be one per cluster containing the cluster data.

# The raw (unsplit, uncleaned) responses are re-read from the original workbook for the first sheet.
data_raw = pd.read_excel("../../data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx", sheet_name = "Course Meta SelfEff")

# Make sure the output directory exists first, so that saving does not fail on a fresh checkout.
# NOTE(review): create_excel_workbook may already create the directory; this call is harmless if so.
os.makedirs('tables', exist_ok = True)
worker.create_excel_workbook(data_raw, os.path.join('tables', worker.dataset_id + '_clusters_scope' + worker.scope_number + '.xlsx'))

## Calculate metrics to assess the quality of this analysis

Ideally, I would want to do this for a number of runs each changing some parameter and returning a different number of clusters.

In [None]:

# Compute the cluster-quality metrics for embedding '001' and cluster run '001'; the result is
# shown as this cell's output.
# returns inertia, Silhouette Coefficient, Calinski-Harabasz Index, Davies-Bouldin Index
# - a lower inertia value is generally better
# - a higher Silhouette Coefficient score relates to a model with better defined clusters. 
# - a higher Calinski-Harabasz score relates to a model with better defined clusters.
# - a lower Davies-Bouldin index relates to a model with better separation between the clusters.
worker.calculate_metrics(embedding_number = '001', cluster_number = '001')

In [None]:
#%%capture
# (the %%capture cell magic above can be enabled, as the first line of the cell, to suppress the printed output)
# TODO: confirm that the figures are now suppressed as well (see plt.close below)


# a test looping over two HDBSCAN parameters (min_samples and cluster_selection_epsilon)
# to check the resulting metrics
worker.suppress_latentscope_output = True
worker.suppress_helper_output = True
worker.remove_old_files = False
worker.initialize_files_and_numbering()

# Reuse embedding '001' and UMAP '001'; only the clustering step is rerun in the loop.
worker.embedding_number = '001'
worker.umap_number = '001'
worker.cluster_number = '001'
worker.run_embedding = worker.run_umap = worker.run_label = worker.save_scope = False
worker.run_cluster = True

# parameter grid: every min_samples value is combined with every epsilon value
cluster_min_samples = [5, 10, 15, 20, 25, 30]
cluster_selection_epsilon = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
cn = 2 # starting number for the files (cluster run '001' already exists)

# cluster file numbers written by this sweep (used by the next cell to collect the metrics)
file_numbers = []
for ms in cluster_min_samples:
    for eps in cluster_selection_epsilon:
        print(f'file_number = {cn}, min_samples = {ms}, selection_epsilon = {eps}')
        worker.cluster_min_samples = ms
        worker.cluster_selection_epsilon = eps
        worker.cluster_number = str(cn).zfill(3)
        # refresh the helper's file names so that this run is written under the new cluster number
        worker.initialize_latentscope_filenames()
        worker.run_latentscope()

        # Each run appears to create matplotlib figures; close them so that they do not
        # pile up in memory over the whole sweep.
        # NOTE(review): with the inline backend this should also keep figures that were not
        # already shown by the helper from being rendered at the end of the cell -- confirm.
        plt.close('all')

        file_numbers.append(cn)
        cn += 1

In [None]:
# Collect the quality metrics of every clustering run in the sweep into one table,
# indexed by cluster file number.

# full output of calculate_metrics for each run, in the same order as file_numbers
metrics = []
# one flat record (table row) per run
metric_records = []
for cn in file_numbers:
    m = worker.calculate_metrics(embedding_number = '001', cluster_number = str(cn).zfill(3))
    metrics.append(m)
    metric_records.append({
        'file_number': cn,
        'min_samples': m['cluster_info']['min_samples'],
        'selection_epsilon': m['cluster_info']['cluster_selection_epsilon'],
        'n_clusters': m['cluster_info']['n_clusters'],
        'inertia': m['inertia'],
        'silhouette_coefficient': m['silhouette_coefficient'],
        'calinski_harabasz_index': m['calinski_harabasz_index'],
        'davies_bouldin_index': m['davies_bouldin_index'],
    })

# Build the table once from the records.  The columns are listed explicitly so that the
# column order is fixed and an empty sweep still gives a (zero-row) table with these columns.
df = pd.DataFrame(
    metric_records,
    columns = [
        'file_number',
        'min_samples',
        'selection_epsilon',
        'n_clusters',
        'inertia',
        'silhouette_coefficient',
        'calinski_harabasz_index',
        'davies_bouldin_index',
    ],
).set_index('file_number')

df


In [None]:
# Pair plot of all columns of the sweep table against each other (corner = True keeps only
# the lower triangle), with a kernel density estimate of each column on the diagonal.
g = sns.pairplot(df, corner = True, diag_kind='kde')

In [None]:
# sweep parameters / outcome on the x axes, quality metrics on the y axes
x_vars = ['min_samples','selection_epsilon','n_clusters']
y_vars = ['silhouette_coefficient','calinski_harabasz_index','davies_bouldin_index', 'inertia']

# I played around with the loc statement to isolate what appears to be the best clusters
# (and asked ChatGPT how to create this overlay)
df_best = df.loc[(df['n_clusters'] > 5) & (df['n_clusters'] < 30) & (df['silhouette_coefficient'] > 0.415)]

def custom_scatter(x, y, **kwargs):
    """Scatter ``y`` against ``x`` and circle the runs selected in ``df_best``.

    Parameters
    ----------
    x, y : pandas.Series
        The values to plot, as passed in by ``PairGrid.map``.  Their ``name``
        attributes are used to look up the matching columns of ``df_best``.
    **kwargs
        Forwarded unchanged to ``sns.scatterplot`` for the main scatter.
    """
    sns.scatterplot(x=x, y=y, **kwargs)
    # overlay: open black circles around the points that are in df_best
    sns.scatterplot(data=df_best, x=x.name, y=y.name, marker='o', color='black', edgecolor='black', facecolors='none', s=100)

g = sns.PairGrid(df, x_vars = x_vars, y_vars = y_vars, hue = 'n_clusters', palette = 'viridis')
# custom_scatter draws the points itself, so one pass over the grid is enough; a separate
# g.map(sns.scatterplot) pass would only draw the same points a second time underneath.
# No variable appears in both x_vars and y_vars, so the grid has no diagonal panels and
# there is nothing for a map_diag call to draw.
g.map(custom_scatter)



## Run the server to investigate and visualize these results

In [None]:
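# NOTE: `ls` is not imported anywhere in this notebook; presumably `import latentscope as ls` is needed before uncommenting the next line.
#ls.serve()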