# Attempting to label each answer by the correct theme

I will use an embedding model to calculate a vector for each answer and for each theme of a given question, and then calculate the distance between every answer and every theme.

# !!! I NEED TO CHECK WITH BENNETT ABOUT HOW HE WANTS TO LABEL THEMES !!!
(Should we allow multiple themes for a given answer? How do we want to determine the threshold for associating a theme with an answer? Currently I take every theme whose distance is less than the median of the per-answer minimum distances, or the single closest theme if none of an answer's distances are below that median.)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer, util

In [None]:
# load_dotenv() reads key=value pairs from a .env file (if one is found) into the process environment.
# NOTE(review): nothing visible in this notebook reads an environment variable afterwards --
# presumably this is for tokens/cache settings picked up by the model download; confirm.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# sentence-embedding model used below (via model.encode) for both the answers and the themes
# this will download the model if needed
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Map each question key to the base name (without the ".csv" extension) of its cleaned
# responses file.  The names share one pattern; only M0 uses the plural "Questions".
froots = {
    key: f"INCLU1x_IF_Responses_-_ALL_RUNS_041924_{key}_IF_Reflection_{noun}_cleaned"
    for key, noun in (
        ("M0", "Questions"),
        ("M1", "Question"),
        ("M2", "Question"),
        ("M3", "Question"),
        ("M4", "Question"),
        ("M5", "Question"),
    )
}

In [None]:
# question to explore interactively in the cells below
qname = 'M0'

# cleaned student answers for this question, and the merged list of themes for the same question
answers_df = pd.read_csv(f"../../data/{froots[qname]}.csv")
themes_df = pd.read_csv(f"tables/openAI/Aaron_ChatGPT_summary/final_merged/{qname}_merged.csv")

In [None]:
# display the answers table as this cell's output
answers_df

In [None]:
# pull the answer text and the theme text out of the tables as plain lists
answers = answers_df['student_responses'].to_list()
themes = themes_df['theme'].to_list()

# embed both sets of text with the same model so that they can be compared with each other
answers_embeddings = model.encode(answers)
themes_embeddings = model.encode(themes)

# util.cos_sim returns cosine *similarity* (larger = closer match), with one row per answer and
# one column per theme.  The cells below take the row minima and keep the values that fall below
# a threshold, i.e. they treat this matrix as a distance, so store the cosine distance
# (1 - similarity) to make the smallest values the best matches.
# NOTE: the name `similarity` is kept only because the cells below refer to it.
similarity = 1.0 - np.array(util.cos_sim(answers_embeddings, themes_embeddings))


In [None]:
# some diagnostics

# the matrix has one row per answer and one column per theme
print(similarity.shape, len(answers), len(themes))

# smallest value in each row ...
min_values = similarity.min(axis=1)

# ... and the column position at which that smallest value occurs
min_indices = similarity.argmin(axis=1)

# 16th / 50th / 84th percentiles over every entry of the matrix
s_lo, s_med, s_hi = np.percentile(similarity, [16, 50, 84])
print(s_lo, s_med, s_hi)

# the same percentiles, but over the per-row minima only (smin_med is used as the threshold below)
smin_lo, smin_med, smin_hi = np.percentile(min_values, [16, 50, 84])
print(smin_lo, smin_med, smin_hi)

In [None]:
# histogram of every entry in the matrix; the median is marked with the default line style
# and the 16th/84th percentiles with dashed lines
f, ax = plt.subplots()
_ = ax.hist(similarity.ravel(), bins=100)
for position, extra in ((s_med, {}), (s_lo, {'linestyle': 'dashed'}), (s_hi, {'linestyle': 'dashed'})):
    ax.axvline(position, color='black', **extra)
ax.set_xlim(0, 1)


In [None]:
# histogram of the per-row minima only; the median is marked with the default line style
# and the 16th/84th percentiles with dashed lines
f, ax = plt.subplots()
_ = ax.hist(min_values, bins=100)
for position, extra in ((smin_med, {}), (smin_lo, {'linestyle': 'dashed'}), (smin_hi, {'linestyle': 'dashed'})):
    ax.axvline(position, color='black', **extra)
ax.set_xlim(0, 1)


In [None]:
# Create a DataFrame from the array: one row per answer, and one column per theme
# (the columns are labelled with the theme numbers so that a column label identifies a theme)
df = pd.DataFrame(similarity, columns=themes_df['theme_number'])

# take every column whose value is below the threshold OR, if no value is below the threshold,
# the single column that holds the smallest value
def find_columns(row, threshold):
    """Return the labels of the selected entries of `row` as the string form of a list.

    Parameters
    ----------
    row : pandas.Series
        Values indexed by the labels to choose from.
    threshold : float
        Every label whose value is strictly below this is selected.

    Returns
    -------
    str
        str() of the list of selected labels, e.g. "[1, 4]".  If no value is below the
        threshold the list holds only the label of the smallest value.
    """
    # .tolist() converts the labels to native Python objects
    indices = row.index[row < threshold].tolist()
    if not indices:  # no values are less than the threshold
        label = row.idxmin()
        # idxmin() can hand back a NumPy scalar for numeric labels, which NumPy >= 2 prints as
        # "np.int64(3)" inside a list; unwrap it so that the result is always e.g. "[3]"
        if hasattr(label, "item"):
            label = label.item()
        indices = [label]
    return str(indices)

# label every row of df, using smin_med as the threshold
output_df = pd.DataFrame()
output_df['theme'] = df.apply(find_columns, axis=1, args=(smin_med,))

# put the columns of the original answers table, in their original order, in front of 'theme'
for position, column in enumerate(answers_df.columns):
    output_df.insert(position, column, answers_df[column])

output_df

# Pull everything together into functions so that I can run through all answers

In [None]:
# get all the matching themes for a given set of answers
def get_themes(qname):
    """Label every answer to question `qname` with its closest theme(s).

    Reads the answers table and the merged themes table for the question, embeds the text of
    both with the module-level `model`, and compares every answer with every theme using the
    cosine distance (1 - cosine similarity, so smaller means a closer match).

    An answer is labelled with every theme whose distance is below the median of the
    per-answer minimum distances, or with its single closest theme if none are below it.

    Parameters
    ----------
    qname : str
        Key into `froots` (e.g. "M0"); also used to build the name of the themes file.

    Returns
    -------
    output_df : pandas.DataFrame
        The answers table with one extra, final column 'theme' that holds the string form
        of the list of matching theme numbers (e.g. "[1, 4]").
    themes_df : pandas.DataFrame
        The themes table as read from file.
    """

    # read in the data for this question
    answers_df = pd.read_csv(f"../../data/{froots[qname]}.csv")
    themes_df = pd.read_csv(f"tables/openAI/Aaron_ChatGPT_summary/final_merged/{qname}_merged.csv")

    # grab the answers and themes for this question as lists
    answers = answers_df['student_responses'].to_list()
    themes = themes_df['theme'].to_list()

    # calculate embeddings
    answers_embeddings = model.encode(answers)
    themes_embeddings = model.encode(themes)

    # util.cos_sim returns cosine *similarity* (larger = closer), so convert it to a distance;
    # otherwise the minimum / below-threshold selection below would pick the worst matches
    distances = 1.0 - np.array(util.cos_sim(answers_embeddings, themes_embeddings))

    # Get the minimum distance in each row and then the median of those for a threshold (could modify this)
    min_distances = distances.min(axis=1)
    threshold = np.percentile(min_distances, 50)

    # Create a DataFrame from the distance array (rows = answers, columns = theme numbers)
    df = pd.DataFrame(distances, columns=themes_df['theme_number'])

    ####################
    # We may want to modify this, e.g., to only take the min value (1 theme per answer)
    ####################
    # take every column whose distance is below the threshold OR, if no distance is below the
    # threshold, the single column at the minimum distance
    def find_columns(row, threshold):
        # .tolist() converts the labels to native Python objects
        indices = row.index[row < threshold].tolist()
        if not indices:  # no values are less than the threshold
            label = row.idxmin()
            # idxmin() can hand back a NumPy scalar for numeric labels, which NumPy >= 2 prints
            # as "np.int64(3)" inside a list; unwrap it so that the result is always e.g. "[3]"
            if hasattr(label, "item"):
                label = label.item()
            indices = [label]
        return str(indices)

    theme_labels = df.apply(lambda row: find_columns(row, threshold), axis=1)

    # keep all of the original answer columns and add the labels as the last column
    # (insert raises a ValueError if the answers table already has a 'theme' column)
    output_df = answers_df.copy()
    output_df.insert(len(output_df.columns), 'theme', theme_labels)

    return output_df, themes_df

In [None]:
# create a bar chart (similar to my script from latentscope_helper.py)
def create_bar_chart(themes_df, output_df, filename = None):
    """Draw a horizontal bar chart of the fraction of answers labelled with each theme.

    Parameters
    ----------
    themes_df : pandas.DataFrame
        Themes table; its 'theme_number' column is matched against the labels and its
        'theme' column supplies the bar labels.
    output_df : pandas.DataFrame
        Labelled answers; the 'theme' column holds strings such as "[1, 4]".
    filename : str, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    (f, ax)
        The matplotlib figure and axes.

    Notes
    -----
    Each fraction is (number of answers labelled with the theme) / (number of answers).
    An answer can carry several themes, so the fractions do not have to sum to one.
    Themes that were never assigned do not get a bar.
    """

    # unpack every "[a, b, ...]" string into its integer theme numbers, all in one flat list
    theme_tokens = output_df['theme'].str.strip("[]").str.split(", ")
    flattened_themes = []
    for tokens in theme_tokens:
        flattened_themes.extend(int(token) for token in tokens)

    # number of occurrences of each theme number, ordered by theme number
    value_counts = pd.Series(flattened_themes).value_counts().sort_index()

    # table of counts, plus each count as a fraction of the number of answers
    df_counts = pd.DataFrame({
        'theme_number': value_counts.index,
        'count': value_counts.values
    })
    n_answers = len(output_df.index)
    df_counts['frac'] = df_counts['count'] / n_answers

    # attach the theme information to every counted theme number, most frequent theme first
    chart_df = pd.merge(df_counts, themes_df, on='theme_number', how='left')
    chart_df = chart_df.sort_values(by='frac', ascending=False)

    # create the figure
    f, ax = plt.subplots(figsize = (10,10))
    y_pos = np.arange(len(chart_df['theme']))
    hbars = ax.barh(y_pos, chart_df['frac'], align = 'center')
    percent_labels = [f'{v*100:.0f}%' for v in chart_df['frac']]
    ax.bar_label(hbars, labels=percent_labels, fontsize=14)
    ax.set_yticks(y_pos, labels = chart_df['theme'])
    ax.invert_yaxis()  # labels read top-to-bottom

    # strip the frame (except the left spine) and the x ticks; the bar labels carry the values
    for side in ('top', 'bottom', 'right'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks([])

    # save the figure if a file name was given
    if filename is not None:
        f.savefig(filename, bbox_inches = 'tight')

    return f, ax

In [None]:
# process a given question: label the "M5" answers, then chart the fraction of answers per theme
# (note: this rebinds the notebook-level output_df and themes_df that the cells above created)
output_df, themes_df = get_themes("M5")
_ = create_bar_chart(themes_df, output_df)