# Attempting to label each answer by the correct theme

I will use an embedding model to calculate the vectors for each answer and each theme in a given question, then calculate the cosine similarity between every answer and every theme.

# !!! I NEED TO CHECK WITH BENNETT ABOUT HOW HE WANTS TO LABEL THEMES !!!
(Should we allow multiple themes for a given answer?  How do we want to determine the threshold for a theme to be associated with an answer?  Currently I take every theme whose similarity is greater than the median of all max similarity measurements, or only the theme with the max similarity if all of an answer's similarities are < that median.)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer, util

In [None]:
from dotenv import load_dotenv
# NOTE(review): presumably loads settings from a local .env file into the environment;
# nothing visible in this notebook reads them afterwards - confirm this is still needed
load_dotenv()

In [None]:
# Embedding model shared by every cell below (constructing it will download the model if needed)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Root (extension-less) file name of the cleaned responses CSV for each question label M0..M5.
# NOTE: the M0 file name ends in "Reflection_Questions" (plural); M1-M5 use the singular "Question".
froots = {
    f"M{i}": (
        f"INCLU1x_IF_Responses_-_ALL_RUNS_041924_M{i}_IF_Reflection_"
        f"{'Questions' if i == 0 else 'Question'}_cleaned"
    )
    for i in range(6)
}

In [None]:
# the question explored interactively in the next few cells
qname = 'M0'

# student responses for that question, and the themes table (the *_merged.csv file) for it
answers_df = pd.read_csv(f"../../data/{froots[qname]}.csv")
themes_df = pd.read_csv(f"tables/openAI/Aaron_ChatGPT_summary/final_merged/{qname}_merged.csv")

In [None]:
# display the responses table as a quick visual check of what was loaded
answers_df

In [None]:
# pull the text out of each table as plain Python lists
answers = answers_df['student_responses'].to_list()
themes = themes_df['theme'].to_list()


# embed every answer and every theme with the same model
answers_embeddings = model.encode(answers)
themes_embeddings = model.encode(themes)

# cosine similarity between the answer embeddings and the theme embeddings, converted to a NumPy array
# (it is later wrapped in a DataFrame whose columns are the theme numbers, i.e. one column per theme)
similarity = np.array(util.cos_sim(answers_embeddings, themes_embeddings))


In [None]:
# some diagnostics

# compare the shape of the similarity matrix with the number of answers and themes
print(similarity.shape, len(answers), len(themes))

# best similarity score in each row ...
max_values = similarity.max(axis=1)

# ... and the column position where that best score sits
max_indices = similarity.argmax(axis=1)

# percentile levels reported below: the median plus the 16th and 84th percentiles around it
pct_levels = [16, 50, 84]

# spread of all similarity values
s_lo, s_med, s_hi = np.percentile(similarity.ravel(), pct_levels)
print(s_lo, s_med, s_hi)

# spread of the per-row best scores only
smax_lo, smax_med, smax_hi = np.percentile(max_values, pct_levels)
print(smax_lo, smax_med, smax_hi)

In [None]:
# histogram of every similarity value, with the median (solid) and 16th/84th percentiles (dashed) marked
f, ax = plt.subplots()
_ = ax.hist(similarity.ravel(), bins=100)
ax.axvline(s_med, color='black')
for bound in (s_lo, s_hi):
    ax.axvline(bound, color='black', linestyle='dashed')
ax.set_xlim(0, 1)


In [None]:
# histogram of the per-row best similarity only, with its median (solid) and 16th/84th percentiles (dashed) marked
f, ax = plt.subplots()
_ = ax.hist(max_values, bins=100)
ax.axvline(smax_med, color='black')
for bound in (smax_lo, smax_hi):
    ax.axvline(bound, color='black', linestyle='dashed')
ax.set_xlim(0, 1)


In [None]:
# Create a DataFrame from the similarity array, labelling its columns with the theme numbers
# so that a column label can be used directly as the theme id
df = pd.DataFrame(similarity, columns=themes_df['theme_number'])

# take any column whose value is greater than the threshold (the median of the max values is used below)
# OR only the column holding the row's max value if no value is above the threshold
def find_columns(row, threshold):
    """Return the labels of the columns selected for one row, formatted as a list string.

    Parameters
    ----------
    row : pandas.Series
        The values of one row, indexed by column label.
    threshold : float
        A column is selected when its value is strictly greater than this.

    Returns
    -------
    str
        ``str()`` of the list of selected labels, e.g. ``"[2, 5]"``.  When no
        value is above the threshold the list holds only the label of the
        row's maximum value.
    """
    # labels of every column whose value is above the threshold
    indices = row.index[row > threshold].tolist()
    if not indices:  # nothing is above the threshold: fall back to the single best column
        best = row.idxmax()
        # idxmax() returns a NumPy scalar for numeric labels; under NumPy >= 2 its repr is
        # "np.int64(3)", which would end up in the string and could not be parsed back with int()
        if isinstance(best, np.generic):
            best = best.item()
        indices = [best]
    return str(indices)

# label every row of df with its selected theme(s), using the median of the max values as the threshold
output_df = pd.DataFrame()
output_df['theme'] = df.apply(lambda row: find_columns(row, smax_med), axis=1)

# add the rest of the values from the original df
# (walk the columns backwards: each one is inserted at position 0, which restores the original order)
for c in answers_df.columns[::-1]:
    # DataFrame.insert refuses a duplicate column name, so rename an input column that clashes
    # with one already in output_df (e.g. an input column called 'theme'), as get_themes does
    cinsert = c + '_org' if c in output_df.columns else c
    output_df.insert(0, cinsert, answers_df[c])

output_df

# Pull everything together into functions so that I can run through all answers

In [None]:
# read in the data
def read_answers_and_themes(qname):
    """Load the answers table and the merged themes table for one question.

    qname is used as a key into froots (e.g. "M0") and as the prefix of the
    themes file name.  Returns the 4-tuple (answers_df, answers, themes_df, themes):
    each table followed by the list taken from its text column
    ('student_responses' for the answers, 'theme' for the themes).
    """
    answers_path = f"../../data/{froots[qname]}.csv"
    themes_path = f"tables/openAI/Aaron_ChatGPT_summary/final_merged/{qname}_merged.csv"

    # read in the data for this question
    answers_df = pd.read_csv(answers_path)
    themes_df = pd.read_csv(themes_path)

    # hand back each table together with its text column as a plain list
    return (
        answers_df,
        answers_df['student_responses'].to_list(),
        themes_df,
        themes_df['theme'].to_list(),
    )

def read_themes_and_themes(qname):
    """Load the compiled themes for one question, for comparing the themes with each other.

    The return value has the same 4-slot layout as read_answers_and_themes so
    that it can be passed to get_themes as read_func; the themes simply fill
    both the "answers" slots and the "themes" slots.

    Returns (themes_df, themes, themes_df, themes), where themes is the
    lower-cased 'theme' column with the common phrases 'inclusive teaching',
    'inclusive practices' and 'inclusive' removed.  themes_df itself is left
    as read from the file.
    """
    # read in the list of themes for this question
    themes_df = pd.read_csv("tables/openAI/Aaron_ChatGPT_summary/compiled_themes_csvs/" + qname + "_themes_compiled.csv")

    # get the themes, but remove some common phrases
    themes = themes_df['theme'].str.lower()
    # .str.replace removes the phrase wherever it occurs inside the text (Series.replace would only
    # touch cells that are exactly equal to the phrase); the longer phrases must go before the bare
    # 'inclusive', otherwise their remainders ('teaching', 'practices') would be left behind
    for phrase in ('inclusive teaching', 'inclusive practices', 'inclusive'):
        themes = themes.str.replace(phrase, '', regex=False)
    themes = themes.to_list()

    return themes_df, themes, themes_df, themes


# get all the matching themes for a given set of answers
def get_themes(qname, read_func = read_answers_and_themes):
    """Label every answer of one question with its best-matching theme(s).

    Parameters
    ----------
    qname : str
        Question label passed straight to read_func (e.g. "M0").
    read_func : callable
        Called as read_func(qname); must return (answers_df, answers, themes_df, themes),
        where themes_df has a 'theme_number' column.

    Returns
    -------
    (output_df, themes_df, df)
        output_df : every column of answers_df followed by a 'theme' column holding,
            as a list string such as "[2, 5]", the theme numbers selected for that row.
        themes_df : the themes table as returned by read_func.
        df : the similarity table, one column per theme labelled by theme_number.
    """
    answers_df, answers, themes_df, themes = read_func(qname)

    # calculate embeddings
    answers_embeddings = model.encode(answers)
    themes_embeddings = model.encode(themes)

    # calculate the similarity matrix
    similarity = np.array(util.cos_sim(answers_embeddings, themes_embeddings))

    # Get the maximum value in each row and then get the median value for a threshold (could modify this)
    max_values = np.max(similarity, axis=1)
    smax_med = np.percentile(max_values, 50)

    # Create a DataFrame from the similarity array
    df = pd.DataFrame(similarity, columns=themes_df['theme_number'])

    ####################
    # We may want to modify this, e.g., to only take the max value (1 theme per answer)
    ####################
    # take any column whose value is greater than the threshold
    # OR only the column holding the row's max value if no value is above the threshold
    def find_columns(row, threshold):
        # labels of every column whose value is above the threshold
        indices = row.index[row > threshold].tolist()
        if not indices:  # nothing is above the threshold: fall back to the single best column
            best = row.idxmax()
            # idxmax() returns a NumPy scalar for numeric labels; under NumPy >= 2 its repr is
            # "np.int64(3)", which would end up in the string and could not be parsed back with int()
            if isinstance(best, np.generic):
                best = best.item()
            indices = [best]
        return str(indices)

    output_df = pd.DataFrame()
    output_df['theme'] = df.apply(lambda row: find_columns(row, smax_med), axis=1)

    # add the rest of the values from the original df
    # (walk the columns backwards: each one is inserted at position 0, which restores the original order)
    for c in answers_df.columns[::-1]:
        cinsert = c
        if c in output_df.columns:
            # an input column that clashes with one already present (e.g. 'theme') keeps its data under '<name>_org'
            cinsert = c + '_org'
        output_df.insert(0, cinsert, answers_df[c])

    return output_df, themes_df, df

In [None]:
# create a bar chart (similar to my script from latentscope_helper.py)
def create_bar_chart(themes_df, output_df, filename = None):
    """Draw a horizontal bar chart of the fraction of rows labelled with each theme.

    themes_df needs 'theme_number' and 'theme' columns; output_df needs a 'theme'
    column of list strings such as "[1, 4]".  A row carrying several themes counts
    once for each of them, and fractions are relative to the number of rows in
    output_df.  The figure is saved to filename when one is given.

    Returns (f, ax), the matplotlib figure and axes.
    """
    # unpack every "[a, b, ...]" string into its individual theme numbers
    theme_ids = []
    for entry in output_df['theme'].str.strip("[]").str.split(", "):
        theme_ids.extend(int(token) for token in entry)

    # how often each theme number occurs, ordered by theme number
    tally = pd.Series(theme_ids).value_counts().sort_index()

    # table of counts and fractions per theme number
    df_counts = pd.DataFrame({
        'theme_number': tally.index,
        'count': tally.values
    })
    df_counts['frac'] = df_counts['count'] / len(output_df)

    # attach the theme text, then put the most frequent themes first
    ranked = df_counts.merge(themes_df, on='theme_number', how='left')
    ranked = ranked.sort_values(by='frac', ascending=False)

    # create the figure
    f, ax = plt.subplots(figsize = (10,10))
    y_pos = np.arange(len(ranked))
    hbars = ax.barh(y_pos, ranked['frac'], align = 'center')
    percent_labels = [f'{v*100:.0f}%' for v in ranked['frac']]
    ax.bar_label(hbars, labels=percent_labels, fontsize=14)
    ax.set_yticks(y_pos, labels = ranked['theme'])
    ax.invert_yaxis()  # labels read top-to-bottom

    # strip the frame and the x ticks; the percentage labels carry the values
    for side in ('top', 'bottom', 'right'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks([])

    # save it when a file name was given
    if filename is not None:
        f.savefig(filename, bbox_inches = 'tight')

    return f, ax

In [None]:
# process a given question
# get_themes returns (labelled answers, themes table, similarity table); only the first two feed the chart
output_df, themes_df, similarity_df = get_themes("M0")
_ = create_bar_chart(themes_df, output_df)

## See if I can use embeddings to identify duplicates in the original themes list

I don't know if this is better or worse than simply asking ChatGPT :)  Probably the best thing would be for Bennett to go through the list of themes himself and get to a final unique set (without code).

In [None]:
# the reader returns the same table and list twice (it fills the 4-slot layout that get_themes unpacks),
# so keep only the first pair
themes_df, themes = read_themes_and_themes("M0")[:2]
themes


In [None]:
# compare the themes with themselves: with this reader the themes are used as the "answers" too,
# so similarity_df has one row and one column per theme
output_df, themes_df, similarity_df = get_themes("M0", read_func = read_themes_and_themes)
# shade each cell by its similarity value
similarity_df.style.background_gradient(cmap='Blues')

In [None]:
# Boolean mask that is True only strictly above the diagonal, so each pair of themes is
# counted once and a theme is never compared with itself
mask = np.triu(np.ones(similarity_df.shape, dtype=bool), k=1)

# Keep only the masked cells as one long Series (stack() drops the blanked-out cells)
upper_triangle_values = similarity_df.where(mask).stack()

# Distribution of the pairwise similarities
_ = plt.hist(upper_triangle_values, bins=20)

In [None]:
def _merge_overlapping_groups(groups):
    # Merge all groups that share at least one member, directly or through a chain of other groups,
    # e.g. [10, 14, 21, 1] and [21, 12] become one group.  A merged group takes the position of the
    # earliest group it absorbed, and every returned group is a sorted list.
    merged = []  # invariant: the sets in here never share a member
    for group in groups:
        current = set(group)
        untouched = []
        slot = None  # where the merged set goes: the position of the first set it absorbed
        for existing in merged:
            if current & existing:
                current |= existing
                if slot is None:
                    slot = len(untouched)
            else:
                untouched.append(existing)
        if slot is None:
            untouched.append(current)
        else:
            untouched.insert(slot, current)
        merged = untouched
    return [sorted(members) for members in merged]


def combine_dups(qname, threshold = 0.75):
    """Group the themes of one question that look like duplicates of each other.

    The themes are compared with themselves via get_themes(qname, read_func=read_themes_and_themes);
    two themes are linked when their similarity is strictly greater than threshold, and themes
    linked directly or through other themes end up in the same group.  The initial groups, the
    final groups and the text of the themes in each final group are printed.

    Assumes the similarity table is square with rows in the same order as its columns, which is
    how get_themes builds it when the themes are used as both inputs.

    Returns the final groups: a list of sorted lists of theme numbers, where a theme without any
    duplicate forms a group of its own.
    """
    _, themes_df, similarity_df = get_themes(qname, read_func = read_themes_and_themes)

    # Combine the upper triangle mask with the threshold condition
    # (upper triangle only: each pair is looked at once and a theme is never paired with itself)
    upper_triangle_mask = np.triu(np.ones(similarity_df.shape), k=1).astype(bool)
    combined_mask = upper_triangle_mask & (similarity_df > threshold)
    masked_df = similarity_df.where(combined_mask)

    # theme number of each row, looked up from the column labels rather than assumed to be position + 1
    theme_numbers = similarity_df.columns.tolist()

    # iterate through the dataframe to identify the duplicates and create a list
    dups = []
    for position, (_, row) in enumerate(masked_df.iterrows()):
        similar = row.dropna().index.tolist()
        dups.append(similar + [theme_numbers[position]])  # include the original theme that we are comparing to

    print("initial pass : ", dups)

    # there may be cases where themes are identified as being similar to the same theme, but are not
    # included together in a list, e.g., [10, 14, 21, 1] and [21, 12]; combine all of these
    dups_iter = _merge_overlapping_groups(dups)

    print("final list : ", dups_iter)
    print("")
    # now print the themes that are considered duplicates
    for i, indices in enumerate(dups_iter):
        print(i, indices)
        rows = themes_df.loc[themes_df['theme_number'].isin(indices)]
        for _, row in rows.iterrows():
            print(row['theme'])

    return dups_iter

In [None]:
# a threshold below the default of 0.75 lets more pairs of themes count as duplicates
dups_iter = combine_dups("M1", threshold = 0.65)

For OpenAI : https://platform.openai.com/docs/guides/embeddings/embedding-models

In [None]:
# get_themes returns three values (labelled answers, themes table, similarity table);
# unpacking into only two names raises "ValueError: too many values to unpack"
output_df, themes_df, similarity_df = get_themes("M5")
