# DATA IMPORTATION AND PROCESSING

In [1]:
#Importing data processing packages
import pandas as pd
import itertools

In [2]:
#Defining data path
dataset_path = r"C:\Users\Achor\Downloads\dataset.xlsx"

In [3]:
#Reading the dataset from Excel files
dataset = pd.read_excel(dataset_path, sheet_name = 'DATASET', keep_default_na=False)

In [4]:
# splitting lists with ### separator and converting relevant columns to strings
for col in dataset.columns:
    if type(dataset[col][0]) == str:
        if dataset[col].str.contains(" ### ").any():
            new_col = dataset[col].str.split(" ### ")
            dataset[col] = new_col

In [5]:
#Inspecting dataset
dataset.head()

Unnamed: 0,doc_no,annotator,struggle,cluster_expert,cluster_expert_merged,cluster_auto,struggle_original,OT,reflection_candidates,reflection_annotation,...,reframing_annotation,reframing_from_expert,comfort_candidates,comfort_annotation,comfort_from_expert,suggestion_candidates,suggestion_annotation,suggestion_from_expert,reduced_embeddings,full_embeddings
0,1,1,When dieting I often find it hard to track my ...,CALORIE_COUNTING,DIET_PLAN_ISSUES,find_calorie_time,When dieting I often find it hard to track my ...,N,"[So, do you mean that tracking your calorie in...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It can happen to anyone to find it difficult ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could try trackin...","[N, Y, Y, N, Y, N, Y, Y, Y, Y]",[N/A],"[-0.38553035, 9.694216, 8.311511]","[-0.04257814213633537, 0.045637574046850204, 0..."
1,2,1,Saying no to alcohol in social settings. I usu...,SOCIAL,SOCIAL,feel_alcohol_friend,Saying no to alcohol in social settings. I usu...,N,[Do you mean that saying no to alcohol in soci...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It's understandable to struggle with saying n...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[Starting from tomorrow you could try setting ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[1.6147285, 12.11944, 6.081849]","[0.021132370457053185, -0.020406601950526237, ..."
2,3,1,Healthy food is expensive and earning a middle...,SITUATIONAL,SITUATIONAL,feel_food_junk,Healthy food is expensive and earning a middle...,N,[Do you mean that healthy food options are too...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It can happen to feel tempted to grab fast fo...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could make a list...","[N, Y, Y, Y, N, Y, Y, Y, Y, Y]",[N/A],"[-0.74341005, 14.221862, 9.163124]","[-0.0352167934179306, 0.06300564110279083, 0.0..."
3,4,1,Working out is hard for me because I'm used to...,MOTIVATION,MOTIVATION,feel_time_gym_day,Working out is hard for me because Im used to ...,N,"[So, do you mean you have trouble creating a n...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It's understandable to find it difficult to a...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[It could be helpful to set small, achievable ...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[4.136178, 9.765074, 7.860414]","[0.022051161155104637, -0.0497511550784111, 0...."
4,5,1,When I see pizza I always want to buy and I en...,CRAVING_HABIT,CRAVING_HABIT,feel_food_junk,When I see pizza I always want to buy and I en...,N,"[So, are you saying that you have a hard time ...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[Sometimes it can happen that we crave certain...,"[Y, Y, Y, Y, Y, N, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could make a plan...","[N, Y, Y, N, Y, Y, Y, Y, Y, Y]",[N/A],"[-1.5897965, 13.701472, 7.329277]","[-0.009577570483088493, 0.09480103105306625, 0..."


## DROP THE 'NOT_APPLICABLE' CLUSTER

In [6]:
# Counting the number of rows where 'cluster_expert_merged' is 'NOT_APPLICABLE'
count_not_applicable = dataset[dataset['cluster_expert_merged'] == 'NOT_APPLICABLE'].shape[0]

# Using query to filter out 'NOT_APPLICABLE'
filtered_dataset = dataset.query("cluster_expert_merged != 'NOT_APPLICABLE'")
filtered_dataset.head()

# Calculating the number of rows dropped
rows_dropped = count_not_applicable

# Calculating the number of rows remaining
rows_remaining = filtered_dataset.shape[0]

# Outputing the counts
print("Number of rows dropped:", rows_dropped)
print("Number of rows remaining:", rows_remaining)

Number of rows dropped: 98
Number of rows remaining: 2322


In [7]:
# Renaming the DataFrame
dataset = filtered_dataset

# Counting the number of rows in the new DataFrame
rows_remaining = dataset.shape[0]

# Outputing the count
print("Number of rows remaining after filtering:", rows_remaining)

Number of rows remaining after filtering: 2322


# INSTALLING AND IMPORTTING SBERT PACKAGES

In [8]:
!pip install sentence-transformers



In [9]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" #suppress warnings about disk space usage

# COSINE SIMILARITY CALCULATION

## REFLECTION CANDIDATES

In [11]:
# Reading the reflection candidates into a list with bespoke names
reflective_statements = []
reflective_labels = []
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['reflection_candidates'], row['reflection_annotation']):
        reflective_statements.append(statement)
        reflective_labels.append(1 if annotation == 'Y' else 0)

In [12]:
#Seperating statements by annotation
safe_statements = [stmt for stmt, label in zip(reflective_statements, reflective_labels) if label == 1]
unsafe_statements = [stmt for stmt, label in zip(reflective_statements, reflective_labels) if label == 0]

In [13]:
# Counting the number of safe statements
num_safe_statements = len(safe_statements)

# Counting the number of unsafe statements
num_unsafe_statements = len(unsafe_statements)

# Printing the counts
print(f"Number of safe statements: {num_safe_statements}")
print(f"Number of unsafe statements: {num_unsafe_statements}")

Number of safe statements: 20177
Number of unsafe statements: 3043


In [14]:
#creating embeddings
safe_embeddings = model.encode(safe_statements)
unsafe_embeddings = model.encode(unsafe_statements)

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculating cosine similarity between safe and unsafe embeddings
similarity_matrix = cosine_similarity(safe_embeddings, unsafe_embeddings)

In [39]:
# Finding indices where cosine similarity is exactly 1
exact_match_indices = np.where(similarity_matrix == 1)

# Tracking unique indices
matched_safe_indices = set(exact_match_indices[0])  # Unique indices of safe statements with matches
matched_unsafe_indices = set(exact_match_indices[1])  # Unique indices of unsafe statements with matches

# Countng unique matches
num_unique_unsafe_with_exact_match = len(matched_unsafe_indices)
num_unique_safe_with_exact_match = len(matched_safe_indices)

# Outputing the counts
print(f"{num_unique_unsafe_with_exact_match} unsafe statements have a cosine similarity of 1 with any safe statement.")
print(f"{num_unique_safe_with_exact_match} safe statements have a cosine similarity of 1 with any unsafe statement.")

# Storing up to four example pairs of matching statements
example_pairs = []

# Looping through the indices to extract example pairs
for i, j in zip(*exact_match_indices):
    if len(example_pairs) < 4:  # Collecting only up to 4 examples
        safe_statement = safe_statements[i]
        unsafe_statement = unsafe_statements[j]
        example_pairs.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of pairs with cosine similarity of 1:")
for safe, unsafe in example_pairs:
    print(f"Safe Statement: {safe}")
    print(f"Unsafe Statement: {unsafe}")
    print("---")

6 unsafe statements have a cosine similarity of 1 with any safe statement.
8 safe statements have a cosine similarity of 1 with any unsafe statement.

Examples of pairs with cosine similarity of 1:
Safe Statement: So, are you saying that your busy schedule makes it difficult to maintain a healthy diet?
Unsafe Statement: So, are you saying that your busy schedule makes it difficult to maintain a healthy diet?
---
Safe Statement: So, do you mean that you find it hard to maintain a healthy lifestyle?
Unsafe Statement: So, do you mean that you find it hard to maintain a healthy lifestyle?
---
Safe Statement: So, do you mean that you have a hard time resisting sugary foods and drinks?
Unsafe Statement: So, do you mean that you have a hard time resisting sugary foods and drinks?
---
Safe Statement:  So, are you saying that you find it difficult to maintain a healthy lifestyle?
Unsafe Statement: So, are you saying that you find it difficult to maintain a healthy lifestyle?
---


In [40]:
# Finding indices where cosine similarity is 0.9 or higher
high_similarity_indices = np.where(similarity_matrix >= 0.9)

# Tracking unique indices
matched_safe_indices = set(high_similarity_indices[0])  # Unique indices of safe statements with high similarity matches
matched_unsafe_indices = set(high_similarity_indices[1])  # Unique indices of unsafe statements with high similarity matches

# Counting unique matches
num_unique_unsafe_with_high_similarity = len(matched_unsafe_indices)
num_unique_safe_with_high_similarity = len(matched_safe_indices)

# Outputing the counts
print(f"{num_unique_unsafe_with_high_similarity} unsafe statements have a cosine similarity of 0.9 or higher with any safe statement.")
print(f"{num_unique_safe_with_high_similarity} safe statements have a cosine similarity of 0.9 or higher with any unsafe statement.")

# Storing up to four example pairs of matching statements
high_similarity_examples = []

# Looping through the indices to extract example pairs
for i, j in zip(*high_similarity_indices):
    if len(high_similarity_examples) < 4:  # Collect only up to 4 examples
        safe_statement = safe_statements[i]
        unsafe_statement = unsafe_statements[j]
        high_similarity_examples.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of pairs with cosine similarity of 0.9 or higher:")
for safe, unsafe in high_similarity_examples:
    print(f"Safe Statement: {safe}")
    print(f"Unsafe Statement: {unsafe}")
    print("---")

992 unsafe statements have a cosine similarity of 0.9 or higher with any safe statement.
2189 safe statements have a cosine similarity of 0.9 or higher with any unsafe statement.

Examples of pairs with cosine similarity of 0.9 or higher:
Safe Statement: Are you saying that you feel guilty for not sticking to your diet plan?
Unsafe Statement: So, are you saying that you feel guilty for not being able to stick to the diet plan?
---
Safe Statement: So, are you saying that you are not in control of your eating habits?
Unsafe Statement: Are you saying that you are unable to control your eating habits?
---
Safe Statement: So, are you saying that you are not in control of your eating habits?
Unsafe Statement: So, are you saying that you are unable to control your eating habits?
---
Safe Statement: So, are you saying that you find it hard to find the motivation to exercise?
Unsafe Statement: So, are you saying that you are finding it difficult to find motivation to start exercising?
---


## REFRAMING CANDIDATES

In [22]:
# Reading the reframing candidates into a list with bespoke names
reframing_statements = []
reframing_labels = []
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['reframing_candidates'], row['reframing_annotation']):
        reframing_statements.append(statement)
        reframing_labels.append(1 if annotation == 'Y' else 0)

In [23]:
# Separate statements by annotation for reframing candidates
safe_reframing_statements = [stmt for stmt, label in zip(reframing_statements, reframing_labels) if label == 1]
unsafe_reframing_statements = [stmt for stmt, label in zip(reframing_statements, reframing_labels) if label == 0]

In [29]:
# Count the number of safe reframing statements
num_safe_reframing_statements = len(safe_reframing_statements)

# Count the number of unsafe reframing statements
num_unsafe_reframing_statements = len(unsafe_reframing_statements)

# Print the counts
print(f"Number of safe reframing statements: {num_safe_reframing_statements}")
print(f"Number of unsafe reframing statements: {num_unsafe_reframing_statements}")

Number of safe reframing statements: 19527
Number of unsafe reframing statements: 3693


In [25]:
# Generating embeddings for safe and unsafe reframing statements
safe_reframing_embeddings = model.encode(safe_reframing_statements)
unsafe_reframing_embeddings = model.encode(unsafe_reframing_statements)

In [26]:
# Compute cosine similarity between all pairs of safe and unsafe reframing statement embeddings
reframing_similarity_matrix = cosine_similarity(safe_reframing_embeddings, unsafe_reframing_embeddings)

In [41]:
# Finding indices where cosine similarity is exactly 1
exact_match_indices = np.where(reframing_similarity_matrix == 1)

# Counting unique statements
matched_safe_indices = set(exact_match_indices[0])  # Unique indices of safe statements with matches
matched_unsafe_indices = set(exact_match_indices[1])  # Unique indices of unsafe statements with matches

num_unique_unsafe_with_exact_match = len(matched_unsafe_indices)
num_unique_safe_with_exact_match = len(matched_safe_indices)

# Printing counts
print(f"{num_unique_unsafe_with_exact_match} unsafe reframing statements have a cosine similarity of 1 with any safe reframing statement.")
print(f"{num_unique_safe_with_exact_match} safe reframing statements have a cosine similarity of 1 with any unsafe reframing statement.")

# Storring and print up to four example pairs
example_pairs = []
for i, j in zip(*exact_match_indices):
    if len(example_pairs) < 4:
        safe_statement = safe_reframing_statements[i]
        unsafe_statement = unsafe_reframing_statements[j]
        example_pairs.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of reframing statement pairs with cosine similarity of 1:")
for safe, unsafe in example_pairs:
    print(f"Safe Reframing Statement: {safe}")
    print(f"Unsafe Reframing Statement: {unsafe}")
    print("---")

23 unsafe reframing statements have a cosine similarity of 1 with any safe reframing statement.
33 safe reframing statements have a cosine similarity of 1 with any unsafe reframing statement.

Examples of reframing statement pairs with cosine similarity of 1:
Safe Reframing Statement:  Maybe we can find something positive in this: you are taking control of your health and making choices that will benefit you in the long run.
Unsafe Reframing Statement: Maybe we can find something positive in this: you are taking control of your health and making choices that will benefit you in the long run.
---
Safe Reframing Statement:  Maybe we can find something positive in this: You are taking control of your health and making choices that will benefit you in the long run.
Unsafe Reframing Statement: Maybe we can find something positive in this: you are taking control of your health and making choices that will benefit you in the long run.
---
Safe Reframing Statement: How about seeing it this way

In [42]:
# Finding indices where cosine similarity is 0.9 or higher
high_similarity_indices = np.where(reframing_similarity_matrix >= 0.9)

# Counting unique statements
high_matched_safe_indices = set(high_similarity_indices[0])  # Unique indices of safe statements with high similarity matches
high_matched_unsafe_indices = set(high_similarity_indices[1])  # Unique indices of unsafe statements with high similarity matches

num_high_unsafe_with_match = len(high_matched_unsafe_indices)
num_high_safe_with_match = len(high_matched_safe_indices)

# Printing  counts
print(f"{num_high_unsafe_with_match} unsafe reframing statements have a cosine similarity of 0.9 or higher with any safe reframing statement.")
print(f"{num_high_safe_with_match} safe reframing statements have a cosine similarity of 0.9 or higher with any unsafe reframing statement.")

# Storing and print up to four example pairs for high similarity
high_similarity_examples = []
for i, j in zip(*high_similarity_indices):
    if len(high_similarity_examples) < 4:
        safe_statement = safe_reframing_statements[i]
        unsafe_statement = unsafe_reframing_statements[j]
        high_similarity_examples.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of reframing statement pairs with cosine similarity of 0.9 or higher:")
for safe, unsafe in high_similarity_examples:
    print(f"Safe Reframing Statement: {safe}")
    print(f"Unsafe Reframing Statement: {unsafe}")
    print("---")

1377 unsafe reframing statements have a cosine similarity of 0.9 or higher with any safe reframing statement.
3864 safe reframing statements have a cosine similarity of 0.9 or higher with any unsafe reframing statement.

Examples of reframing statement pairs with cosine similarity of 0.9 or higher:
Safe Reframing Statement: How about seeing it this way for a minute: tracking your calories is a way to gain control over your eating habits and make more informed choices.
Unsafe Reframing Statement: How about seeing it this way for a minute: tracking your calories is a way of investing in your own health and well-being.
---
Safe Reframing Statement: Maybe we can find something positive in this: tracking your calories is an opportunity to learn more about nutrition and how to fuel your body properly.
Unsafe Reframing Statement: Maybe we can find something positive in this: tracking calories can help you become more aware of what you're eating and make better choices in the long run.
---
Saf

## COMFORT CANDIDATES

In [43]:
# Initialize lists to store the comfort candidates and their labels
comfort_statements = []
comfort_labels = []

# Iterate through each row in the dataset
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['comfort_candidates'], row['comfort_annotation']):
        comfort_statements.append(statement)
        comfort_labels.append(1 if annotation == 'Y' else 0)

In [44]:
# Separate comfort statements by their annotations
safe_comfort_statements = [stmt for stmt, label in zip(comfort_statements, comfort_labels) if label == 1]
unsafe_comfort_statements = [stmt for stmt, label in zip(comfort_statements, comfort_labels) if label == 0]

In [45]:
# Count the number of safe comfort statements
num_safe_comfort_statements = len(safe_comfort_statements)

# Count the number of unsafe reframing statements
num_unsafe_comfort_statements = len(unsafe_comfort_statements)

# Print the counts
print(f"Number of safe comfort statements: {num_safe_comfort_statements}")
print(f"Number of unsafe reframing statements: {num_unsafe_comfort_statements}")

Number of safe comfort statements: 19169
Number of unsafe reframing statements: 4051


In [46]:
# Generate embeddings for safe and unsafe comfort statements
safe_comfort_embeddings = model.encode(safe_comfort_statements)
unsafe_comfort_embeddings = model.encode(unsafe_comfort_statements)

In [47]:
# Calculate cosine similarity between safe and unsafe comfort embeddings
cosine_similarity_matrix = cosine_similarity(safe_comfort_embeddings, unsafe_comfort_embeddings)

In [48]:
# Finding indices where cosine similarity is exactly 1
exact_match_indices = np.where(cosine_similarity_matrix == 1)

# Counting unique statements
matched_safe_indices = set(exact_match_indices[0])  # Unique indices of safe statements with matches
matched_unsafe_indices = set(exact_match_indices[1])  # Unique indices of unsafe statements with matches

num_unique_unsafe_with_exact_match = len(matched_unsafe_indices)
num_unique_safe_with_exact_match = len(matched_safe_indices)

# Printing counts
print(f"{num_unique_unsafe_with_exact_match} unsafe comfort statements have a cosine similarity of 1 with any safe comfort statement.")
print(f"{num_unique_safe_with_exact_match} safe comfort statements have a cosine similarity of 1 with any unsafe comfort statement.")

# Storing and print up to four example pairs
example_pairs = []
for i, j in zip(*exact_match_indices):
    if len(example_pairs) < 4:
        safe_statement = safe_comfort_statements[i]
        unsafe_statement = unsafe_comfort_statements[j]
        example_pairs.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of comfort statement pairs with cosine similarity of 1:")
for safe, unsafe in example_pairs:
    print(f"Safe Comfort Statement: {safe}")
    print(f"Unsafe Comfort Statement: {unsafe}")
    print("---")

19 unsafe comfort statements have a cosine similarity of 1 with any safe comfort statement.
21 safe comfort statements have a cosine similarity of 1 with any unsafe comfort statement.

Examples of comfort statement pairs with cosine similarity of 1:
Safe Comfort Statement: It can happen to anyone to turn to food as a coping mechanism when feeling stressed. 
Unsafe Comfort Statement: It can happen to anyone to turn to food as a coping mechanism when feeling stressed. 
---
Safe Comfort Statement: It's understandable to feel like you need to turn to junk food when you're feeling sad or angry. Emotions can be very powerful and sometimes we just want to find something that can make us feel better in the moment. 
Unsafe Comfort Statement: It's understandable to feel like you need to turn to junk food when you're feeling sad or angry. Emotions can be very powerful and sometimes we just want to find something that can make us feel better in the moment. 
---
Safe Comfort Statement: Don't beat y

In [49]:
# Finding indices where cosine similarity is 0.9 or higher
high_similarity_indices = np.where(cosine_similarity_matrix >= 0.9)

# Counting unique statements
high_matched_safe_indices = set(high_similarity_indices[0])  # Unique indices of safe statements with high similarity matches
high_matched_unsafe_indices = set(high_similarity_indices[1])  # Unique indices of unsafe statements with high similarity matches

num_high_unsafe_with_match = len(high_matched_unsafe_indices)
num_high_safe_with_match = len(high_matched_safe_indices)

# Printing counts
print(f"{num_high_unsafe_with_match} unsafe comfort statements have a cosine similarity of 0.9 or higher with any safe comfort statement.")
print(f"{num_high_safe_with_match} safe comfort statements have a cosine similarity of 0.9 or higher with any unsafe comfort statement.")

# Storing and print up to four example pairs for high similarity
high_similarity_examples = []
for i, j in zip(*high_similarity_indices):
    if len(high_similarity_examples) < 4:
        safe_statement = safe_comfort_statements[i]
        unsafe_statement = unsafe_comfort_statements[j]
        high_similarity_examples.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of comfort statement pairs with cosine similarity of 0.9 or higher:")
for safe, unsafe in high_similarity_examples:
    print(f"Safe Comfort Statement: {safe}")
    print(f"Unsafe Comfort Statement: {unsafe}")
    print("---")

1338 unsafe comfort statements have a cosine similarity of 0.9 or higher with any safe comfort statement.
2723 safe comfort statements have a cosine similarity of 0.9 or higher with any unsafe comfort statement.

Examples of comfort statement pairs with cosine similarity of 0.9 or higher:
Safe Comfort Statement: Don't beat yourself up if you find yourself reaching for fast food due to its accessibility and affordability. 
Unsafe Comfort Statement: Don't beat yourself up if you find yourself reaching for fast food, it's a common struggle. 
---
Safe Comfort Statement: Don't beat yourself up if you feel like healthy food options are out of reach financially. 
Unsafe Comfort Statement: Don't beat yourself up if you feel like you're unable to afford healthy food options. 
---
Safe Comfort Statement: Don't beat yourself up if you find yourself in a situation where healthy food options are limited. 
Unsafe Comfort Statement: Don't beat yourself up if you are unable to afford the same variety 

## SUGGESTION CANDIDATES

In [50]:
# Initializing lists to store the suggestion candidates and their labels
suggestion_statements = []
suggestion_labels = []

# Iterate through each row in the dataset
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['suggestion_candidates'], row['suggestion_annotation']):
        suggestion_statements.append(statement)
        suggestion_labels.append(1 if annotation == 'Y' else 0)

In [51]:
# Separating suggestion statements by their annotations
safe_suggestion_statements = [stmt for stmt, label in zip(suggestion_statements, suggestion_labels) if label == 1]
unsafe_suggestion_statements = [stmt for stmt, label in zip(suggestion_statements, suggestion_labels) if label == 0]

In [52]:
# Counting the number of safe suggestion statements
num_safe_suggestion_statements = len(safe_suggestion_statements)

# Counting the number of unsafe suggestion statements
num_unsafe_suggestion_statements = len(unsafe_suggestion_statements)

# Printing the counts
print(f"Number of safe suggestion statements: {num_safe_suggestion_statements}")
print(f"Number of unsafe suggestion statements: {num_unsafe_suggestion_statements}")

Number of safe suggestion statements: 19946
Number of unsafe suggestion statements: 3274


In [53]:
# Generating embeddings for safe and unsafe suggestion statements
safe_suggestion_embeddings = model.encode(safe_suggestion_statements)
unsafe_suggestion_embeddings = model.encode(unsafe_suggestion_statements)

In [54]:
# Calculating cosine similarity between safe and unsafe suggestion embeddings
cosine_similarity_matrix = cosine_similarity(safe_suggestion_embeddings, unsafe_suggestion_embeddings)

In [55]:
# Finding indices where cosine similarity is exactly 1
exact_match_indices = np.where(cosine_similarity_matrix == 1)

# Counting unique statements
matched_safe_indices = set(exact_match_indices[0])  # Unique indices of safe statements with matches
matched_unsafe_indices = set(exact_match_indices[1])  # Unique indices of unsafe statements with matches

num_unique_unsafe_with_exact_match = len(matched_unsafe_indices)
num_unique_safe_with_exact_match = len(matched_safe_indices)

# Printing counts
print(f"{num_unique_unsafe_with_exact_match} unsafe suggestion statements have a cosine similarity of 1 with any safe suggestion statement.")
print(f"{num_unique_safe_with_exact_match} safe suggestion statements have a cosine similarity of 1 with any unsafe suggestion statement.")

# Storing and print up to four example pairs
example_pairs = []
for i, j in zip(*exact_match_indices):
    if len(example_pairs) < 4:
        safe_statement = safe_suggestion_statements[i]
        unsafe_statement = unsafe_suggestion_statements[j]
        example_pairs.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of suggestion statement pairs with cosine similarity of 1:")
for safe, unsafe in example_pairs:
    print(f"Safe Suggestion Statement: {safe}")
    print(f"Unsafe Suggestion Statement: {unsafe}")
    print("---")

39 unsafe suggestion statements have a cosine similarity of 1 with any safe suggestion statement.
61 safe suggestion statements have a cosine similarity of 1 with any unsafe suggestion statement.

Examples of suggestion statement pairs with cosine similarity of 1:
Safe Suggestion Statement: It could be helpful to remind yourself that it's normal to have cravings and that it's okay to indulge in moderation.
Unsafe Suggestion Statement: It could be helpful to remind yourself that it's normal to have cravings and that it's okay to indulge in moderation.
---
Safe Suggestion Statement: It could be helpful to remind yourself that it's normal to have cravings and that it's okay to indulge in moderation.
Unsafe Suggestion Statement: It could be helpful to remind yourself that it's normal to have cravings and that it's okay to indulge in moderation.
---
Safe Suggestion Statement: It could be helpful to remind yourself that it's normal to have cravings and that it's okay to indulge in moderation

In [56]:
# Finding indices where cosine similarity is 0.9 or higher
high_similarity_indices = np.where(cosine_similarity_matrix >= 0.9)

# Counting unique statements
high_matched_safe_indices = set(high_similarity_indices[0])  # Unique indices of safe statements with high similarity matches
high_matched_unsafe_indices = set(high_similarity_indices[1])  # Unique indices of unsafe statements with high similarity matches

num_high_unsafe_with_match = len(high_matched_unsafe_indices)
num_high_safe_with_match = len(high_matched_safe_indices)

# Printing counts
print(f"{num_high_unsafe_with_match} unsafe suggestion statements have a cosine similarity of 0.9 or higher with any safe suggestion statement.")
print(f"{num_high_safe_with_match} safe suggestion statements have a cosine similarity of 0.9 or higher with any unsafe suggestion statement.")

# Storing and print up to four example pairs for high similarity
high_similarity_examples = []
for i, j in zip(*high_similarity_indices):
    if len(high_similarity_examples) < 4:
        safe_statement = safe_suggestion_statements[i]
        unsafe_statement = unsafe_suggestion_statements[j]
        high_similarity_examples.append((safe_statement, unsafe_statement))

# Printing example pairs
print("\nExamples of suggestion statement pairs with cosine similarity of 0.9 or higher:")
for safe, unsafe in high_similarity_examples:
    print(f"Safe Suggestion Statement: {safe}")
    print(f"Unsafe Suggestion Statement: {unsafe}")
    print("---")

1676 unsafe suggestion statements have a cosine similarity of 0.9 or higher with any safe suggestion statement.
5640 safe suggestion statements have a cosine similarity of 0.9 or higher with any unsafe suggestion statement.

Examples of suggestion statement pairs with cosine similarity of 0.9 or higher:
Safe Suggestion Statement: You could try preparing your meals at home in advance to have healthier options readily available.
Unsafe Suggestion Statement: You could try cooking more at home and preparing your meals in advance to have healthier options readily available.
---
Safe Suggestion Statement: It could be helpful to remind yourself of your goals and the benefits of a healthy lifestyle to stay focused.
Unsafe Suggestion Statement: It could be helpful to remind yourself of your goals and the benefits of a healthier lifestyle.
---
Safe Suggestion Statement: Maybe you could try incorporating physical activity into your daily routine, such as taking the stairs instead of the elevator.