# DATA IMPORTATION AND PROCESSING

In [1]:
#Importing data processing packages
import pandas as pd
import itertools

In [2]:
#Defining data path
dataset_path = r"C:\Users\Achor\Downloads\dataset.xlsx"

In [3]:
#Reading the dataset from Excel files
dataset = pd.read_excel(dataset_path, sheet_name = 'DATASET', keep_default_na=False)

In [4]:
# splitting lists with ### separator and converting relevant columns to strings
for col in dataset.columns:
    if type(dataset[col][0]) == str:
        if dataset[col].str.contains(" ### ").any():
            new_col = dataset[col].str.split(" ### ")
            dataset[col] = new_col

In [5]:
#Inspecting dataset
dataset.head()

Unnamed: 0,doc_no,annotator,struggle,cluster_expert,cluster_expert_merged,cluster_auto,struggle_original,OT,reflection_candidates,reflection_annotation,...,reframing_annotation,reframing_from_expert,comfort_candidates,comfort_annotation,comfort_from_expert,suggestion_candidates,suggestion_annotation,suggestion_from_expert,reduced_embeddings,full_embeddings
0,1,1,When dieting I often find it hard to track my ...,CALORIE_COUNTING,DIET_PLAN_ISSUES,find_calorie_time,When dieting I often find it hard to track my ...,N,"[So, do you mean that tracking your calorie in...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It can happen to anyone to find it difficult ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could try trackin...","[N, Y, Y, N, Y, N, Y, Y, Y, Y]",[N/A],"[-0.38553035, 9.694216, 8.311511]","[-0.04257814213633537, 0.045637574046850204, 0..."
1,2,1,Saying no to alcohol in social settings. I usu...,SOCIAL,SOCIAL,feel_alcohol_friend,Saying no to alcohol in social settings. I usu...,N,[Do you mean that saying no to alcohol in soci...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It's understandable to struggle with saying n...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[Starting from tomorrow you could try setting ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[1.6147285, 12.11944, 6.081849]","[0.021132370457053185, -0.020406601950526237, ..."
2,3,1,Healthy food is expensive and earning a middle...,SITUATIONAL,SITUATIONAL,feel_food_junk,Healthy food is expensive and earning a middle...,N,[Do you mean that healthy food options are too...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It can happen to feel tempted to grab fast fo...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could make a list...","[N, Y, Y, Y, N, Y, Y, Y, Y, Y]",[N/A],"[-0.74341005, 14.221862, 9.163124]","[-0.0352167934179306, 0.06300564110279083, 0.0..."
3,4,1,Working out is hard for me because I'm used to...,MOTIVATION,MOTIVATION,feel_time_gym_day,Working out is hard for me because Im used to ...,N,"[So, do you mean you have trouble creating a n...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[It's understandable to find it difficult to a...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[It could be helpful to set small, achievable ...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],"[4.136178, 9.765074, 7.860414]","[0.022051161155104637, -0.0497511550784111, 0...."
4,5,1,When I see pizza I always want to buy and I en...,CRAVING_HABIT,CRAVING_HABIT,feel_food_junk,When I see pizza I always want to buy and I en...,N,"[So, are you saying that you have a hard time ...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]",[N/A],[Sometimes it can happen that we crave certain...,"[Y, Y, Y, Y, Y, N, Y, Y, Y, Y]",[N/A],"[Starting from tomorrow, you could make a plan...","[N, Y, Y, N, Y, Y, Y, Y, Y, Y]",[N/A],"[-1.5897965, 13.701472, 7.329277]","[-0.009577570483088493, 0.09480103105306625, 0..."


## DROP 'NOT_APPLICABLE' CLUSTER

In [6]:
# Counting the number of rows where 'cluster_expert_merged' is 'NOT_APPLICABLE'
count_not_applicable = dataset[dataset['cluster_expert_merged'] == 'NOT_APPLICABLE'].shape[0]

# Using query to filter out 'NOT_APPLICABLE'
filtered_dataset = dataset.query("cluster_expert_merged != 'NOT_APPLICABLE'")
filtered_dataset.head()

# Calculating the number of rows dropped
rows_dropped = count_not_applicable

# Calculating the number of rows remaining
rows_remaining = filtered_dataset.shape[0]

# Outputing the counts
print("Number of rows dropped:", rows_dropped)
print("Number of rows remaining:", rows_remaining)

Number of rows dropped: 98
Number of rows remaining: 2322


In [9]:
# Renaming the DataFrame
dataset = filtered_dataset

# Counting the number of rows in the new DataFrame
rows_remaining = dataset.shape[0]

# Outputing the count
print("Number of rows remaining after filtering:", rows_remaining)

Number of rows remaining after filtering: 2322


# INSTALLING AND IMPORTING SBERT PACKAGES

In [7]:
!pip install sentence-transformers



In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" #suppress warnings about disk space usage

# ISOLATING DISSIMILAR REFLECTION CANDIDATES

In [10]:
# Reading the reflection candidates into a list with bespoke names
reflective_statements = []
reflective_labels = []
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['reflection_candidates'], row['reflection_annotation']):
        reflective_statements.append(statement)
        reflective_labels.append(1 if annotation == 'Y' else 0)

In [11]:
#Seperate statements by annotation
safe_statements = [stmt for stmt, label in zip(reflective_statements, reflective_labels) if label == 1]
unsafe_statements = [stmt for stmt, label in zip(reflective_statements, reflective_labels) if label == 0]

In [12]:
# Count the number of safe statements
num_safe_statements = len(safe_statements)

# Count the number of unsafe statements
num_unsafe_statements = len(unsafe_statements)

# Print the counts
print(f"Number of safe statements: {num_safe_statements}")
print(f"Number of unsafe statements: {num_unsafe_statements}")

Number of safe statements: 20177
Number of unsafe statements: 3043


In [13]:
safe_embeddings = model.encode(safe_statements)
unsafe_embeddings = model.encode(unsafe_statements)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between safe and unsafe embeddings
similarity_matrix = cosine_similarity(safe_embeddings, unsafe_embeddings)

In [19]:
import numpy as np

# Count how many similarities are exactly 1
exact_matches = np.sum(similarity_matrix == 1)

# Print the number of exact matches
print(f"There are {exact_matches} pairs of statements with a cosine similarity of 1.")

There are 8 pairs of statements with a cosine similarity of 1.


In [17]:
# Count the number of similarities that are 0.9 or higher
high_similarity_count = np.sum(similarity_matrix >= 0.9)

# Print the count of such pairs
print(f"There are {high_similarity_count} pairs of statements with a cosine similarity of 0.9 or higher.")

There are 4236 pairs of statements with a cosine similarity of 0.9 or higher.


In [18]:
# Create a boolean mask where each value is True if the similarity is 0.9 or higher
high_similarity_mask = similarity_matrix >= 0.9

# Find columns (unsafe statements) that have at least one True value (high similarity with any safe statement)
columns_with_high_similarity = np.any(high_similarity_mask, axis=0)

# Count how many unique unsafe statements have high similarity
num_unsafe_with_high_similarity = np.sum(columns_with_high_similarity)

# Print the count
print(f"There are {num_unsafe_with_high_similarity} unique unsafe statements that have a cosine similarity of 0.9 or higher with at least one safe statement.")

There are 992 unique unsafe statements that have a cosine similarity of 0.9 or higher with at least one safe statement.


## APPLYING THE MINIMIZING THE MAXIMUM TECHNIQUE TO SELECT ISOLOLATE STATEMENTS WITH LOWEST SIMILARITY.

In [21]:
# Calculating the maximum similarity for each safe statement to any unsafe statement
max_similarity_safe_to_unsafe = np.max(similarity_matrix, axis=1)

# Calculating the maximum similarity for each unsafe statement to any safe statement
max_similarity_unsafe_to_safe = np.max(similarity_matrix, axis=0)

# Getting indices of the safe and unsafe statements with the lowest maximum similarities
sorted_indices_safe = np.argsort(max_similarity_safe_to_unsafe)
sorted_indices_unsafe = np.argsort(max_similarity_unsafe_to_safe)

# Selecting the top 2650 safe and unsafe statements with the lowest maximum similarities
selected_safe_indices = sorted_indices_safe[:2650]
selected_unsafe_indices = sorted_indices_unsafe[:2650]

# Retrieving the corresponding statements using these indices
filtered_safe_statements = [safe_statements[i] for i in selected_safe_indices]
filtered_unsafe_statements = [unsafe_statements[j] for j in selected_unsafe_indices]

# Outputting or using these selected statements
print(f"Selected {len(filtered_safe_statements)} safe and {len(filtered_unsafe_statements)} unsafe statements with the lowest similarity to the opposite class.")

# Printing one sample from each class
print("Sample Safe Statement:")
print(filtered_safe_statements[0] if filtered_safe_statements else "No safe statements selected.")

print("\nSample Unsafe Statement:")
print(filtered_unsafe_statements[1] if filtered_unsafe_statements else "No unsafe statements selected.")

Selected 2650 safe and 2650 unsafe statements with the lowest similarity to the opposite class.
Sample Safe Statement:
Do you mean that you have not been able to find reliable information on the internet?

Sample Unsafe Statement:
So, do you mean that your love for Nutella is something that has been present since childhood?


In [22]:
# Selecting specified columns to create a new DataFrame
selected_columns = ['struggle', 'reflection_candidates', 'reflection_annotation']
new_reflection_dataset = dataset[selected_columns].copy()

# Display the first few rows of the new DataFrame to verify
print(new_reflection_dataset.head())

                                            struggle  \
0  When dieting I often find it hard to track my ...   
1  Saying no to alcohol in social settings. I usu...   
2  Healthy food is expensive and earning a middle...   
3  Working out is hard for me because I'm used to...   
4  When I see pizza I always want to buy and I en...   

                               reflection_candidates  \
0  [So, do you mean that tracking your calorie in...   
1  [Do you mean that saying no to alcohol in soci...   
2  [Do you mean that healthy food options are too...   
3  [So, do you mean you have trouble creating a n...   
4  [So, are you saying that you have a hard time ...   

            reflection_annotation  
0  [Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]  
1  [Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]  
2  [Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]  
3  [Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]  
4  [Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]  


In [23]:
# Initialize an empty list to hold the new row data
expanded_rows = []

# Iterate through each row in the DataFrame
for _, row in new_reflection_dataset.iterrows():
    # Extract the row values
    struggle = row['struggle']
    candidates = row['reflection_candidates']
    annotations = row['reflection_annotation']
    
    # Iterating over the index and value of the candidates list
    for index, candidate in enumerate(candidates):
        # Extracting the corresponding annotation using the index
        annotation = annotations[index]
        
        # Appending the extracted data as a new row in expanded_rows list
        expanded_rows.append({'struggle': struggle, 'reflection_candidate': candidate, 'reflection_annotation': annotation})

# Converting the expanded_rows list into a new DataFrame
expanded_dataset = pd.DataFrame(expanded_rows)

# Displaying the first few rows of the expanded DataFrame to verify
print(expanded_dataset.head())

                                            struggle  \
0  When dieting I often find it hard to track my ...   
1  When dieting I often find it hard to track my ...   
2  When dieting I often find it hard to track my ...   
3  When dieting I often find it hard to track my ...   
4  When dieting I often find it hard to track my ...   

                                reflection_candidate reflection_annotation  
0  So, do you mean that tracking your calorie int...                     Y  
1  So, are you saying that it can be tedious to t...                     Y  
2  So, do you mean that you find it challenging t...                     Y  
3  So, do you mean that tracking calories is impo...                     Y  
4  So, do you mean that the lack of calorie infor...                     Y  


In [24]:
# Strip leading and trailing whitespaces from the reflection_annotation column
expanded_dataset['reflection_annotation'] = expanded_dataset['reflection_annotation'].str.strip()

In [25]:
# Combine safe and unsafe statements into a set for quick lookup
selected_statements = set(filtered_safe_statements + filtered_unsafe_statements)
# Initial filter to get rows where reflection_candidates match the selected statements
initial_filtered = expanded_dataset[expanded_dataset['reflection_candidate'].isin(selected_statements)]
# Filter out up to 2650 rows for each 'Y' and 'N' annotation
filtered_Y = initial_filtered[initial_filtered['reflection_annotation'] == 'Y'].sample(n=min(2650, len(initial_filtered[initial_filtered['reflection_annotation'] == 'Y'])), random_state=1)
filtered_N = initial_filtered[initial_filtered['reflection_annotation'] == 'N'].sample(n=min(2650, len(initial_filtered[initial_filtered['reflection_annotation'] == 'N'])), random_state=1)

# Combine the filtered datasets back into a single DataFrame
filtered_expanded_dataset = pd.concat([filtered_Y, filtered_N])

# Print the number of rows in the final filtered dataset
print(f"Number of 'Y' rows: {filtered_Y.shape[0]}")
print(f"Number of 'N' rows: {filtered_N.shape[0]}")
print(f"Total rows in filtered dataset: {filtered_expanded_dataset.shape[0]}")

Number of 'Y' rows: 2650
Number of 'N' rows: 2650
Total rows in filtered dataset: 5300


In [26]:
filtered_expanded_dataset.head()

Unnamed: 0,struggle,reflection_candidate,reflection_annotation
153,Cooking takes too much time (getting the groce...,"So, are you saying that you live in a small st...",Y
10848,"I am often off-put by the pain of working out,...","So, do you mean that you are not sure what you...",Y
17272,I struggle with cravings especially sweets or ...,"So, are you saying that your family's words ma...",Y
10004,Sometimes when I feel really stressed at work ...,"So, are you saying that when you're feeling st...",Y
6159,"Whenever I go grocery shopping, I see a lot of...",Are you saying that your grocery shopping exp...,Y


In [27]:
filtered_expanded_dataset.tail()

Unnamed: 0,struggle,reflection_candidate,reflection_annotation
3901,"It's hard to keep going for walks, or to a gym...","So, are you saying that physical fatigue is a ...",N
3779,"When I just finish a workout, I feel that some...","So, do you mean that the feeling of having ""e...",N
4249,I can't workout systematically. I need to plan...,"So, do you mean that the need to plan your ha...",N
1998,Sometime when I'm finally on a good path depre...,Are you saying that depression can make it har...,N
4144,"I always have a snack before going to bed, whi...",Do you mean that the timing of your snack befo...,N


## FORMATTING DATAFRAME FOR INSTRUCTION TUNING GEMMA

In [28]:
# Counting the occurrences of each annotation in the reflection_annotation column
annotation_counts = filtered_expanded_dataset['reflection_annotation'].value_counts()

# Printing the counts
print("Distribution of annotations in the reflection_annotation column:")
print(annotation_counts)

Distribution of annotations in the reflection_annotation column:
reflection_annotation
Y    2650
N    2650
Name: count, dtype: int64


In [29]:
# Creating the 'input' column with formatted strings
filtered_expanded_dataset['input'] = 'Struggle: ' + filtered_expanded_dataset['struggle'].astype(str) + \
                                     ' Supportive Text: ' + filtered_expanded_dataset['reflection_candidate'].astype(str)
# Moving the 'input' column to the first position
column_order = ['input'] + [col for col in filtered_expanded_dataset.columns if col != 'input']
filtered_expanded_dataset = filtered_expanded_dataset[column_order]
# Renaming the DataFrame
balanced_reflection_dataset = filtered_expanded_dataset
# displaying the first few rows to verify the new DataFrame
balanced_reflection_dataset.head()

Unnamed: 0,input,struggle,reflection_candidate,reflection_annotation
153,Struggle: Cooking takes too much time (getting...,Cooking takes too much time (getting the groce...,"So, are you saying that you live in a small st...",Y
10848,Struggle: I am often off-put by the pain of wo...,"I am often off-put by the pain of working out,...","So, do you mean that you are not sure what you...",Y
17272,Struggle: I struggle with cravings especially ...,I struggle with cravings especially sweets or ...,"So, are you saying that your family's words ma...",Y
10004,Struggle: Sometimes when I feel really stresse...,Sometimes when I feel really stressed at work ...,"So, are you saying that when you're feeling st...",Y
6159,"Struggle: Whenever I go grocery shopping, I se...","Whenever I go grocery shopping, I see a lot of...",Are you saying that your grocery shopping exp...,Y


In [30]:
# Creating the 'output' column using .loc
balanced_reflection_dataset.loc[:, 'output'] = balanced_reflection_dataset['reflection_annotation'].apply(
    lambda x: "This supportive text is safe" if x == 'Y' else "This supportive text is unsafe")

# Creating a list of all columns except 'input' and 'output'
columns_except_input_output = [col for col in balanced_reflection_dataset.columns if col not in ['input', 'output']]

# Now, define the new column order with 'input' first, 'output' second, and then the rest
new_column_order = ['input', 'output'] + columns_except_input_output

# Reordering the DataFrame according to the new column order
balanced_reflection_dataset = balanced_reflection_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_reflection_dataset.head()


Unnamed: 0,input,output,struggle,reflection_candidate,reflection_annotation
153,Struggle: Cooking takes too much time (getting...,This supportive text is safe,Cooking takes too much time (getting the groce...,"So, are you saying that you live in a small st...",Y
10848,Struggle: I am often off-put by the pain of wo...,This supportive text is safe,"I am often off-put by the pain of working out,...","So, do you mean that you are not sure what you...",Y
17272,Struggle: I struggle with cravings especially ...,This supportive text is safe,I struggle with cravings especially sweets or ...,"So, are you saying that your family's words ma...",Y
10004,Struggle: Sometimes when I feel really stresse...,This supportive text is safe,Sometimes when I feel really stressed at work ...,"So, are you saying that when you're feeling st...",Y
6159,"Struggle: Whenever I go grocery shopping, I se...",This supportive text is safe,"Whenever I go grocery shopping, I see a lot of...",Are you saying that your grocery shopping exp...,Y


In [35]:
# Copy the DataFrame
balanced_reflection_dataset = balanced_reflection_dataset.copy()

# Modify dataframe to add instruction
balanced_reflection_dataset['instruction'] = "Classify the Supportive Text as either safe or unsafe in relation to the Struggle."

#Defining column order
columns_before_instruction = ['input', 'output']
column_after_instruction = [col for col in balanced_reflection_dataset.columns if col not in columns_before_instruction + ['instruction']]
new_column_order = columns_before_instruction + ['instruction'] + column_after_instruction

# Reordering column
balanced_reflection_dataset = balanced_reflection_dataset[new_column_order]

#Displaying the first few rows 
balanced_reflection_dataset.head()

Unnamed: 0,input,output,instruction,struggle,reflection_candidate,reflection_annotation
153,Struggle: Cooking takes too much time (getting...,This supportive text is safe,Classify the Supportive Text as either safe or...,Cooking takes too much time (getting the groce...,"So, are you saying that you live in a small st...",Y
10848,Struggle: I am often off-put by the pain of wo...,This supportive text is safe,Classify the Supportive Text as either safe or...,"I am often off-put by the pain of working out,...","So, do you mean that you are not sure what you...",Y
17272,Struggle: I struggle with cravings especially ...,This supportive text is safe,Classify the Supportive Text as either safe or...,I struggle with cravings especially sweets or ...,"So, are you saying that your family's words ma...",Y
10004,Struggle: Sometimes when I feel really stresse...,This supportive text is safe,Classify the Supportive Text as either safe or...,Sometimes when I feel really stressed at work ...,"So, are you saying that when you're feeling st...",Y
6159,"Struggle: Whenever I go grocery shopping, I se...",This supportive text is safe,Classify the Supportive Text as either safe or...,"Whenever I go grocery shopping, I see a lot of...",Are you saying that your grocery shopping exp...,Y


In [36]:
balanced_reflection_dataset.head()

Unnamed: 0,input,output,instruction,struggle,reflection_candidate,reflection_annotation
153,Struggle: Cooking takes too much time (getting...,This supportive text is safe,Classify the Supportive Text as either safe or...,Cooking takes too much time (getting the groce...,"So, are you saying that you live in a small st...",Y
10848,Struggle: I am often off-put by the pain of wo...,This supportive text is safe,Classify the Supportive Text as either safe or...,"I am often off-put by the pain of working out,...","So, do you mean that you are not sure what you...",Y
17272,Struggle: I struggle with cravings especially ...,This supportive text is safe,Classify the Supportive Text as either safe or...,I struggle with cravings especially sweets or ...,"So, are you saying that your family's words ma...",Y
10004,Struggle: Sometimes when I feel really stresse...,This supportive text is safe,Classify the Supportive Text as either safe or...,Sometimes when I feel really stressed at work ...,"So, are you saying that when you're feeling st...",Y
6159,"Struggle: Whenever I go grocery shopping, I se...",This supportive text is safe,Classify the Supportive Text as either safe or...,"Whenever I go grocery shopping, I see a lot of...",Are you saying that your grocery shopping exp...,Y


In [37]:
# Define the full file path, including the directory and the file name
csv_file_path = r"C:\Users\Achor\Downloads\balanced_reflection_dataset.csv"

# Save the DataFrame to a CSV file at the specified path
balanced_reflection_dataset.to_csv(csv_file_path, index=False)

# Print a confirmation message
print(f"DataFrame has been saved to {csv_file_path}")

DataFrame has been saved to C:\Users\Achor\Downloads\balanced_reflection_dataset.csv


# ISOLATING DISSIMILAR REFRAMING CANDIDATES

In [38]:
# Reading the reframing candidates into a list with bespoke names
reframing_statements = []
reframing_labels = []
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['reframing_candidates'], row['reframing_annotation']):
        reframing_statements.append(statement)
        reframing_labels.append(1 if annotation == 'Y' else 0)


In [39]:
# Separating statements by annotation for reframing candidates
safe_reframing_statements = [stmt for stmt, label in zip(reframing_statements, reframing_labels) if label == 1]
unsafe_reframing_statements = [stmt for stmt, label in zip(reframing_statements, reframing_labels) if label == 0]


In [40]:
# Counting the number of safe reframing statements
num_safe_reframing_statements = len(safe_reframing_statements)

# Counting the number of unsafe reframing statements
num_unsafe_reframing_statements = len(unsafe_reframing_statements)

# Printing the counts
print(f"Number of safe reframing statements: {num_safe_reframing_statements}")
print(f"Number of unsafe reframing statements: {num_unsafe_reframing_statements}")


Number of safe reframing statements: 19527
Number of unsafe reframing statements: 3693


In [41]:
# Generate embeddings for safe and unsafe reframing statements
safe_reframing_embeddings = model.encode(safe_reframing_statements)
unsafe_reframing_embeddings = model.encode(unsafe_reframing_statements)

In [43]:
# Calculate cosine similarity between safe and unsafe reframing embeddings
similarity_matrix = cosine_similarity(safe_reframing_embeddings, unsafe_reframing_embeddings)

In [44]:
# Create a boolean mask where each value is True if the similarity is 0.9 or higher
high_similarity_mask = similarity_matrix >= 0.9

# Find columns (unsafe statements) that have at least one True value (high similarity with any safe statement)
columns_with_high_similarity = np.any(high_similarity_mask, axis=0)

# Count how many unique unsafe statements have high similarity
num_unsafe_with_high_similarity = np.sum(columns_with_high_similarity)

# Print the count
print(f"There are {num_unsafe_with_high_similarity} unique unsafe statements that have a cosine similarity of 0.9 or higher with at least one safe statement.")

There are 1377 unique unsafe statements that have a cosine similarity of 0.9 or higher with at least one safe statement.


## APPLYING THE MINIMIZING THE MAXIMUM TECHNIQUE TO SELECT ISOLOLATE STATEMENTS WITH LOWEST SIMILARITY.

In [45]:
# Calculating the maximum similarity for each safe statement to any unsafe statement
max_similarity_safe_to_unsafe = np.max(similarity_matrix, axis=1)

# Calculating the maximum similarity for each unsafe statement to any safe statement
max_similarity_unsafe_to_safe = np.max(similarity_matrix, axis=0)

# Getting indices of the safe and unsafe statements with the lowest maximum similarities
sorted_indices_safe = np.argsort(max_similarity_safe_to_unsafe)
sorted_indices_unsafe = np.argsort(max_similarity_unsafe_to_safe)

# Selecting the top 2650 safe and unsafe statements with the lowest maximum similarities
selected_safe_indices = sorted_indices_safe[:2650]
selected_unsafe_indices = sorted_indices_unsafe[:2650]

# Retrieving the corresponding statements using these indices
filtered_safe_reframing_statements = [safe_reframing_statements[i] for i in selected_safe_indices]
filtered_unsafe_reframing_statements = [unsafe_reframing_statements[j] for j in selected_unsafe_indices]

# Outputting or using these selected statements
print(f"Selected {len(filtered_safe_reframing_statements)} safe and {len(filtered_unsafe_reframing_statements)} unsafe statements with the lowest similarity to the opposite class.")

# Printing one sample from each class
print("Sample Safe Statement:")
print(filtered_safe_reframing_statements[0] if filtered_safe_reframing_statements else "No safe statements selected.")

print("\nSample Unsafe Statement:")
print(filtered_unsafe_reframing_statements[1] if filtered_unsafe_reframing_statements else "No unsafe statements selected.")

Selected 2650 safe and 2650 unsafe statements with the lowest similarity to the opposite class.
Sample Safe Statement:
A more positive way to think about this could be that arguing with your girlfriend allows you to work through any issues and potentially improve your relationship.

Sample Unsafe Statement:
How about seeing it this way for a minute: you already have a great base to start from with being under your ideal weight.


In [46]:
# Selecting specified columns to create a new DataFrame focused on reframing candidates
selected_columns = ['struggle', 'reframing_candidates', 'reframing_annotation']
new_reframing_dataset = dataset[selected_columns].copy()

# Displaying the first few rows of the new DataFrame to verify
new_reframing_dataset.head()

Unnamed: 0,struggle,reframing_candidates,reframing_annotation
0,When dieting I often find it hard to track my ...,[How about seeing it this way for a minute: tr...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
1,Saying no to alcohol in social settings. I usu...,[Something positive you could consider about t...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
2,Healthy food is expensive and earning a middle...,[How about seeing it this way for a minute: Ma...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
3,Working out is hard for me because I'm used to...,[Maybe we can find something positive in this:...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
4,When I see pizza I always want to buy and I en...,[Maybe we can find something positive in this:...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"


In [47]:
# Initialize an empty list to hold the new row data
expanded_rows = []

# Iterating through each row in the new_reframing_dataset DataFrame
for _, row in new_reframing_dataset.iterrows():
    # Extract the row values
    struggle = row['struggle']
    candidates = row['reframing_candidates']
    annotations = row['reframing_annotation']
    
    # Iterating over the index and value of the candidates list
    for index, candidate in enumerate(candidates):
        # Extract the corresponding annotation using the index
        annotation = annotations[index]
        
        # Append the extracted data as a new row in expanded_rows list
        expanded_rows.append({'struggle': struggle, 'reframing_candidate': candidate, 'reframing_annotation': annotation})

# Converting the expanded_rows list into a new DataFrame
expanded_reframing_dataset = pd.DataFrame(expanded_rows)

# Displaying the first few rows of the expanded reframing DataFrame to verify
expanded_reframing_dataset.head()

Unnamed: 0,struggle,reframing_candidate,reframing_annotation
0,When dieting I often find it hard to track my ...,How about seeing it this way for a minute: tra...,Y
1,When dieting I often find it hard to track my ...,How about seeing it this way for a minute: tra...,Y
2,When dieting I often find it hard to track my ...,Something positive you could consider about th...,Y
3,When dieting I often find it hard to track my ...,Something positive you could consider about th...,Y
4,When dieting I often find it hard to track my ...,Maybe we can find something positive in this: ...,Y


In [48]:
# Stripping leading and trailing whitespaces from the reframing_annotation column
expanded_reframing_dataset['reframing_annotation'] = expanded_reframing_dataset['reframing_annotation'].str.strip()

In [50]:
# Combining safe and unsafe reframing statements into a set for quick lookup
selected_statements = set(filtered_safe_reframing_statements + filtered_unsafe_reframing_statements)

# Initial filter to get rows where reframing_candidates match the selected statements
initial_filtered = expanded_reframing_dataset[expanded_reframing_dataset['reframing_candidate'].isin(selected_statements)]

# Filter out up to 2650 rows for each 'Y' and 'N' annotation
filtered_Y = initial_filtered[initial_filtered['reframing_annotation'] == 'Y'].sample(n=min(2650, len(initial_filtered[initial_filtered['reframing_annotation'] == 'Y'])), random_state=1)
filtered_N = initial_filtered[initial_filtered['reframing_annotation'] == 'N'].sample(n=min(2650, len(initial_filtered[initial_filtered['reframing_annotation'] == 'N'])), random_state=1)

# Combining the filtered datasets back into a single DataFrame
filtered_expanded_reframing_dataset = pd.concat([filtered_Y, filtered_N])

# Printing the number of rows in the final filtered dataset
print(f"Number of 'Y' rows: {filtered_Y.shape[0]}")
print(f"Number of 'N' rows: {filtered_N.shape[0]}")
print(f"Total rows in filtered dataset: {filtered_expanded_reframing_dataset.shape[0]}")
filtered_expanded_reframing_dataset.head()

Number of 'Y' rows: 2650
Number of 'N' rows: 2650
Total rows in filtered dataset: 5300


Unnamed: 0,struggle,reframing_candidate,reframing_annotation
192,"I struggle to avoid OH, specially bread. Refin...",Something positive you could consider about th...,Y
11043,"Working out is fun, but finding the motivation...",A more positive way to think about this could ...,Y
17334,When I have more chores is hard for me to work...,Maybe we can find something positive in this: ...,Y
10283,I can't take dinner when I am nervous.,Something positive you could consider about th...,Y
5407,"After a few days, the training sessions become...",How about seeing it this way for a minute: Ins...,Y


## FORMATTING DATAFRAME FOR INSTRUCTION TUNING GEMMA

In [51]:
# Creating the 'input' column with formatted strings for reframing dataset
filtered_expanded_reframing_dataset['input'] = 'Struggle: ' + filtered_expanded_reframing_dataset['struggle'].astype(str) + \
                                               ' Supportive Text: ' + filtered_expanded_reframing_dataset['reframing_candidate'].astype(str)

# Moving the 'input' column to the first position
column_order = ['input'] + [col for col in filtered_expanded_reframing_dataset.columns if col != 'input']
filtered_expanded_reframing_dataset = filtered_expanded_reframing_dataset[column_order]

# Renaming the DataFrame to balanced_reframing_dataset
balanced_reframing_dataset = filtered_expanded_reframing_dataset

# Displaying the first few rows to verify the new DataFrame
balanced_reframing_dataset.head()


Unnamed: 0,input,struggle,reframing_candidate,reframing_annotation
192,"Struggle: I struggle to avoid OH, specially br...","I struggle to avoid OH, specially bread. Refin...",Something positive you could consider about th...,Y
11043,"Struggle: Working out is fun, but finding the ...","Working out is fun, but finding the motivation...",A more positive way to think about this could ...,Y
17334,Struggle: When I have more chores is hard for ...,When I have more chores is hard for me to work...,Maybe we can find something positive in this: ...,Y
10283,Struggle: I can't take dinner when I am nervou...,I can't take dinner when I am nervous.,Something positive you could consider about th...,Y
5407,"Struggle: After a few days, the training sessi...","After a few days, the training sessions become...",How about seeing it this way for a minute: Ins...,Y


In [52]:
# Creating the 'output' column using .loc to avoid SettingWithCopyWarning
balanced_reframing_dataset.loc[:, 'output'] = balanced_reframing_dataset['reframing_annotation'].apply(
    lambda x: "This supportive text is safe" if x == 'Y' else "This supportive text is unsafe")

# creating a list of all columns except 'input' and 'output'
columns_except_input_output = [col for col in balanced_reframing_dataset.columns if col not in ['input', 'output']]

# Defining the new column order with 'input' first, 'output' second, and then the rest
new_column_order = ['input', 'output'] + columns_except_input_output

# Reordering the DataFrame according to the new column order
balanced_reframing_dataset = balanced_reframing_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_reframing_dataset.head()


Unnamed: 0,input,output,struggle,reframing_candidate,reframing_annotation
192,"Struggle: I struggle to avoid OH, specially br...",This supportive text is safe,"I struggle to avoid OH, specially bread. Refin...",Something positive you could consider about th...,Y
11043,"Struggle: Working out is fun, but finding the ...",This supportive text is safe,"Working out is fun, but finding the motivation...",A more positive way to think about this could ...,Y
17334,Struggle: When I have more chores is hard for ...,This supportive text is safe,When I have more chores is hard for me to work...,Maybe we can find something positive in this: ...,Y
10283,Struggle: I can't take dinner when I am nervou...,This supportive text is safe,I can't take dinner when I am nervous.,Something positive you could consider about th...,Y
5407,"Struggle: After a few days, the training sessi...",This supportive text is safe,"After a few days, the training sessions become...",How about seeing it this way for a minute: Ins...,Y


In [53]:
# Copy the DataFrame
balanced_reframing_dataset = balanced_reframing_dataset.copy()

# Add the 'instruction' column with the specified text for all rows
balanced_reframing_dataset['instruction'] = "Classify the Supportive Text as either safe or unsafe in relation to the Struggle."

# Define the new column order
columns_before_instruction = ['input', 'output']
column_after_instruction = [col for col in balanced_reframing_dataset.columns if col not in columns_before_instruction + ['instruction']]
new_column_order = columns_before_instruction + ['instruction'] + column_after_instruction

# Reordering the DataFrame according to the new column order
balanced_reframing_dataset = balanced_reframing_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_reframing_dataset.head()

Unnamed: 0,input,output,instruction,struggle,reframing_candidate,reframing_annotation
192,"Struggle: I struggle to avoid OH, specially br...",This supportive text is safe,Classify the Supportive Text as either safe or...,"I struggle to avoid OH, specially bread. Refin...",Something positive you could consider about th...,Y
11043,"Struggle: Working out is fun, but finding the ...",This supportive text is safe,Classify the Supportive Text as either safe or...,"Working out is fun, but finding the motivation...",A more positive way to think about this could ...,Y
17334,Struggle: When I have more chores is hard for ...,This supportive text is safe,Classify the Supportive Text as either safe or...,When I have more chores is hard for me to work...,Maybe we can find something positive in this: ...,Y
10283,Struggle: I can't take dinner when I am nervou...,This supportive text is safe,Classify the Supportive Text as either safe or...,I can't take dinner when I am nervous.,Something positive you could consider about th...,Y
5407,"Struggle: After a few days, the training sessi...",This supportive text is safe,Classify the Supportive Text as either safe or...,"After a few days, the training sessions become...",How about seeing it this way for a minute: Ins...,Y


In [54]:
# Define the full file path, including the directory and the file name
csv_file_path = r"C:\Users\Achor\Downloads\balanced_reframing_dataset.csv"

# Save the DataFrame to a CSV file at the specified path
balanced_reframing_dataset.to_csv(csv_file_path, index=False)

# Print a confirmation message
print(f"DataFrame has been saved to {csv_file_path}")

DataFrame has been saved to C:\Users\Achor\Downloads\balanced_reframing_dataset.csv


# ISOLATING DISSIMILAR COMFORT CANDIDATES

In [55]:
# Initializing lists to store the comfort candidates and their labels
comfort_statements = []
comfort_labels = []

# Iterating through each row in the dataset
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['comfort_candidates'], row['comfort_annotation']):
        comfort_statements.append(statement)
        comfort_labels.append(1 if annotation == 'Y' else 0)

In [56]:
# Separating comfort statements by their annotations
safe_comfort_statements = [stmt for stmt, label in zip(comfort_statements, comfort_labels) if label == 1]
unsafe_comfort_statements = [stmt for stmt, label in zip(comfort_statements, comfort_labels) if label == 0]

In [57]:
# Counting the number of safe comfort statements
num_safe_comfort_statements = len(safe_comfort_statements)

# Counting the number of unsafe reframing statements
num_unsafe_comfort_statements = len(unsafe_comfort_statements)

# Printing the counts
print(f"Number of safe comfort statements: {num_safe_comfort_statements}")
print(f"Number of unsafe reframing statements: {num_unsafe_comfort_statements}")

Number of safe comfort statements: 19169
Number of unsafe reframing statements: 4051


In [58]:
# Generate embeddings for safe and unsafe comfort statements
safe_comfort_embeddings = model.encode(safe_comfort_statements)
unsafe_comfort_embeddings = model.encode(unsafe_comfort_statements)

In [59]:
# Calculate cosine similarity between safe and unsafe comfort embeddings
cosine_similarity_matrix = cosine_similarity(safe_comfort_embeddings, unsafe_comfort_embeddings)

In [60]:
import numpy as np
# Create a boolean mask where each value is True if the similarity is 0.9 or higher
high_similarity_mask = cosine_similarity_matrix >= 0.9

# Find columns (unsafe statements) that have at least one True value (high similarity with any safe statement)
columns_with_high_similarity = np.any(high_similarity_mask, axis=0)

# Count how many unique unsafe statements have high similarity
num_unsafe_with_high_similarity = np.sum(columns_with_high_similarity)

# Print the count
print(f"There are {num_unsafe_with_high_similarity} unique unsafe comfort statements that have a cosine similarity of 0.9 or higher with at least one safe statement.")

There are 1338 unique unsafe comfort statements that have a cosine similarity of 0.9 or higher with at least one safe statement.


## APPLYING THE MINIMIZING THE MAXIMUM TECHNIQUE TO SELECT ISOLOLATE STATEMENTS WITH LOWEST SIMILARITY.

In [61]:
# Calculating the maximum similarity for each safe statement to any unsafe statement
max_similarity_safe_to_unsafe = np.max(cosine_similarity_matrix, axis=1)

# Calculating the maximum similarity for each unsafe statement to any safe statement
max_similarity_unsafe_to_safe = np.max(cosine_similarity_matrix, axis=0)

# Getting indices of the safe and unsafe statements with the lowest maximum similarities
sorted_indices_safe = np.argsort(max_similarity_safe_to_unsafe)
sorted_indices_unsafe = np.argsort(max_similarity_unsafe_to_safe)

# Selecting the top 2650 safe and unsafe statements with the lowest maximum similarities
selected_safe_indices = sorted_indices_safe[:2650]
selected_unsafe_indices = sorted_indices_unsafe[:2650]

# Retrieving the corresponding statements using these indices
filtered_safe_comfort_statements = [safe_comfort_statements[i] for i in selected_safe_indices]
filtered_unsafe_comfort_statements = [unsafe_comfort_statements[j] for j in selected_unsafe_indices]

# Outputting or using these selected statements
print(f"Selected {len(filtered_safe_comfort_statements)} safe and {len(filtered_unsafe_comfort_statements)} unsafe statements with the lowest similarity to the opposite class.")

# Printing one sample from each class
print("Sample Safe Statement:")
print(filtered_safe_comfort_statements[0] if filtered_safe_comfort_statements else "No safe statements selected.")

print("\nSample Unsafe Statement:")
print(filtered_unsafe_comfort_statements[1] if filtered_unsafe_comfort_statements else "No unsafe statements selected.")

Selected 2650 safe and 2650 unsafe statements with the lowest similarity to the opposite class.
Sample Safe Statement:
It's understandable to feel out of breath quickly when you smoke, as it can impact lung function. 

Sample Unsafe Statement:
 It's understandable to want to take care of your hair, it's normal to have those moments. 


In [62]:
# Selecting specified columns to create a new DataFrame focused on comfort candidates
selected_columns = ['struggle', 'comfort_candidates', 'comfort_annotation']
new_comfort_dataset = dataset[selected_columns].copy()

# Displaying the first few rows of the new DataFrame to verify
new_comfort_dataset.head()

Unnamed: 0,struggle,comfort_candidates,comfort_annotation
0,When dieting I often find it hard to track my ...,[It can happen to anyone to find it difficult ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
1,Saying no to alcohol in social settings. I usu...,[It's understandable to struggle with saying n...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
2,Healthy food is expensive and earning a middle...,[It can happen to feel tempted to grab fast fo...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
3,Working out is hard for me because I'm used to...,[It's understandable to find it difficult to a...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
4,When I see pizza I always want to buy and I en...,[Sometimes it can happen that we crave certain...,"[Y, Y, Y, Y, Y, N, Y, Y, Y, Y]"


In [63]:
# Initializing an empty list to hold the new row data
expanded_rows = []

# Iterating through each row in the new_comfort_dataset DataFrame
for _, row in new_comfort_dataset.iterrows():
    # Extract the row values
    struggle = row['struggle']
    candidates = row['comfort_candidates']
    annotations = row['comfort_annotation']
    
    # Iterating over the index and value of the candidates list
    for index, candidate in enumerate(candidates):
        # Extract the corresponding annotation using the index
        annotation = annotations[index]
        
        # Append the extracted data as a new row in expanded_rows list
        expanded_rows.append({'struggle': struggle, 'comfort_candidate': candidate, 'comfort_annotation': annotation})

# Converting the expanded_rows list into a new DataFrame
expanded_comfort_dataset = pd.DataFrame(expanded_rows)

# Displaying the first few rows of the expanded comfort DataFrame to verify
expanded_comfort_dataset.head()

Unnamed: 0,struggle,comfort_candidate,comfort_annotation
0,When dieting I often find it hard to track my ...,It can happen to anyone to find it difficult t...,Y
1,When dieting I often find it hard to track my ...,It can happen to find it hard to maintain a ca...,Y
2,When dieting I often find it hard to track my ...,It can happen to feel frustrated when trying t...,Y
3,When dieting I often find it hard to track my ...,Don't beat yourself up if tracking every calor...,Y
4,When dieting I often find it hard to track my ...,Don't beat yourself up if you struggle to main...,Y


In [64]:
print("Number of rows in expanded_comfort_dataset:", expanded_comfort_dataset.shape[0])

Number of rows in expanded_comfort_dataset: 23220


In [65]:
# Striping leading and trailing whitespaces from the comfort_annotation column
expanded_comfort_dataset['comfort_annotation'] = expanded_comfort_dataset['comfort_annotation'].str.strip()

In [66]:
# Combining safe and unsafe comfort statements into a set for quick lookup
selected_statements = set(filtered_safe_comfort_statements + filtered_unsafe_comfort_statements)

# Initial filter to get rows where comfort_candidates match the selected statements
initial_filtered = expanded_comfort_dataset[expanded_comfort_dataset['comfort_candidate'].isin(selected_statements)]

# Filter out up to 2650 rows for each 'Y' and 'N' annotation
filtered_Y = initial_filtered[initial_filtered['comfort_annotation'] == 'Y'].sample(n=min(2650, len(initial_filtered[initial_filtered['comfort_annotation'] == 'Y'])), random_state=1)
filtered_N = initial_filtered[initial_filtered['comfort_annotation'] == 'N'].sample(n=min(2650, len(initial_filtered[initial_filtered['comfort_annotation'] == 'N'])), random_state=1)

# Combining the filtered datasets back into a single DataFrame
filtered_expanded_comfort_dataset = pd.concat([filtered_Y, filtered_N])

# Printing the number of rows in the final filtered dataset
print(f"Number of 'Y' rows: {filtered_Y.shape[0]}")
print(f"Number of 'N' rows: {filtered_N.shape[0]}")
print(f"Total rows in filtered dataset: {filtered_expanded_comfort_dataset.shape[0]}")
filtered_expanded_comfort_dataset.head()

Number of 'Y' rows: 2650
Number of 'N' rows: 2650
Total rows in filtered dataset: 5300


Unnamed: 0,struggle,comfort_candidate,comfort_annotation
171,Preparing healthy foods comes at more of an ex...,It can happen to feel overwhelmed by the cost ...,Y
10917,The Christmas holidays are just around the cor...,"It can happen to anyone, to find it challengin...",Y
17330,When I have more chores is hard for me to work...,It's understandable to feel tired and lazy whe...,Y
10156,My family is very numerous from both my dad's ...,It's understandable to want to enjoy the foods...,Y
5769,"Since I have little free time and energy, I us...",Sometimes it can happen that we don't have en...,Y


## FORMATTING DATAFRAME FOR INSTRUCTION TUNING GEMMA

In [67]:
# Creating the 'input' column with formatted strings for comfort dataset
filtered_expanded_comfort_dataset['input'] = 'Struggle: ' + filtered_expanded_comfort_dataset['struggle'].astype(str) + \
                                             ' Supportive Text: ' + filtered_expanded_comfort_dataset['comfort_candidate'].astype(str)

# Moving the 'input' column to the first position
column_order = ['input'] + [col for col in filtered_expanded_comfort_dataset.columns if col != 'input']
filtered_expanded_comfort_dataset = filtered_expanded_comfort_dataset[column_order]

# Renaming the DataFrame to balanced_comfort_dataset
balanced_comfort_dataset = filtered_expanded_comfort_dataset

# Displaying the first few rows to verify the new DataFrame
balanced_comfort_dataset.head()

Unnamed: 0,input,struggle,comfort_candidate,comfort_annotation
171,Struggle: Preparing healthy foods comes at mor...,Preparing healthy foods comes at more of an ex...,It can happen to feel overwhelmed by the cost ...,Y
10917,Struggle: The Christmas holidays are just arou...,The Christmas holidays are just around the cor...,"It can happen to anyone, to find it challengin...",Y
17330,Struggle: When I have more chores is hard for ...,When I have more chores is hard for me to work...,It's understandable to feel tired and lazy whe...,Y
10156,Struggle: My family is very numerous from both...,My family is very numerous from both my dad's ...,It's understandable to want to enjoy the foods...,Y
5769,Struggle: Since I have little free time and en...,"Since I have little free time and energy, I us...",Sometimes it can happen that we don't have en...,Y


In [68]:
# Creating the 'output' column using .loc to avoid SettingWithCopyWarning
balanced_comfort_dataset.loc[:, 'output'] = balanced_comfort_dataset['comfort_annotation'].apply(
    lambda x: "This supportive text is safe" if x == 'Y' else "This supportive text is unsafe")

# Creating a list of all columns except 'input' and 'output'
columns_except_input_output = [col for col in balanced_comfort_dataset.columns if col not in ['input', 'output']]

# Defining the new column order with 'input' first, 'output' second, and then the rest
new_column_order = ['input', 'output'] + columns_except_input_output

# Reordering the DataFrame according to the new column order
balanced_comfort_dataset = balanced_comfort_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_comfort_dataset.head()

Unnamed: 0,input,output,struggle,comfort_candidate,comfort_annotation
171,Struggle: Preparing healthy foods comes at mor...,This supportive text is safe,Preparing healthy foods comes at more of an ex...,It can happen to feel overwhelmed by the cost ...,Y
10917,Struggle: The Christmas holidays are just arou...,This supportive text is safe,The Christmas holidays are just around the cor...,"It can happen to anyone, to find it challengin...",Y
17330,Struggle: When I have more chores is hard for ...,This supportive text is safe,When I have more chores is hard for me to work...,It's understandable to feel tired and lazy whe...,Y
10156,Struggle: My family is very numerous from both...,This supportive text is safe,My family is very numerous from both my dad's ...,It's understandable to want to enjoy the foods...,Y
5769,Struggle: Since I have little free time and en...,This supportive text is safe,"Since I have little free time and energy, I us...",Sometimes it can happen that we don't have en...,Y


In [69]:
# Copy the DataFrame
balanced_comfort_dataset = balanced_comfort_dataset.copy()

# Adding the 'instruction' column with the specified text for all rows
balanced_comfort_dataset['instruction'] = "Classify the Supportive Text as either safe or unsafe in relation to the Struggle."

# Defining the new column order, placing 'instruction' third
columns_before_instruction = ['input', 'output']
column_after_instruction = [col for col in balanced_comfort_dataset.columns if col not in columns_before_instruction + ['instruction']]
new_column_order = columns_before_instruction + ['instruction'] + column_after_instruction

# Reordering the DataFrame according to the new column order
balanced_comfort_dataset = balanced_comfort_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_comfort_dataset.head()

Unnamed: 0,input,output,instruction,struggle,comfort_candidate,comfort_annotation
171,Struggle: Preparing healthy foods comes at mor...,This supportive text is safe,Classify the Supportive Text as either safe or...,Preparing healthy foods comes at more of an ex...,It can happen to feel overwhelmed by the cost ...,Y
10917,Struggle: The Christmas holidays are just arou...,This supportive text is safe,Classify the Supportive Text as either safe or...,The Christmas holidays are just around the cor...,"It can happen to anyone, to find it challengin...",Y
17330,Struggle: When I have more chores is hard for ...,This supportive text is safe,Classify the Supportive Text as either safe or...,When I have more chores is hard for me to work...,It's understandable to feel tired and lazy whe...,Y
10156,Struggle: My family is very numerous from both...,This supportive text is safe,Classify the Supportive Text as either safe or...,My family is very numerous from both my dad's ...,It's understandable to want to enjoy the foods...,Y
5769,Struggle: Since I have little free time and en...,This supportive text is safe,Classify the Supportive Text as either safe or...,"Since I have little free time and energy, I us...",Sometimes it can happen that we don't have en...,Y


In [70]:
# Defining the full file path, including the directory and the file name
csv_file_path = r"C:\Users\Achor\Downloads\balanced_comfort_dataset.csv"

# Save the DataFrame to a CSV file at the specified path
balanced_comfort_dataset.to_csv(csv_file_path, index=False)

# Print a confirmation message
print(f"DataFrame has been saved to {csv_file_path}")

DataFrame has been saved to C:\Users\Achor\Downloads\balanced_comfort_dataset.csv


# ISOLATING DISSIMILAR SUGGESTION CANDIDATES

In [71]:
# Initializing lists to store the suggestion candidates and their labels
suggestion_statements = []
suggestion_labels = []

# Iterating through each row in the dataset
for index, row in dataset.iterrows():
    for statement, annotation in zip(row['suggestion_candidates'], row['suggestion_annotation']):
        suggestion_statements.append(statement)
        suggestion_labels.append(1 if annotation == 'Y' else 0)

In [72]:
# Separating suggestion statements by their annotations
safe_suggestion_statements = [stmt for stmt, label in zip(suggestion_statements, suggestion_labels) if label == 1]
unsafe_suggestion_statements = [stmt for stmt, label in zip(suggestion_statements, suggestion_labels) if label == 0]

In [73]:
# Counting the number of safe suggestion statements
num_safe_suggestion_statements = len(safe_suggestion_statements)

# Counting the number of unsafe suggestion statements
num_unsafe_suggestion_statements = len(unsafe_suggestion_statements)

# Printing the counts
print(f"Number of safe suggestion statements: {num_safe_suggestion_statements}")
print(f"Number of unsafe suggestion statements: {num_unsafe_suggestion_statements}")

Number of safe suggestion statements: 19946
Number of unsafe suggestion statements: 3274


In [74]:
# Generating embeddings for safe and unsafe suggestion statements
safe_suggestion_embeddings = model.encode(safe_suggestion_statements)
unsafe_suggestion_embeddings = model.encode(unsafe_suggestion_statements)

In [75]:
# Calculating cosine similarity between safe and unsafe suggestion embeddings
cosine_similarity_matrix = cosine_similarity(safe_suggestion_embeddings, unsafe_suggestion_embeddings)

In [76]:
# Creating a boolean mask where each value is True if the similarity is 0.9 or higher
high_similarity_mask = cosine_similarity_matrix >= 0.9

# Finding columns (unsafe statements) that have at least one True value (high similarity with any safe statement)
columns_with_high_similarity = np.any(high_similarity_mask, axis=0)

# Counting how many unique unsafe statements have high similarity
num_unsafe_with_high_similarity = np.sum(columns_with_high_similarity)

# Printing the count
print(f"There are {num_unsafe_with_high_similarity} unique unsafe suggestion statements that have a cosine similarity of 0.9 or higher with at least one safe statement.")

There are 1676 unique unsafe suggestion statements that have a cosine similarity of 0.9 or higher with at least one safe statement.


## APPLYING THE MINIMIZING THE MAXIMUM TECHNIQUE TO SELECT ISOLOLATE STATEMENTS WITH LOWEST SIMILARITY.

In [77]:
# Calculating the maximum similarity for each safe statement to any unsafe statement
max_similarity_safe_to_unsafe = np.max(cosine_similarity_matrix, axis=1)

# Calculating the maximum similarity for each unsafe statement to any safe statement
max_similarity_unsafe_to_safe = np.max(cosine_similarity_matrix, axis=0)

# Getting indices of the safe and unsafe statements with the lowest maximum similarities
sorted_indices_safe = np.argsort(max_similarity_safe_to_unsafe)
sorted_indices_unsafe = np.argsort(max_similarity_unsafe_to_safe)

# Selecting the top 2650 safe and unsafe statements with the lowest maximum similarities
selected_safe_indices = sorted_indices_safe[:2650]
selected_unsafe_indices = sorted_indices_unsafe[:2650]

# Retrieving the corresponding statements using these indices
filtered_safe_suggestion_statements = [safe_suggestion_statements[i] for i in selected_safe_indices]
filtered_unsafe_suggestion_statements = [unsafe_suggestion_statements[j] for j in selected_unsafe_indices]

# Outputting or using these selected statements
print(f"Selected {len(filtered_safe_suggestion_statements)} safe and {len(filtered_unsafe_suggestion_statements)} unsafe statements with the lowest similarity to the opposite class.")

# Printing one sample from each class
print("Sample Safe Statement:")
print(filtered_safe_suggestion_statements[0] if filtered_safe_suggestion_statements else "No safe statements selected.")

print("\nSample Unsafe Statement:")
print(filtered_unsafe_suggestion_statements[1] if filtered_unsafe_suggestion_statements else "No unsafe statements selected.")

Selected 2650 safe and 2650 unsafe statements with the lowest similarity to the opposite class.
Sample Safe Statement:
It could be helpful to schedule a visit with your dentist to discuss any concerns you have about cavities and to get a professional cleaning.

Sample Unsafe Statement:
Maybe you could try reading credible sources such as scientific journals or government health websites to get accurate information.


In [78]:
# Selecting specified columns to create a new DataFrame focused on suggestion candidates
selected_columns = ['struggle', 'suggestion_candidates', 'suggestion_annotation']
new_suggestion_dataset = dataset[selected_columns].copy()

# Displaying the first few rows of the new DataFrame to verify
new_suggestion_dataset.head()

Unnamed: 0,struggle,suggestion_candidates,suggestion_annotation
0,When dieting I often find it hard to track my ...,"[Starting from tomorrow, you could try trackin...","[N, Y, Y, N, Y, N, Y, Y, Y, Y]"
1,Saying no to alcohol in social settings. I usu...,[Starting from tomorrow you could try setting ...,"[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
2,Healthy food is expensive and earning a middle...,"[Starting from tomorrow, you could make a list...","[N, Y, Y, Y, N, Y, Y, Y, Y, Y]"
3,Working out is hard for me because I'm used to...,"[It could be helpful to set small, achievable ...","[Y, Y, Y, Y, Y, Y, Y, Y, Y, Y]"
4,When I see pizza I always want to buy and I en...,"[Starting from tomorrow, you could make a plan...","[N, Y, Y, N, Y, Y, Y, Y, Y, Y]"


In [79]:
# Initializing an empty list to hold the new row data
expanded_rows = []

# Iterating through each row in the new_suggestion_dataset DataFrame
for _, row in new_suggestion_dataset.iterrows():
    # Extract the row values
    struggle = row['struggle']
    candidates = row['suggestion_candidates']
    annotations = row['suggestion_annotation']
    
    # Iterating over the index and value of the candidates list
    for index, candidate in enumerate(candidates):
        # Extracting the corresponding annotation using the index
        annotation = annotations[index]
        
        # Appending the extracted data as a new row in expanded_rows list
        expanded_rows.append({'struggle': struggle, 'suggestion_candidate': candidate, 'suggestion_annotation': annotation})

# Converting the expanded_rows list into a new DataFrame
expanded_suggestion_dataset = pd.DataFrame(expanded_rows)

# Displaying the first few rows of the expanded suggestion DataFrame to verify
expanded_suggestion_dataset.head()

Unnamed: 0,struggle,suggestion_candidate,suggestion_annotation
0,When dieting I often find it hard to track my ...,"Starting from tomorrow, you could try tracking...",N
1,When dieting I often find it hard to track my ...,It could be helpful to invest in a food scale ...,Y
2,When dieting I often find it hard to track my ...,It could be helpful to focus on tracking the m...,Y
3,When dieting I often find it hard to track my ...,Maybe you could try planning your meals ahead ...,N
4,When dieting I often find it hard to track my ...,You could try searching for calorie informatio...,Y


In [80]:
print("Number of rows in expanded_suggestion_dataset:", expanded_suggestion_dataset.shape[0])

Number of rows in expanded_suggestion_dataset: 23220


In [81]:
# Stripping leading and trailing whitespaces from the suggestion_annotation column
expanded_suggestion_dataset['suggestion_annotation'] = expanded_suggestion_dataset['suggestion_annotation'].str.strip()

In [82]:
# Combining safe and unsafe suggestion statements into a set for quick lookup
selected_statements = set(filtered_safe_suggestion_statements + filtered_unsafe_suggestion_statements)

# Initial filter to get rows where suggestion_candidates match the selected statements
initial_filtered = expanded_suggestion_dataset[expanded_suggestion_dataset['suggestion_candidate'].isin(selected_statements)]

# Filter out up to 2650 rows for each 'Y' and 'N' annotation
filtered_Y = initial_filtered[initial_filtered['suggestion_annotation'] == 'Y'].sample(n=min(2650, len(initial_filtered[initial_filtered['suggestion_annotation'] == 'Y'])), random_state=1)
filtered_N = initial_filtered[initial_filtered['suggestion_annotation'] == 'N'].sample(n=min(2650, len(initial_filtered[initial_filtered['suggestion_annotation'] == 'N'])), random_state=1)

# Combining the filtered datasets back into a single DataFrame
filtered_expanded_suggestion_dataset = pd.concat([filtered_Y, filtered_N])

# Printing the number of rows in the final filtered dataset
print(f"Number of 'Y' rows: {filtered_Y.shape[0]}")
print(f"Number of 'N' rows: {filtered_N.shape[0]}")
print(f"Total rows in filtered dataset: {filtered_expanded_suggestion_dataset.shape[0]}")
filtered_expanded_suggestion_dataset.head()

Number of 'Y' rows: 2650
Number of 'N' rows: 2650
Total rows in filtered dataset: 5300


Unnamed: 0,struggle,suggestion_candidate,suggestion_annotation
216,It is hard to plan in advance all week meals a...,It could be helpful to find healthy and conven...,Y
10059,"When I'm ""pmsing"" I struggle to control the ur...",Starting from tomorrow you could try to set a...,Y
16843,I struggle with going out to get groceries. I ...,You could try enlisting the help of a friend o...,Y
9417,I struggle with exercise. I feel really sore t...,It could be helpful to consult a doctor or a p...,Y
5070,Doing the dishes. If I cook the kitchen is a m...,It could be helpful to remind yourself that it...,Y


In [83]:
# Creating the 'input' column with formatted strings for suggestion dataset
filtered_expanded_suggestion_dataset['input'] = 'Struggle: ' + filtered_expanded_suggestion_dataset['struggle'].astype(str) + \
                                                ' Supportive Text: ' + filtered_expanded_suggestion_dataset['suggestion_candidate'].astype(str)

# Moving the 'input' column to the first position
column_order = ['input'] + [col for col in filtered_expanded_suggestion_dataset.columns if col != 'input']
filtered_expanded_suggestion_dataset = filtered_expanded_suggestion_dataset[column_order]

# Renaming the DataFrame to balanced_suggestion_dataset
balanced_suggestion_dataset = filtered_expanded_suggestion_dataset

# Displaying the first few rows to verify the new DataFrame
balanced_suggestion_dataset.head()

Unnamed: 0,input,struggle,suggestion_candidate,suggestion_annotation
216,Struggle: It is hard to plan in advance all we...,It is hard to plan in advance all week meals a...,It could be helpful to find healthy and conven...,Y
10059,"Struggle: When I'm ""pmsing"" I struggle to cont...","When I'm ""pmsing"" I struggle to control the ur...",Starting from tomorrow you could try to set a...,Y
16843,Struggle: I struggle with going out to get gro...,I struggle with going out to get groceries. I ...,You could try enlisting the help of a friend o...,Y
9417,Struggle: I struggle with exercise. I feel rea...,I struggle with exercise. I feel really sore t...,It could be helpful to consult a doctor or a p...,Y
5070,Struggle: Doing the dishes. If I cook the kitc...,Doing the dishes. If I cook the kitchen is a m...,It could be helpful to remind yourself that it...,Y


## FORMATTING DATAFRAME FOR INSTRUCTION TUNING GEMMA

In [84]:
# Creating the 'output' column using .loc
balanced_suggestion_dataset.loc[:, 'output'] = balanced_suggestion_dataset['suggestion_annotation'].apply(
    lambda x: "This supportive text is safe" if x == 'Y' else "This supportive text is unsafe")

# Modifying the column order to ensure 'output' is right after 'input'
columns_except_input_output = [col for col in balanced_suggestion_dataset.columns if col not in ['input', 'output']]

#Defining the new column order with 'input' first, 'output' second, and then the rest
new_column_order = ['input', 'output'] + columns_except_input_output

# Reordering the DataFrame according to the new column order
balanced_suggestion_dataset = balanced_suggestion_dataset[new_column_order]

# Displaying the first few rows to verify the changes
balanced_suggestion_dataset.head()

Unnamed: 0,input,output,struggle,suggestion_candidate,suggestion_annotation
216,Struggle: It is hard to plan in advance all we...,This supportive text is safe,It is hard to plan in advance all week meals a...,It could be helpful to find healthy and conven...,Y
10059,"Struggle: When I'm ""pmsing"" I struggle to cont...",This supportive text is safe,"When I'm ""pmsing"" I struggle to control the ur...",Starting from tomorrow you could try to set a...,Y
16843,Struggle: I struggle with going out to get gro...,This supportive text is safe,I struggle with going out to get groceries. I ...,You could try enlisting the help of a friend o...,Y
9417,Struggle: I struggle with exercise. I feel rea...,This supportive text is safe,I struggle with exercise. I feel really sore t...,It could be helpful to consult a doctor or a p...,Y
5070,Struggle: Doing the dishes. If I cook the kitc...,This supportive text is safe,Doing the dishes. If I cook the kitchen is a m...,It could be helpful to remind yourself that it...,Y


In [85]:
# Copy the DataFrame
balanced_suggestion_dataset = balanced_suggestion_dataset.copy()

# Adding the 'instruction' column with specified text for all rows
balanced_suggestion_dataset['instruction'] = "Classify the Supportive Text as either safe or unsafe in relation to the Struggle."

# Defining the new column order, placing 'instruction' third
columns_before_instruction = ['input', 'output']
column_after_instruction = [col for col in balanced_suggestion_dataset.columns if col not in columns_before_instruction + ['instruction']]
new_column_order = columns_before_instruction + ['instruction'] + column_after_instruction

# Reordering the DataFrame according to the new column order
balanced_suggestion_dataset = balanced_suggestion_dataset.reindex(columns=new_column_order)

# Displaying the first few rows to verify the changes
balanced_suggestion_dataset.head()

Unnamed: 0,input,output,instruction,struggle,suggestion_candidate,suggestion_annotation
216,Struggle: It is hard to plan in advance all we...,This supportive text is safe,Classify the Supportive Text as either safe or...,It is hard to plan in advance all week meals a...,It could be helpful to find healthy and conven...,Y
10059,"Struggle: When I'm ""pmsing"" I struggle to cont...",This supportive text is safe,Classify the Supportive Text as either safe or...,"When I'm ""pmsing"" I struggle to control the ur...",Starting from tomorrow you could try to set a...,Y
16843,Struggle: I struggle with going out to get gro...,This supportive text is safe,Classify the Supportive Text as either safe or...,I struggle with going out to get groceries. I ...,You could try enlisting the help of a friend o...,Y
9417,Struggle: I struggle with exercise. I feel rea...,This supportive text is safe,Classify the Supportive Text as either safe or...,I struggle with exercise. I feel really sore t...,It could be helpful to consult a doctor or a p...,Y
5070,Struggle: Doing the dishes. If I cook the kitc...,This supportive text is safe,Classify the Supportive Text as either safe or...,Doing the dishes. If I cook the kitchen is a m...,It could be helpful to remind yourself that it...,Y


In [86]:
# Defining the full file path, including the directory and the file name
csv_file_path = r"C:\Users\Achor\Downloads\balanced_suggestion_dataset.csv"

# Saving the DataFrame to a CSV file at the specified path
balanced_suggestion_dataset.to_csv(csv_file_path, index=False)

# Printing a confirmation message
print(f"DataFrame has been saved to {csv_file_path}")

DataFrame has been saved to C:\Users\Achor\Downloads\balanced_suggestion_dataset.csv
