In [2]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os

import nest_asyncio
nest_asyncio.apply()

import asyncio
import importlib
import reddit_data_fetcher
importlib.reload(reddit_data_fetcher)

from reddit_data_fetcher import RedditFetcher
import spacy
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix

*Get Reddit client and user agent info*

In [4]:
load_dotenv()  # Loads from .env by default

client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")
user_agent = os.getenv("USER_AGENT")

print(client_id)

None


*Run the reddit scraper*

In [None]:
subreddits = ['domesticviolence', 'abusiverelationships']
search_terms = ['abuse', 'violence', 'help', 'domestic violence', 'survivor', 'kill', 'murder', 'signs', 'strangle']

# Create fetcher object with all required arguments
fetcher = RedditFetcher(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    subreddit_names=subreddits,
    search_terms=search_terms,
    posts_file_path='./data',
    limit_num=100,            # Number of posts per search term
    search_time_filter='all'  # Can be 'year', 'month', 'all', etc.
)

# ✅ Run the async fetch_search_data() directly
asyncio.run(fetcher.fetch_search_data())

# ✅ Combine subreddit files into one master file
fetcher.combine_subreddit_files(combined_file_name='reddit_combined_posts.csv')

🚀 Starting Reddit search fetch...

🔎 Searching in r/domesticviolence
🗂️ Loaded 408 existing posts from domesticviolence_posts_data.csv

🔍 Searching for: abuse
🔴 Rate limit reached! Pausing for 60.00 seconds...
✅ Found 0 new posts for search term 'abuse'.

🔍 Searching for: violence
🔴 Rate limit reached! Pausing for 60.00 seconds...
🔴 Rate limit reached! Pausing for 60.00 seconds...
✅ Found 0 new posts for search term 'violence'.

🔍 Searching for: help
🔴 Rate limit reached! Pausing for 60.00 seconds...
🔴 Rate limit reached! Pausing for 60.00 seconds...
✅ Found 0 new posts for search term 'help'.

🔍 Searching for: domestic violence
🔴 Rate limit reached! Pausing for 60.00 seconds...
✅ Found 0 new posts for search term 'domestic violence'.

🔍 Searching for: survivor
🔴 Rate limit reached! Pausing for 60.00 seconds...
🔴 Rate limit reached! Pausing for 60.00 seconds...
✅ Found 0 new posts for search term 'survivor'.

🔍 Searching for: killmurder
✅ Found 0 new posts for search term 'killmurder'.

*Check the downloaded data*

In [4]:
df = pd.read_csv('./data/reddit_combined_posts.csv')

print(df.columns)
print("\n ------------------------- ")
print(df.shape)
print("\n =================================== \n")
df.head(10)

Index(['id', 'title', 'author', 'score', 'num_comments', 'url', 'selftext',
       'created_at', 'scrape_time', 'search_term'],
      dtype='object')

 ------------------------- 
(1202, 10)




Unnamed: 0,id,title,author,score,num_comments,url,selftext,created_at,scrape_time,search_term
0,1dpd86x,My ex abuser was just sentenced to death,TattoedTigerTrainer,544,180,https://www.reddit.com/r/domesticviolence/comm...,My ex (Wade Wilson) was sentenced to death and...,2024-06-26 23:47:20,2025-03-17 03:01:01,abuse
1,1jajams,Abused wants to tell the abuser that she’s lea...,MyMonkeyCircus,19,84,https://www.reddit.com/r/domesticviolence/comm...,UPDATE. SHE LEFT and is in a hotel until she c...,2025-03-13 18:40:30,2025-03-17 03:01:03,abuse
2,1j7vvhu,Does your abuser scare you during sex?,ChildhoodFrequent208,25,61,https://www.reddit.com/r/domesticviolence/comm...,Sill working on plan on getting out.. But duri...,2025-03-10 10:51:39,2025-03-17 03:01:04,abuse
3,1idcwpj,Does your abuser sleep like a baby after?,Kastle69,112,47,https://www.reddit.com/r/domesticviolence/comm...,"I swear after every fight, he sleeps like a ba...",2025-01-30 03:16:20,2025-03-17 03:01:06,abuse
4,1fj5ixt,My abuser gave me an incurable STD. Life is over,nonya17,57,100,https://www.reddit.com/r/domesticviolence/comm...,My life is officially over. It wasn’t bad enou...,2024-09-17 17:38:01,2025-03-17 03:01:07,abuse
5,1j1rx03,Ex abuser has a new gf,Illustrious-Bus-3547,29,53,https://www.reddit.com/r/domesticviolence/comm...,My ex- partner is due to go to trial in April ...,2025-03-02 14:22:26,2025-03-17 03:01:09,abuse
6,1j6fl1n,I know my relationship is abusive & I can’t leave,Illustrious_Elk_7292,20,53,https://www.reddit.com/r/domesticviolence/comm...,"\n(Warning long post sorry)\n\nI am 22, my boy...",2025-03-08 11:53:25,2025-03-17 03:01:10,abuse
7,1iahvds,Did your abuser pay for dinner on the first date?,bengalbear24,18,65,https://www.reddit.com/r/domesticviolence/comm...,I’m trying to study all the red flags so I can...,2025-01-26 15:49:45,2025-03-17 03:01:11,abuse
8,1izrey2,Kids of survivors do you blame the abused pare...,Azalearose2,30,52,https://www.reddit.com/r/domesticviolence/comm...,I grew up in a domestic violence household and...,2025-02-27 21:45:51,2025-03-17 03:01:13,abuse
9,1dnuez7,I'm curious...is everyone's abuser here an alc...,daisy97xo,68,109,https://www.reddit.com/r/domesticviolence/comm...,I'm just wondering if anyones abuser is actual...,2024-06-25 01:43:45,2025-03-17 03:01:14,abuse


##### Create postgresql DB

*Test connection*

In [32]:
import psycopg2

DATABASE_URL = "postgresql://postgres:mypassword@postgres:5432/redditdb"

try:
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()

    cur.execute("SELECT NOW();")
    result = cur.fetchone()
    print("Connected! The time is:", result)

    cur.close()
    conn.close()

except Exception as e:
    print("Failed to connect:", e)

Connected! The time is: (datetime.datetime(2025, 3, 25, 17, 16, 51, 914101, tzinfo=datetime.timezone.utc),)


*Import reddit combined data df*

In [3]:
# Path to your CSV file inside the container
csv_path = 'data/reddit_combined_posts.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(csv_path)

# Display column names
print(df.columns.tolist())

['id', 'title', 'author', 'score', 'num_comments', 'url', 'selftext', 'created_at', 'scrape_time', 'search_term']


*Create SQL table*

In [4]:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()

cur.execute("""
CREATE TABLE IF NOT EXISTS reddit_posts (
    id TEXT PRIMARY KEY,
    title TEXT,
    author TEXT,
    score INTEGER,
    num_comments INTEGER,
    url TEXT,
    selftext TEXT,
    created_at TIMESTAMP,
    scrape_time TIMESTAMP,
    search_term TEXT
)
""")

conn.commit()
cur.close()
conn.close()

print("Table created successfully!")


Table created successfully!


*Insert reddit data into SQL table*

In [5]:
from sqlalchemy import create_engine

# Load your CSV
csv_path = 'data/reddit_combined_posts.csv'
df = pd.read_csv(csv_path)

# Connect to your Postgres DB
engine = create_engine(DATABASE_URL)

# Insert data into reddit_posts table
df.to_sql('reddit_posts', engine, if_exists='append', index=False)

print("CSV imported successfully!")

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "reddit_posts_pkey"
DETAIL:  Key (id)=(1dpd86x) already exists.

[SQL: INSERT INTO reddit_posts (id, title, author, score, num_comments, url, selftext, created_at, scrape_time, search_term) VALUES (%(id__0)s, %(title__0)s, %(author__0)s, %(score__0)s, %(num_comments__0)s, %(url__0)s, %(selftext__0)s, %(created_at__0)s,  ... 183674 characters truncated ... s, %(url__999)s, %(selftext__999)s, %(created_at__999)s, %(scrape_time__999)s, %(search_term__999)s)]
[parameters: {'scrape_time__0': '2025-03-17 03:01:01', 'author__0': 'TattoedTigerTrainer', 'selftext__0': 'My ex (Wade Wilson) was sentenced to death and will be formally sentenced next month. This man tried to kill me and the kidnapped and SAd me and the  ... (172 characters truncated) ... e. Reach out. I will always talk to you. You can do it. You can be strong. It’s hard. It was hard for me and I didn’t leave until I was nearly dead. ', 'search_term__0': 'abuse', 'score__0': 544, 'url__0': 'https://www.reddit.com/r/domesticviolence/comments/1dpd86x/my_ex_abuser_was_just_sentenced_to_death/', 'created_at__0': '2024-06-26 23:47:20', 'title__0': 'My ex abuser was just sentenced to death', 'num_comments__0': 180, 'id__0': '1dpd86x', 'scrape_time__1': '2025-03-17 03:01:03', 'author__1': 'MyMonkeyCircus', 'selftext__1': 'UPDATE. SHE LEFT and is in a hotel until she can connect with local resources. She is a mess, but we’ll take it one step at a time.\r\n\r\nShe left a ... (1943 characters truncated) ... ely no way she can fight back if he attacks her again. I believe she is not dead only because he let her live.\r\n\r\nI need some tips/advice please.', 'search_term__1': 'abuse', 'score__1': 19, 'url__1': 'https://www.reddit.com/r/domesticviolence/comments/1jajams/abused_wants_to_tell_the_abuser_that_shes_leaving/', 'created_at__1': '2025-03-13 18:40:30', 'title__1': 'Abused wants to tell the abuser that she’s leaving', 'num_comments__1': 84, 'id__1': '1jajams', 'scrape_time__2': '2025-03-17 03:01:04', 'author__2': 'ChildhoodFrequent208', 'selftext__2': "Sill working on plan on getting out.. But during sex, He is scary. Its like he gets off on making me scared? Always hands back of neck or front, of pulling hair.. Is this normal?  (only partner,) Maybe I'm just noticing due to education my self on abuse.. But IDK.", 'search_term__2': 'abuse', 'score__2': 25, 'url__2': 'https://www.reddit.com/r/domesticviolence/comments/1j7vvhu/does_your_abuser_scare_you_during_sex/', 'created_at__2': '2025-03-10 10:51:39', 'title__2': 'Does your abuser scare you during sex?', 'num_comments__2': 61, 'id__2': '1j7vvhu', 'scrape_time__3': '2025-03-17 03:01:06', 'author__3': 'Kastle69', 'selftext__3': 'I swear after every fight, he sleeps like a baby. After verbally tormenting me for hours, screaming and throwing things and mocking me and calling me ... (350 characters truncated) ... s so he can’t continue to treat me like crap. But like. Damn. He’s so peaceful after tearing me apart? He really don’t love me at all, huh? 🥴😵\u200d💫', 'search_term__3': 'abuse', 'score__3': 112, 'url__3': 'https://www.reddit.com/r/domesticviolence/comments/1idcwpj/does_your_abuser_sleep_like_a_baby_after/', 'created_at__3': '2025-01-30 03:16:20', 'title__3': 'Does your abuser sleep like a baby after?', 'num_comments__3': 47, 'id__3': '1idcwpj', 'scrape_time__4': '2025-03-17 03:01:07', 'author__4': 'nonya17', 'selftext__4': 'My life is officially over. It wasn’t bad enough to be sexually, physically and mentally abused by my ex. He had to give me an incurable STD as well. ... (144 characters truncated) ... ationship. Now I know that’s the truth. No one will ever want me like this. My life is over. I don’t know how to keep going. I just want to give up. ', 'search_term__4': 'abuse', 'score__4': 57, 'url__4': 'https://www.reddit.com/r/domesticviolence/comments/1fj5ixt/my_abuser_gave_me_an_incurable_std_life_is_over/', 'created_at__4': '2024-09-17 17:38:01', 'title__4': 'My abuser gave me an incurable STD. Life is over ', 'num_comments__4': 100, 'id__4': '1fj5ixt' ... 9900 parameters truncated ... 'scrape_time__995': '2025-03-17 03:38:05', 'author__995': 'joblessinperth', 'selftext__995': 'I’m 26 and my parents seperated when I was a week old. \r\n\r\nIn short, my father is mentally ill and was extremely verbally and emotionally abusive ... (2096 characters truncated) ... undary setting etc in therapy but I’d appreciate any insight from this sub on navigating this and processing what happened. \r\n\r\nLove to you all 💖', 'search_term__995': 'survivor', 'score__995': 1, 'url__995': 'https://www.reddit.com/r/abusiverelationships/comments/tree5s/my_mom_is_an_abuse_survivor_i_never_witnessed_the/', 'created_at__995': '2022-03-29 18:47:40', 'title__995': 'My mom is an abuse survivor - I never witnessed the abuse and am coming to terms with how it’s impacted my own life.', 'num_comments__995': 2, 'id__995': 'tree5s', 'scrape_time__996': '2025-03-17 03:38:06', 'author__996': 'laurabt1', 'selftext__996': 'It\'s an inner part of it, as I\'m realizing.\r\n\r\nI was never the kind of person that easily "hates" or "hold grudges."\r\n\r\nBecause I don\'t li ... (1045 characters truncated) ...  hates me now for stopping her blood-sucking.  It feeds her ego to believe how everyone else deserves her hate while she\'s so perfect and victmized.', 'search_term__996': 'survivor', 'score__996': 7, 'url__996': 'https://www.reddit.com/r/abusiverelationships/comments/paspz3/about_how_us_survivors_believe_in_karma_so_much/', 'created_at__996': '2021-08-24 18:06:04', 'title__996': "About how us survivors believe in karma so much, and whether it is or it's not a good thing", 'num_comments__996': 6, 'id__996': 'paspz3', 'scrape_time__997': '2025-03-17 03:38:08', 'author__997': 'MammothButton', 'selftext__997': 'Please delete if not allowed. I just worry about my bud.', 'search_term__997': 'survivor', 'score__997': 28, 'url__997': 'https://www.reddit.com/r/abusiverelationships/comments/b0ve4k/survivors_what_do_you_wish_your_friends_had_done/', 'created_at__997': '2019-03-14 03:23:50', 'title__997': 'Survivors-- what do you wish your friends had done for you, if anything? My severely depressed friend is in an abusive marriage and I want to know how to be the best friend I can be to him.', 'num_comments__997': 17, 'id__997': 'b0ve4k', 'scrape_time__998': '2025-03-17 03:38:09', 'author__998': 'KillMariner', 'selftext__998': "Hi all! I hope it is ok to post here. This is my first ever post on reddit, but I just don't know where else to go. My gf left an abusive marriage a  ... (1646 characters truncated) ...  a different forum I could post this in, because I really don't want to take up space here if it is meant for people more directly impacted by abuse?", 'search_term__998': 'survivor', 'score__998': 2, 'url__998': 'https://www.reddit.com/r/abusiverelationships/comments/taims0/advice_for_partners_of_abuse_survivors/', 'created_at__998': '2022-03-09 21:53:37', 'title__998': 'Advice for partners of abuse survivors?', 'num_comments__998': 2, 'id__998': 'taims0', 'scrape_time__999': '2025-03-17 03:38:10', 'author__999': 'Lucky_Habit8335', 'selftext__999': 'Hard getting over him, and start to think again they were good? I hate this process.', 'search_term__999': 'survivor', 'score__999': 2, 'url__999': 'https://www.reddit.com/r/abusiverelationships/comments/u862f3/survivors_of_stockholm_syndrome_whats_your_story/', 'created_at__999': '2022-04-20 20:45:20', 'title__999': "Survivors of Stockholm Syndrome, what's your story?", 'num_comments__999': 0, 'id__999': 'u862f3'}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

*Explore data*

In [7]:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()

# Count posts
cur.execute("SELECT COUNT(*) FROM reddit_posts;")
print("Total posts:", cur.fetchone())

# Sample some data
cur.execute("SELECT id, title, selftext, url FROM reddit_posts LIMIT 5;")
rows = cur.fetchall()
for row in rows:
    print(row)

cur.close()
conn.close()

Total posts: (1202,)
('1dpd86x', 'My ex abuser was just sentenced to death', 'My ex (Wade Wilson) was sentenced to death and will be formally sentenced next month. This man tried to kill me and the kidnapped and SAd me and the police did nothing. He then went to murder two women on the west coast of Florida. \r\n\r\nPlease. Leave. If you’re afraid, it’s ok. If you need someone to talk to, I’m here. Reach out. I will always talk to you. You can do it. You can be strong. It’s hard. It was hard for me and I didn’t leave until I was nearly dead. ', 'https://www.reddit.com/r/domesticviolence/comments/1dpd86x/my_ex_abuser_was_just_sentenced_to_death/')
('1jajams', 'Abused wants to tell the abuser that she’s leaving', 'UPDATE. SHE LEFT and is in a hotel until she can connect with local resources. She is a mess, but we’ll take it one step at a time.\r\n\r\nShe left a day earlier because his weekend shift got cancelled and she realized he will likely “celebrate” that extra free day with a drink

*Select titles and text*

In [33]:
from sqlalchemy import create_engine

engine = create_engine(DATABASE_URL)

query = """
SELECT title, selftext
FROM reddit_posts
WHERE selftext IS NOT NULL AND title IS NOT NULL
"""

title_text_df = pd.read_sql_query(query, con=engine)

# Combine title and selftext
title_text_df['text'] = title_text_df['title'].fillna('') + ' ' + title_text_df['selftext'].fillna('')
title_text_df = title_text_df.drop_duplicates().reset_index(drop=True)

print(title_text_df.shape)

(1093, 3)


*Process posts*

In [143]:
import importlib
import reddit_analysis_functions as raf

importlib.reload(raf)

<module 'reddit_analysis_functions' from '/workspace/reddit_analysis_functions.py'>

*Test finding similar phrases*

In [144]:
model = raf.ViolenceModel()
matches = model.find_similar_phrases("He hit and strangled her.", threshold=0.7)
print(matches)

print("\n Sample post =================================== \n")
sample_post = title_text_df['text'][0]
print(sample_post)

print("\n =================================== \n")
matches = model.find_similar_phrases(sample_post, threshold=0.7)
print(matches)


[('hit and strangled', 1.000000238418579), ('He hit and strangled', 0.8690845966339111), ('hit and strangled her', 0.8512629270553589)]


My ex abuser was just sentenced to death My ex (Wade Wilson) was sentenced to death and will be formally sentenced next month. This man tried to kill me and the kidnapped and SAd me and the police did nothing. He then went to murder two women on the west coast of Florida. 

Please. Leave. If you’re afraid, it’s ok. If you need someone to talk to, I’m here. Reach out. I will always talk to you. You can do it. You can be strong. It’s hard. It was hard for me and I didn’t leave until I was nearly dead. 


[('tried to kill', 0.8930404782295227), ('tried to kill me', 1.000000238418579), ('went to murder', 0.7322460412979126)]


*Assign label to each post - whether it contains a description of a violent episode or not*

In [145]:
from tqdm.notebook import tqdm
tqdm.pandas()

threshold = 0.73  # 👈 Set threshold here

# Apply and return both label and matches
results = title_text_df['text'].progress_apply(lambda x: raf.label_post_as_violent(x, model, threshold=threshold))

# Split results into two new columns
title_text_df['violent_label'], title_text_df['matched_phrases'] = zip(*results)

  0%|          | 0/1093 [00:00<?, ?it/s]

In [146]:
# print the counts
label_counts = title_text_df['violent_label'].value_counts()

print("Label counts:")
print(label_counts)

Label counts:
violent_label
0    665
1    428
Name: count, dtype: int64


In [147]:
ind = 12
curr_label = 0

# Print the 'text' of the violent post at position `ind`
print(title_text_df[title_text_df['violent_label'] == curr_label].iloc[ind]['matched_phrases'])
print("\n =================================== \n")
print(title_text_df[title_text_df['violent_label'] == curr_label].iloc[ind]['text'])

[]


What counts as abuse? Does one episode where a partner grabbed another partners shirt and violently shook them count as abuse? They immediately apologized, will be starting solo therapy soon, already in couples counseling, signed up for a group therapy online. Is this something I should leave him over? Is this domestic violence? 


*Create labelled data for SetFit*

In [148]:
# Separate violent and non-violent posts
group_samples = 30
violent_posts = title_text_df[title_text_df['violent_label'] == 1].sample(group_samples, random_state=42)
non_violent_posts = title_text_df[title_text_df['violent_label'] == 0].sample(group_samples, random_state=42)

# Combine them into one sample DataFrame and reset the index
sample_df = pd.concat([violent_posts, non_violent_posts]).reset_index(drop=True)

# Add an empty column for your manual labels
sample_df['manual_label'] = None

In [149]:
import sys
from IPython.display import clear_output

for i, row in sample_df.iterrows():
    clear_output(wait=True)  # Clears the previous cell output

    # Print separator and post content
    print("=" * 50)
    print(f"Post #{i}:\n")
    print(row['violent_label'])
    print(row['matched_phrases'])
    print(row['text'])
    print("\n" + "=" * 50)

    sys.stdout.flush()  # Make sure the print appears immediately

    # Label input
    label = input(f"\nLabel Post #{i} as violent (1) or non-violent (0): ")

    # Save the label in the DataFrame
    sample_df.at[i, 'manual_label'] = int(label)

Post #59:

0
[]
Domestic violence, depression Has anyone been in the middle of this and can’t get out of bed? I’m stuck in his house because the market sucks. I want to leave. He is gone i made the steps to call the police but now I wait? 



In [150]:
# Save as csv
sample_df.to_csv('sample_df_labelled.csv', index=False)

*Check manual label counts*
*Load sample df labelled*

In [3]:
sample_df = pd.read_csv('sample_df_labelled.csv')
sample_df.head()

Unnamed: 0,title,selftext,text,violent_label,matched_phrases,manual_label
0,"Abused by husband, his friends, and his family...","I've been drugged with fentanyl, strangled 5 t...","Abused by husband, his friends, and his family...",1,"[('strangled 5 times my', 0.7533119320869446)]",1
1,It has been 13 years since he last put his han...,"As if the constant nightmares, flashbacks, and...",It has been 13 years since he last put his han...,1,"[('the physical violence', 0.8168271780014038)...",1
2,PTSD after being strangled,It’s been a year since my ex attempt to strang...,PTSD after being strangled It’s been a year si...,1,"[('attempt to strangle me', 0.9232717752456665...",1
3,Does this count as domestic violence or abuse?...,If I had the person I’ve been in a relationshi...,Does this count as domestic violence or abuse?...,1,"[('was being choked', 0.8508139848709106), ('I...",1
4,Having trouble going to the police about my (E...,My (ex) boyfriend strangled me last Thursday. ...,Having trouble going to the police about my (E...,1,"[('strangled me last Thursday', 0.762654960155...",1


In [4]:
# print the counts
label_counts = sample_df['manual_label'].value_counts()

print("Label counts:")
print(label_counts)

Label counts:
manual_label
0    34
1    26
Name: count, dtype: int64


*Train SetFit model*

In [5]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Your labeled data (30 examples)
texts = sample_df['text'].tolist()
labels = sample_df['manual_label'].tolist()

# Split into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels,
    test_size=0.2,    # 6 test examples
    stratify=labels,  # Keep label proportions
    random_state=42
)

In [6]:
print("Train size:", len(X_train))
print("Test size:", len(X_test))
print(y_test)

Train size: 48
Test size: 12
[0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0]


In [7]:
import sys
sys.path.insert(0, "/home/vscode/.local/lib/python3.11/site-packages")

# Verify immediately:
import accelerate
print(accelerate.__version__)

1.5.2


In [8]:
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset

train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

# explicitly define labels if desired (optional)
model = SetFitModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",
    labels=list(set(y_train))
)

args = TrainingArguments(
    batch_size=8,
    num_epochs=1,
    num_iterations=25,  # SetFit-specific parameter goes here now
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    metric="accuracy",
    column_mapping={"text": "text", "label": "label"}
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 2400
  Batch size = 8
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.0022,0.343614


***** Running evaluation *****


{'accuracy': 0.5833333333333334}


##### Sentence level SetFit classifier

*Review individual posts to select positive and negative sentence examples*

In [177]:
ind = 147
# text = sample_df['text'][ind]
text = title_text_df['text'][ind]

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

sentences = [sent.text.strip() for sent in doc.sents]

for sentence in sentences:
    print(sentence)

DV survivor going through it again I (27) grew up with domestic violence, the abuser being my father.
I have undiagnosed anxiety and most likely PTSD.
( yes I’ve attempted to get help but let’s say that experience was less than ideal , I’ll try again soon I’m sure just traumatized at the moment )
but my father has relapsed and I don’t know what to do.
Any advice?
Here’s the back story:
My dad has been an alcoholic since I could remember.
He was very abusive to my mom and I could never understand why.
He would always go on these drunk rampages about stupid things like not locking the door or us dropping something on accident.
I’ve learned to not slam things or stomp my feet purely to make sure I couldn’t make him mad.
I’ve now adopted this into my normal life that I get a mini panic attack if someone starts yelling or if I heard someone slam something.
Two years ago I finally had enough and couldn’t take it anymore.
I lived at home with my sick mom and my dad.
He came home one night so 

*Read sample sentences csv*

In [10]:
sentence_df = pd.read_csv("sample_sentences.csv")

print(sentence_df.head())

print("\n =================================== \n")
print(sentence_df['label'].isna().sum())
sentence_df[sentence_df['label'].isna()]

                                                text  label
0                           He tried to strangle me.      1
1                                It was sunny today.      0
2  I've been drugged with fentanyl, strangled 5 t...      1
3  My family won't help, police won't help, nobod...      0
4  As if the constant nightmares, flashbacks, and...      0


0


Unnamed: 0,text,label


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    sentence_df['text'].tolist(),
    sentence_df['label'].tolist(),
    test_size=0.25,
    stratify=sentence_df['label'],
    random_state=42
)

train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

print("Train label distribution:", Counter(y_train))
print("Test label distribution:", Counter(y_test))


Train label distribution: Counter({0: 48, 1: 30})
Test label distribution: Counter({0: 16, 1: 10})


In [12]:
model = SetFitModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",
    labels=list(set(y_train))
)

args = TrainingArguments(
    batch_size=8,
    num_epochs=1,
    num_iterations=25,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    metric="accuracy",
    column_mapping={"text": "text", "label": "label"}
)

# ✅ Train & evaluate
trainer.train()
metrics = trainer.evaluate()
print(metrics)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/78 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 3900
  Batch size = 8
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.0021,0.1007


***** Running evaluation *****


{'accuracy': 0.9230769230769231}


In [None]:
y_pred = model.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred, digits=3))

print("🧾 Confusion Matrix [TN, FP, FN, TP]:")
print(confusion_matrix(y_test, y_pred))

🔍 Classification Report:
              precision    recall  f1-score   support

           0      0.938     0.938     0.938        16
           1      0.900     0.900     0.900        10

    accuracy                          0.923        26
   macro avg      0.919     0.919     0.919        26
weighted avg      0.923     0.923     0.923        26

🧾 Confusion Matrix [TN, FP, FN, TP]:
[[15  1]
 [ 1  9]]


*Save sentence level SetFit model*

In [14]:
model.save_pretrained("saved_model_directory")

*Apply SetFit model to sentences in each post*