In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np
import pandas as pd
from helper_functions import load_huggingface_dataset, preprocess_dataframe_column
from sklearn.preprocessing import LabelEncoder

In [3]:
# Pull the Reddit mental-health posts through the project helper, asking for an
# 80% train split and verbose progress output.
train_df, test_df = load_huggingface_dataset(
    "solomonk/reddit_mental_health_posts",
    train_split=0.8,
    verbose=True,
)

Repo card metadata block was not found. Setting CardData to empty.


Dataset loaded: solomonk/reddit_mental_health_posts
Train size: 70909
Test size: 17778


In [4]:
# Work on a small subset of each split: the first 800 train rows and the first
# 200 test rows.
# .copy() makes each subset an independent frame, so the column assignments in
# later cells write to it directly rather than to a slice of the full frame
# (which pandas flags with SettingWithCopyWarning).
train_df = train_df.iloc[:800].copy()
test_df = test_df.iloc[:200].copy()

In [5]:
# Preview the first five rows of the raw training subset.
train_df.head(n=5)

Unnamed: 0,author,body,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url
0,Breadfan-,But in the end people only see the outside of ...,2021-12-22T02:41:23.000Z,rluy20,4,18,aspergers,It’s awful that you have to have so much stren...,0.96,https://www.reddit.com/r/aspergers/comments/rl...
1,gogo--yubari,I finally recently cut off these friendships b...,2021-12-07T23:36:59.000Z,rbcrva,4,13,ADHD,I have adhd…. But I do not like spending time ...,1.0,https://www.reddit.com/r/ADHD/comments/rbcrva/...
2,pathetic_gay_mess,I was testing this for a few days and my psych...,2021-09-28T12:54:49.000Z,px5rug,10,18,OCD,Technique aproved by my psychiatrist,0.88,https://www.reddit.com/r/OCD/comments/px5rug/t...
3,LilyWolf32,Does anyone else have trust issues regarding m...,2021-11-27T19:38:39.000Z,r3l3q4,24,88,ptsd,Dating and trauma,0.99,https://www.reddit.com/r/ptsd/comments/r3l3q4/...
4,Megan56789000,Waiting for Partial hospitalization treatment ...,2021-05-02T04:57:52.000Z,n2zgau,2,17,OCD,I hate OCD. I wish help was more quick to find...,0.96,https://www.reddit.com/r/OCD/comments/n2zgau/i...


In [6]:
# Make sure every post body is a plain string before text preprocessing.
# Missing bodies are blanked first: astype(str) on its own would turn NaN into
# the literal text "nan", which would then be tokenized as if it were a word.
train_df['body'] = train_df['body'].fillna('').astype(str)
test_df['body'] = test_df['body'].fillna('').astype(str)

In [7]:
# Discard test rows that still have a missing value in any column.
# NOTE(review): 'body' was already cast to str in the previous cell, so this
# only catches gaps in the other columns; the visible cells apply no matching
# step to train_df - confirm that is intended.
test_df = test_df.dropna(axis="index", how="any")

In [8]:
# Run the project's text-cleaning helper over the 'body' column of both splits
# (train first, then test).
train_df, test_df = (
    preprocess_dataframe_column(frame, 'body') for frame in (train_df, test_df)
)

Preprocessing text in the body column...
Preprocessing text in the body column...


Label Encoding

In [9]:
# Learn the subreddit -> integer mapping from the training split only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(train_df['subreddit'])

train_df['label'] = le.transform(train_df['subreddit'])
# transform() raises if the test split contains a subreddit the encoder never saw.
test_df['label'] = le.transform(test_df['subreddit'])

Model and Hyperparameter Setup

In [10]:
from model import TextClassificationDataset, BERTClassifier, train, evaluate, predict_subreddit

In [11]:
# parameters

bert_model_name = 'bert-base-uncased'   # checkpoint name handed to the tokenizer
num_classes = len(le.classes_)          # one class per subreddit the encoder was fitted on
max_length = 128                        # passed to TextClassificationDataset
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
seed = 42

# Fix the RNGs before the dataloaders and the model are built, so that batch
# shuffling and any randomly initialised weights are repeatable across
# Restart & Run All.
np.random.seed(seed)
torch.manual_seed(seed)

Set up the tokenizer, datasets and dataloaders

In [12]:
# Inspect the training frame after preprocessing and label encoding; it now
# carries the integer 'label' column next to 'subreddit'.
train_df

Unnamed: 0,author,body,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url,label
0,Breadfan-,end people see outside come weak nobody see wa...,2021-12-22T02:41:23.000Z,rluy20,4,18,aspergers,It’s awful that you have to have so much stren...,0.96,https://www.reddit.com/r/aspergers/comments/rl...,2
1,gogo--yubari,finally recently cut friendship kind shallow m...,2021-12-07T23:36:59.000Z,rbcrva,4,13,ADHD,I have adhd…. But I do not like spending time ...,1.00,https://www.reddit.com/r/ADHD/comments/rbcrva/...,0
2,pathetic_gay_mess,testing day psychiatrist aproved worked like t...,2021-09-28T12:54:49.000Z,px5rug,10,18,OCD,Technique aproved by my psychiatrist,0.88,https://www.reddit.com/r/OCD/comments/px5rug/t...,1
3,LilyWolf32,anyone else trust issue regarding men dating g...,2021-11-27T19:38:39.000Z,r3l3q4,24,88,ptsd,Dating and trauma,0.99,https://www.reddit.com/r/ptsd/comments/r3l3q4/...,4
4,Megan56789000,waiting partial hospitalization treatment wait...,2021-05-02T04:57:52.000Z,n2zgau,2,17,OCD,I hate OCD. I wish help was more quick to find...,0.96,https://www.reddit.com/r/OCD/comments/n2zgau/i...,1
...,...,...,...,...,...,...,...,...,...,...,...
795,ejrosby,ive seen many psychiatrist therapist always an...,2021-11-13T19:59:37.000Z,qt8usl,6,4,depression,What happens when you tell your therapist you’...,0.76,https://www.reddit.com/r/depression/comments/q...,3
796,[deleted],im dating app girl decided text put shes page ...,2021-03-10T05:12:29.000Z,m1q2ch,3,4,OCD,Triggered by dating app,1.00,https://www.reddit.com/r/OCD/comments/m1q2ch/t...,1
797,_fallen_star_,basically title ocd theme horrible know realev...,2021-11-29T12:42:53.000Z,r4v13f,6,19,OCD,How can real-event OCD be an obsession if it i...,1.00,https://www.reddit.com/r/OCD/comments/r4v13f/h...,1
798,VexxySmexxy,im making post minute ago flash back night bro...,2021-02-18T03:09:23.000Z,lmc8iy,5,2,ptsd,Dealing with post tramautic relationship sydrome,1.00,https://www.reddit.com/r/ptsd/comments/lmc8iy/...,4


In [13]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap each split's cleaned text and integer labels in the project's dataset class.
train_dataset = TextClassificationDataset(
    train_df['body'], train_df['label'], tokenizer, max_length
)
val_dataset = TextClassificationDataset(
    test_df['body'], test_df['label'], tokenizer, max_length
)

# Only the training batches are shuffled; the validation loader keeps row order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

Device and model setup

In [14]:
# Use a CUDA GPU when one is available and fall back to CPU otherwise, instead
# of pinning the run to CPU.
# NOTE(review): assumes train()/evaluate() move each batch to the `device` they
# are given - confirm in model.py.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(num_classes).to(device)

In [15]:
# Echo the device the model was placed on.
device

device(type='cpu')

Set up the optimizer and learning rate scheduler

In [16]:
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated upstream in favour of torch.optim.AdamW. eps and weight_decay are
# pinned to the defaults of the transformers implementation so the
# hyperparameters stay the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)
# Number of optimizer steps over the whole run (batches per epoch x epochs).
# NOTE(review): no scheduler is built from this value in the visible cells.
total_steps = len(train_dataloader) * num_epochs

Model Training

In [18]:
# A single loss module is shared by every train/evaluate call.
criterion = nn.CrossEntropyLoss()

# One pass over the training loader followed by one validation pass per epoch.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, device, criterion=criterion)
    accuracy, report = evaluate(model, val_dataloader, device, criterion=criterion)
    print(f"Validation Accuracy: {accuracy:.4f}")
    # NOTE(review): the saved output shows a single float here, so `report`
    # looks like a scalar rather than a text report - confirm against evaluate().
    print(report)

Epoch 1/4


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Validation Accuracy: 0.5950
0.9973156543878409
Epoch 2/4


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Validation Accuracy: 0.7450
0.7664683827987084
Epoch 3/4


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Validation Accuracy: 0.7250
0.7120589453440446
Epoch 4/4


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Validation Accuracy: 0.7650
0.6903020854179676


In [46]:

# Re-display the training frame.
# NOTE(review): this cell's execution count (46) is out of sequence, and its
# saved output shows index labels far beyond the 800-row subset taken earlier -
# presumably left over from a different run; re-run top to bottom to confirm.
train_df

Unnamed: 0,author,body,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url,label
0,Breadfan-,end people see outside come weak nobody see wa...,2021-12-22T02:41:23.000Z,rluy20,4,18,aspergers,It’s awful that you have to have so much stren...,0.96,https://www.reddit.com/r/aspergers/comments/rl...,2
2,gogo--yubari,finally recently cut friendship kind shallow m...,2021-12-07T23:36:59.000Z,rbcrva,4,13,ADHD,I have adhd…. But I do not like spending time ...,1.00,https://www.reddit.com/r/ADHD/comments/rbcrva/...,0
4,pathetic_gay_mess,testing day psychiatrist aproved worked like t...,2021-09-28T12:54:49.000Z,px5rug,10,18,OCD,Technique aproved by my psychiatrist,0.88,https://www.reddit.com/r/OCD/comments/px5rug/t...,1
5,LilyWolf32,anyone else trust issue regarding men dating g...,2021-11-27T19:38:39.000Z,r3l3q4,24,88,ptsd,Dating and trauma,0.99,https://www.reddit.com/r/ptsd/comments/r3l3q4/...,4
6,Megan56789000,waiting partial hospitalization treatment wait...,2021-05-02T04:57:52.000Z,n2zgau,2,17,OCD,I hate OCD. I wish help was more quick to find...,0.96,https://www.reddit.com/r/OCD/comments/n2zgau/i...,1
...,...,...,...,...,...,...,...,...,...,...,...
121021,Vessecora,tw incestuous child sexual abuse update since ...,2021-05-20T10:45:11.000Z,ngww23,3,3,ptsd,Emotional flashback and procrastinating law sc...,0.81,https://www.reddit.com/r/ptsd/comments/ngww23/...,4
121025,imuseless99,im invisible done hope good life,2021-12-03T04:08:48.000Z,r7q1ec,132,231,depression,Last post,0.96,https://www.reddit.com/r/depression/comments/r...,3
121026,redneck_lilith,originally diagnosed ptsd psychotic feature ag...,2021-02-21T17:32:17.000Z,lp20mk,0,1,ptsd,missing my psychosis,0.67,https://www.reddit.com/r/ptsd/comments/lp20mk/...,4
121028,Icy-Study-3679,realized ptsd ever time thought must asleep we...,2021-10-02T17:22:16.000Z,pzzebq,1,3,ptsd,Flashback/intrusive images or nightmare,1.00,https://www.reddit.com/r/ptsd/comments/pzzebq/...,4
