In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch
import pandas as pd

  from pandas.core import (


In [2]:
df = pd.read_csv("Mental-Health-Twitter.csv")

In [3]:
import re

def cleaned_tweets(text):
    text = str(text)
    text = text.replace('\n', ' ')
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)     # remove @mentions
    text = re.sub(r'#\s+', '', text)     # remove space after #
    text = re.sub(r'\s+', ' ', text).strip()  # collapse multiple spaces
    return text.lower()

df['cleaned_text'] = df['post_text'].apply(cleaned_tweets)
df = df[['cleaned_text', 'label']]

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    return tokenizer(batch["cleaned_text"], truncation = True)

dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize, batched= True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

args = TrainingArguments(output_dir= "bert_results",
                         eval_strategy = "epoch",
                         save_strategy= "epoch",
                         per_device_train_batch_size= 16,
                         per_device_eval_batch_size= 16, 
                         num_train_epochs=  3, 
                         weight_decay= 0.01, 
                         logging_dir= './logs',
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

def compute_metrics(s):
    preds = s.predictions.argmax(-1)
    labels = s.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average= 'binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy" : acc, "precision" : precision, "recall": recall, "f1" : f1}

In [None]:
data_collator = DataCollatorWithPadding(tokenizer= tokenizer)

split = tokenized_dataset.train_test_split(test_size= 0.2, seed = 42)
train_dataset = split["train"]
eval_dataset = split["test"]
trainer = Trainer(
            model = model,
            args= args,
            train_dataset= train_dataset,
            eval_dataset= eval_dataset,
            tokenizer = tokenizer,
            data_collator= data_collator,
            compute_metrics = compute_metrics,
)
trainer.train()

TrainOutput(global_step=3000, training_loss=0.26926785659790037, metrics={'train_runtime': 6278.4969, 'train_samples_per_second': 7.645, 'train_steps_per_second': 0.478, 'total_flos': 892267144269120.0, 'train_loss': 0.26926785659790037, 'epoch': 3.0})


In [10]:
predictions_output = trainer.predict(eval_dataset)
y_pred = predictions_output.predictions.argmax(-1)
y_true = predictions_output.label_ids



  0%|          | 0/250 [00:00<?, ?it/s]

In [12]:
results_df = pd.DataFrame({
    "text" : eval_dataset['cleaned_text'],
    'true' : y_true,
    'pred' : y_pred
})

false_positives = results_df[(results_df['true'] == 0) & (results_df['pred'] == 1)]
false_negatives = results_df[(results_df['true'] == 1) & (results_df['pred'] == 0)]

In [13]:
print(classification_report(y_true, y_pred, target_names= ["Not mental Health", "Mental health"]))

                   precision    recall  f1-score   support

Not mental Health       0.95      0.94      0.95      2029
    Mental health       0.94      0.95      0.94      1971

         accuracy                           0.94      4000
        macro avg       0.94      0.94      0.94      4000
     weighted avg       0.94      0.94      0.94      4000



In [16]:
false_negatives['text'].head(10)

52     rt : worrying about what’s going to happen blo...
128    you should carry yourself in a way as such. li...
286                   stop trying to redirect this to me
386                           you got that damn straight
398                                   thank you so much!
422    rt : if this is something you are able to help...
482    make sure that you take time every day to do s...
513    i'm not saying i'm qualified to be judge, jury...
523                             rt : the force is in you
570    when ppl on tv shows say they like to entertai...
Name: text, dtype: object

In [17]:
false_positives['text'].head(10)

4                                   love their tolerance
10     10 people followed me and 8 people unfollowed ...
22                                rt : *rick ross grunt*
35              rt : wiser words have never been spoken.
69                           rt : i make the city move!!
99     rt : the problem with depression &amp; anxiety...
118         rt : bro my best friend don't even text back
243    rt : life hack: don't expect stuff &amp; you'l...
280              rt : don't apologize, then do it again.
288    9 people followed me and 9 people unfollowed m...
Name: text, dtype: object

In [22]:
confusion_matrix(y_true, y_pred)

array([[1899,  130],
       [  91, 1880]], dtype=int64)