In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset

# Load Dataset


In [21]:
data = pd.read_csv('CombinedData.csv')
data.dropna(inplace=True)
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [22]:
data['status'].unique().tolist()

['Anxiety',
 'Normal',
 'Depression',
 'Suicidal',
 'Stress',
 'Bipolar',
 'Personality disorder']

In [23]:
data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
data.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [24]:
# data = data.sample(n=10000, random_state=42).reset_index(drop=True)

# Data Preprocessing 

In [None]:
# Cleaning of the text data
import re
import nltk

from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]","", text)
    words = text.split()
    words = [word for word in words if word not in stopwords]
    return " ".join(words) 

clean_text("This is a sample text! It contains punctuation, numbers 123, and stopwords like 'the' and 'is'.")
data['cleaned_text'] = data['statement'].apply(clean_text)

In [26]:
data.head()

Unnamed: 0,statement,status,cleaned_text
0,oh my gosh,Anxiety,oh gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,I've shifted my focus to something else but I'...,Anxiety,ive shifted focus something else im still worried
4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean


In [27]:
data['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [28]:
# Balancing the dataset
from imblearn.over_sampling import RandomOverSampler

ROS = RandomOverSampler(sampling_strategy='auto', random_state=42)

X = data.drop('status', axis=1)
Y = data['status']

In [29]:
X_resampled, Y_resampled = ROS.fit_resample(X,Y)
data_balanced = pd.concat([X_resampled, Y_resampled], axis=1)

data_balanced['status'].value_counts()

status
Anxiety                 16343
Normal                  16343
Depression              16343
Suicidal                16343
Stress                  16343
Bipolar                 16343
Personality disorder    16343
Name: count, dtype: int64

In [30]:
label_encoder = LabelEncoder()
data_balanced['label_encoded'] = label_encoder.fit_transform(data_balanced['status'])
data_balanced['label_encoded'].unique()

array([0, 3, 2, 6, 5, 1, 4])

In [31]:
data_balanced.head()

Unnamed: 0,statement,cleaned_text,status,label_encoded
0,oh my gosh,oh gosh,Anxiety,0
1,"trouble sleeping, confused mind, restless hear...",trouble sleeping confused mind restless heart ...,Anxiety,0
2,"All wrong, back off dear, forward doubt. Stay ...",wrong back dear forward doubt stay restless re...,Anxiety,0
3,I've shifted my focus to something else but I'...,ive shifted focus something else im still worried,Anxiety,0
4,"I'm restless and restless, it's been a month n...",im restless restless month boy mean,Anxiety,0


#### Splitting the data into train and test sets

In [32]:
X= data_balanced['cleaned_text']
Y = data_balanced['label_encoded']  

In [33]:
train_texts, test_texts, train_labels, test_labels = train_test_split(X,Y, test_size=0.2, random_state=42, stratify=Y)

#### Tokenization

In [35]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encoded = tokenizer(train_texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=250)
test_encoded = tokenizer(test_texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=250)

In [36]:
train_encoded

{'input_ids': tensor([[  101,  1046, 18863,  ...,     0,     0,     0],
        [  101,  2034,  2695,  ...,     0,     0,     0],
        [  101,  2514,  2066,  ...,  2102,  4067,   102],
        ...,
        [  101,  6721,  6911,  ...,     0,     0,     0],
        [  101,  2393,  1050,  ...,     0,     0,     0],
        [  101, 15446,  2131,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [38]:
train_dataset = Dataset.from_dict({
    'input_ids': train_encoded['input_ids'],
    'attention_mask': train_encoded['attention_mask'],
    'labels': train_labels.tolist()
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encoded['input_ids'],
    'attention_mask': test_encoded['attention_mask'],
    'labels': test_labels.tolist()
})

#### Fine Tuning BERT for Sentiment Classification

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data_balanced['label_encoded'].unique()))
trainer = Trainer(
    model=model, 
    args= TrainingArguments(
        output_dir='./results',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        save_total_limit=3,
        gradient_accumulation_steps=2
    ),
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)