In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
import torch
from transformers import BertTokenizer,BertForSequenceClassification,Trainer,TrainingArguments
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
from torch.utils.data import dataloader
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
import spacy
Preprocessing_Pipeline=spacy.load("en_core_web_sm")

In [4]:
df=pd.read_csv("Combined Data.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [6]:
df.drop(columns=["Unnamed: 0"],inplace=True)

In [7]:
df.isnull().sum()

statement    362
status         0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum()

statement    0
status       0
dtype: int64

In [10]:
df.status.value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [11]:
data=df.sample(n=6000,random_state=42).reset_index(drop=True)

In [12]:
data.status.value_counts()

status
Normal                  1894
Depression              1730
Suicidal                1219
Anxiety                  429
Stress                   304
Bipolar                  293
Personality disorder     131
Name: count, dtype: int64

In [13]:
def preprocess(sent):
    doc=Preprocessing_Pipeline(sent)
    sent=[word.lemma_.lower() for word in doc if not word.is_punct and not word.is_stop]
    return ' '.join(sent)

data['statement']=data['statement'].apply(preprocess)

In [14]:
from imblearn.over_sampling import RandomOverSampler
ros=RandomOverSampler(sampling_strategy='auto',random_state=42)

X=data.drop(columns=['status'])
Y=data['status']

X_resampled,Y_resampled =ros.fit_resample(X,Y)

data=pd.concat([X_resampled,Y_resampled],axis=1)

data['status'].value_counts()

status
Normal                  1894
Bipolar                 1894
Depression              1894
Suicidal                1894
Stress                  1894
Personality disorder    1894
Anxiety                 1894
Name: count, dtype: int64

In [15]:
data

Unnamed: 0,statement,status
0,lazy complain ba ihh,Normal
1,think wifi iphone break quot connect quot actu...,Normal
2,good tracking app try find app track overall m...,Bipolar
3,recently look reddit find place actually healt...,Depression
4,favorite thing,Normal
...,...,...
13253,afford therapist social anxiety depression adh...,Suicidal
13254,like die simple like suffer anymore.i turn bra...,Suicidal
13255,death day time set affair bedroom paint family...,Suicidal
13256,hard see live life kill know wonder elementary...,Suicidal


In [16]:
lb=LabelEncoder()

data['label']=lb.fit_transform(data.status)

data.head()

Unnamed: 0,statement,status,label
0,lazy complain ba ihh,Normal,3
1,think wifi iphone break quot connect quot actu...,Normal,3
2,good tracking app try find app track overall m...,Bipolar,1
3,recently look reddit find place actually healt...,Depression,2
4,favorite thing,Normal,3


In [17]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

In [18]:
train_text,test_text,train_labels,test_labels=train_test_split(data['statement'],data['label'],test_size=0.2,random_state=42,stratify=data.label)


In [19]:
train_text

7283     think break toe walk exit basement kick metal ...
1318     don t know like ha happen try sleep bed wake m...
1771     thing s keep end upset family seriously don t ...
1341     know thing currently stay well hate heart want...
2101     get house feel like shit grocery shop favorite...
                               ...                        
709      start believe like animal plant human life one...
6658     anxiety effect school life tw \n\n hi love lt;...
11416    constantly worry snap kid total stranger trivi...
1246                         lamictal good review lamictal
11167    quit weed start have nightmare diagnose ptsd f...
Name: statement, Length: 10606, dtype: object

In [20]:
train_encodings=tokenizer(list(train_text),padding=True,truncation=True,max_length=200)
test_encodings=tokenizer(list(test_text),padding=True,truncation=True,max_length=200)

In [23]:
train_dataset=Dataset.from_dict({'input_ids':train_encodings['input_ids'],"attention_mask":train_encodings['attention_mask'],'labels':train_labels.tolist()})
test_dataset=Dataset.from_dict({'input_ids':test_encodings['input_ids'],"attention_mask":test_encodings['attention_mask'],'labels':test_labels.tolist()})

In [24]:
model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=len(lb.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments
from accelerate import Accelerator

accelerator=Accelerator()
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=3,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Ensure the training step is wrapped in accelerator context if needed

trainer.train()

  0%|          | 0/1655 [00:00<?, ?it/s]