In [5]:
!pip install transformers



In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV  #finding best model
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [7]:
df = pd.read_csv("Augdata.csv")
df = df.rename(columns = {'Text data': 'data'}, inplace = False)
df.head()

Unnamed: 0,PID,data,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",moderate
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,moderate
2,dev_pid_3,Best suicide method? : I like it quick and eas...,moderate
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,moderate
4,dev_pid_5,The world only cares about beautiful people : ...,moderate


In [8]:
df["Label"].value_counts()

Label
moderate          2306
not depression    1830
severe            1440
Name: count, dtype: int64

In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df["Label"] = le.fit_transform(df["Label"])
df.head()

Unnamed: 0,PID,data,Label
0,dev_pid_1,"I enjoyed today, and I still am! Tomorrows dep...",0
1,dev_pid_2,I sorta tried to kill myself : I had a total b...,0
2,dev_pid_3,Best suicide method? : I like it quick and eas...,0
3,dev_pid_4,a story : I remember the time I'd get on my 3D...,0
4,dev_pid_5,The world only cares about beautiful people : ...,0


In [10]:
!pip install nltk




In [11]:
import nltk
nltk.download('punkt')    #tokenizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
import nltk
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.', ',', '!', '?', ';', ':'])

def remove_stopwords(text):
  """
  Removes stopwords from a given text.

  Args:
    text: A string containing the text.

  Returns:
    A string with stopwords removed.
  """
  tokens = nltk.word_tokenize(text)
  filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
  return ' '.join(filtered_tokens)

df['data'] = df['data'].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,PID,data,Label
0,dev_pid_1,enjoyed today still Tomorrows depression wait ...,0
1,dev_pid_2,sorta tried kill total breakdown fucking car p...,0
2,dev_pid_3,Best suicide method like quick easy deformitie...,0
3,dev_pid_4,story remember time 'd get 3DS play Nintendogs...,0
4,dev_pid_5,world cares beautiful people 'm born ugly 've ...,0


In [13]:
# For BERT:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

## Want DistilBERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
tokenized = df["data"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True,truncation=True)))

In [15]:
print(tokenized)

0       [101, 5632, 2651, 2145, 4826, 2015, 6245, 3524...
1       [101, 4066, 2050, 2699, 3102, 2561, 12554, 823...
2       [101, 2190, 5920, 4118, 2066, 4248, 3733, 1336...
3       [101, 2466, 3342, 2051, 1005, 1040, 2131, 7605...
4       [101, 2088, 14977, 3376, 2111, 1005, 1049, 214...
                              ...                        
5571    [101, 1031, 1005, 1031, 1036, 1036, 1521, 5458...
5572    [101, 1031, 1005, 1031, 1036, 1036, 2342, 2393...
5573    [101, 1031, 1005, 1031, 1036, 1036, 24209, 130...
5574    [101, 1031, 1036, 1036, 1031, 1005, 6933, 1521...
5575    [101, 1031, 1005, 1031, 1036, 1036, 6224, 6040...
Name: data, Length: 5576, dtype: object


In [16]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [17]:
np.array(padded).shape

(5576, 512)

In [18]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(5576, 512)

In [19]:
import torch


In [20]:


input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)   # which token should be atended and which ignored

# # Truncate input_ids and attention_mask
# input_ids = input_ids[:, :max_length]
# attention_mask = attention_mask[:, :max_length]

# Proceed with the model inference
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)   #last hidden state


In [21]:
print(attention_mask.size())

torch.Size([5576, 512])


In [22]:
features = last_hidden_states[0][:,0,:].numpy()

In [23]:
labels = df["Label"]

In [24]:
x_train, x_test, y_train, y_test = train_test_split(features, labels)

# **LogisticRegression**

In [25]:
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)

In [26]:
import numpy as np
pred= lr_clf.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.60      0.67      0.63       575
           1       0.60      0.54      0.57       491
           2       0.89      0.83      0.86       328

    accuracy                           0.66      1394
   macro avg       0.70      0.68      0.69      1394
weighted avg       0.67      0.66      0.66      1394



# **DecisionTree**

In [27]:
from sklearn import tree

dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)

In [28]:
import numpy as np
pred= dt_clf.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.55      0.52      0.53       575
           1       0.51      0.50      0.51       491
           2       0.67      0.74      0.70       328

    accuracy                           0.57      1394
   macro avg       0.58      0.59      0.58      1394
weighted avg       0.56      0.57      0.56      1394



# **RandomForest**

In [29]:
from sklearn.ensemble import RandomForestClassifier

model= RandomForestClassifier(n_estimators=100)

model.fit(x_train, y_train)

In [30]:
import numpy as np
pred= model.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.60      0.79      0.68       575
           1       0.67      0.49      0.57       491
           2       0.96      0.78      0.86       328

    accuracy                           0.68      1394
   macro avg       0.74      0.69      0.70      1394
weighted avg       0.71      0.68      0.68      1394



# **SVM**

In [31]:
from sklearn.svm import SVC

model = SVC()

model.fit(x_train, y_train)

In [32]:
import numpy as np
pred= model.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.59      0.83      0.69       575
           1       0.69      0.47      0.56       491
           2       0.99      0.77      0.86       328

    accuracy                           0.69      1394
   macro avg       0.76      0.69      0.70      1394
weighted avg       0.72      0.69      0.68      1394



In [33]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.22.3 xgboost-2.1.0


In [34]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(x_train, y_train)

In [35]:
import numpy as np
pred= model.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.60      0.75      0.67       575
           1       0.65      0.53      0.58       491
           2       0.94      0.79      0.86       328

    accuracy                           0.68      1394
   macro avg       0.73      0.69      0.70      1394
weighted avg       0.70      0.68      0.68      1394

