In [None]:
#installing transformers in colab only -- remove if using on diff tpu
!pip install transformers



In [None]:
#importing relevant libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

import transformers as ppb
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides any warnings raised by the libraries used in later cells; consider
# narrowing the filter while debugging.
warnings.filterwarnings('ignore')

In [None]:
# Load the prepared dataset from CSV. batch_1 keeps only the two columns the
# rest of the notebook works with: the cleaned post text and its label.
df = pd.read_csv('data_for_model.csv')
model_columns = ['megatext_clean', 'is_suicide']
batch_1 = df[model_columns]

print(batch_1)

                                         megatext_clean  is_suicide
0     sql witch understand people reply immediately ...           0
1     c irc welcome r depression check post place ta...           0
2     new killer 69 feeling really depressed lonely ...           0
3     jazz le crab literally broke cry asked go home...           0
4     depressed kid 786 kind soul want give depresse...           0
...                                                 ...         ...
1892  big pete 543 hard get bed every morning right ...           1
1893  eli sbt low iq twice tested time got iq around...           1
1894         gay h ovum feel useless im useless useless           1
1895  ev e wish wa way could non existence regret en...           1
1896  pon k ichi want die think want feel pain want ...           1

[1897 rows x 2 columns]


In [None]:
# Class balance: number of posts per label (1 = suicidal, 0 = depressed).
label_column = batch_1['is_suicide']
label_column.value_counts()

1    980
0    917
Name: is_suicide, dtype: int64

In [None]:
# BERT base (uncased): choose the model class, the tokenizer class and the
# name of the pretrained checkpoint to load.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

# Instantiate the tokenizer and the model from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every post as a list of BERT token ids, adding the special tokens and
# truncating anything longer than 512 tokens.
def _encode_post(text):
    """Return the token ids for a single post."""
    return tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)

tokenized = batch_1['megatext_clean'].apply(_encode_post)

In [None]:
# Length of the longest encoded post (0 when there are no posts).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every id list with zeros so that all rows share that length, then
# stack them into a single 2-D array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# Dimensions of the padded id matrix (rows, padded length).
padded.shape

(1897, 512)

In [None]:
# Attention mask: 0 wherever the id is the padding value 0, 1 everywhere else,
# so the padding added above can be told apart from real tokens.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

(1897, 512)

In [None]:
# Convert the padded ids and the mask (NumPy arrays) into tensors for BERT.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Rows per forward pass. Feeding every row at once forces the model to hold
# the intermediate activations for the whole dataset simultaneously, which can
# exhaust memory; lower this value if memory is still tight.
BATCH_SIZE = 32

hidden_chunks, pooled_chunks = [], []
with torch.no_grad():  # inference only, so no gradients are tracked
    for start in range(0, len(input_ids), BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_output = model(input_ids[start:stop],
                             attention_mask=attention_mask[start:stop])
        hidden_chunks.append(batch_output[0])  # last-layer hidden state of every token
        pooled_chunks.append(batch_output[1])  # pooled per-sequence output

# Re-assemble the per-batch results with the same positional layout as a single
# model call: index 0 is the last-layer hidden states for all rows, index 1 the
# pooled output. This is a plain tuple, so access it by index ([0] / [1]).
last_hidden_states = (torch.cat(hidden_chunks), torch.cat(pooled_chunks))

In [None]:
# Use the BERT output as the classifier input instead of the raw text column:
# a scikit-learn classifier cannot be fitted on strings. For every post, take
# the last-layer hidden state at position 0 (the [CLS] special token added
# during encoding) as one fixed-length numeric feature vector.
features = last_hidden_states[0][:, 0, :].numpy()
print(features.shape)

In [None]:
# Target vector: the is_suicide column of the full dataframe.
labels = df.is_suicide
print(labels)

In [None]:
# Fixed seed so the same rows land in the train and test sets on every run;
# without it the split, and every score computed below, changes between runs.
SPLIT_SEED = 42
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=SPLIT_SEED
)

In [None]:
# Classifier stage: logistic regression for now. Once a neural network is
# chosen, substitute it here and in the cells that follow.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

In [None]:
# Score of the fitted classifier on the held-out test split.
lr_clf.score(X=test_features, y=test_labels)

In [None]:
# Baseline for comparison with the score above: cross-validate a dummy
# classifier on the training split and report its mean score and spread.
from sklearn.dummy import DummyClassifier

clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)

baseline_mean = scores.mean()
baseline_spread = scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (baseline_mean, baseline_spread))