In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd

In [8]:
from sklearn.model_selection import train_test_split

In [19]:
!pip install gensim


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
import numpy as np

In [21]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [22]:
from collections import Counter

In [62]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [10]:
data = pd.read_csv('drive/My Drive/clean_text.csv', header=0)
data = data.iloc[:,1:]
data.head()

Unnamed: 0,text,label
0,mom made camp hate hate day time midnight call...,0
1,idea simple healthy meal make feel depressed s...,0
2,looming corner back wave moon hear silence fee...,1
3,foodand judged weird view delusional jail life...,1
4,stuck loop ruminating overthinking connect per...,0


In [11]:
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.001
SEED = 0

In [13]:
data = data.dropna()

In [58]:
train_text, temp_text, train_labels, temp_labels = train_test_split(data['text'], data['label'],
                                                                    random_state=SEED,
                                                                    test_size=0.3,
                                                                    stratify=data['label'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=SEED,
                                                                test_size=0.5,
                                                                stratify=temp_labels)

In [63]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocab_size = len(tokenizer.word_index) + 1

In [64]:
vocab_size

9900

In [65]:
vocab = Counter()
tokens_list = [(s.split()) for s in train_text]
for i in tokens_list:
  vocab.update(i)
min_occurance = 2
tokens = [k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

4939


In [66]:
def save_list(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()
 
save_list(vocab, 'vocab.txt')

In [67]:
def clean_line(line, vocab):
  tokens = line.split()
  tokens_clean = [w for w in tokens if w in vocab]
  return [tokens_clean]

def process_lines(data, vocab):
  lines = list()
  for i in data:
    line = clean_line(i, vocab)
    lines += line
  return lines

In [68]:
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)

In [69]:
model = Word2Vec(vector_size=200, window=3, min_count=2, epochs=18, seed=SEED)

In [70]:
model.build_vocab(train_clean, progress_per=200)

In [71]:
model.train(train_clean, total_examples=model.corpus_count, epochs=EPOCHS,report_delay=1)

(448416, 548610)

In [72]:
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [73]:
model.wv.most_similar('suicide')

[('method', 0.9995301365852356),
 ('note', 0.9995216131210327),
 ('writing', 0.9995215535163879),
 ('recently', 0.9994881749153137),
 ('call', 0.9994874596595764),
 ('attempt', 0.9994804263114929),
 ('decided', 0.9994590282440186),
 ('write', 0.9994583129882812),
 ('failed', 0.9994529485702515),
 ('start', 0.9994462728500366)]

In [74]:
def tokenize_and_encode(text, max_length=70):
    encoded_docs = tokenizer.texts_to_sequences(text)
    padded_sequence = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    return padded_sequence

tokens_train = tokenize_and_encode(train_text)
tokens_val = tokenize_and_encode(val_text)
tokens_test = tokenize_and_encode(test_text)

In [76]:
def load_embedding(filename):
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	embedding = dict()
	for line in lines:
		parts = line.split()
		embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
	return embedding

In [77]:
def get_weight_matrix(embedding, vocab, embedding_dim):
	vocab_size = len(vocab) + 1
	weight_matrix = np.zeros((vocab_size, embedding_dim))
	for word, i in vocab.items():
		weight_matrix[i] = embedding.get(word)

	return weight_matrix

In [78]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape[0], weights_matrix.shape[1]
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [79]:
raw_embedding_word2vec = load_embedding('embedding_word2vec.txt') 
embedding_vectors_word2vec = get_weight_matrix(raw_embedding_word2vec, tokenizer.word_index, 200)
embedding_vectors_word2vec = np.float32(embedding_vectors_word2vec)

In [80]:
from keras.layers import Embedding,Dense,LSTM,Bidirectional,GlobalMaxPooling1D,Input,Dropout,Conv1D,MaxPooling1D,Flatten


In [40]:
from tensorflow.keras.callbacks import EarlyStopping

In [83]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=70))
model.add(tf.keras.layers.BatchNormalization())
model.add(Conv1D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(8, activation='relu',kernel_regularizer='l1'))
model.add(tf.keras.layers.BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 70, 100)           990000    
                                                                 
 batch_normalization_8 (Batc  (None, 70, 100)          400       
 hNormalization)                                                 
                                                                 
 conv1d_16 (Conv1D)          (None, 69, 32)            6432      
                                                                 
 max_pooling1d_16 (MaxPoolin  (None, 23, 32)           0         
 g1D)                                                            
                                                                 
 dropout_24 (Dropout)        (None, 23, 32)            0         
                                                                 
 conv1d_17 (Conv1D)          (None, 19, 32)           

In [84]:
r=model.fit(tokens_train,train_labels,validation_data=(tokens_val,val_labels),epochs=5,batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
