In [1]:
# General
import os
import warnings
import numpy as np
import pandas as pd
import re

# Data Preps
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Stopwords removal and stemming
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from hunspell import Hunspell

# Oversampling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN

# FastText
import fasttext
import fasttext.util

# LSTM with keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

# Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)




In [2]:
# Load data: read the tweet dataset from the Excel workbook (relative path)
filepath = "../dataset/final_dataset.xlsx"

# Only the sheet named "10k" is read
df = pd.read_excel(filepath, sheet_name="10k")
X = df['tweet']  # text column used as model input
y = df['label']  # target column

# Import Pre-trained fastText model on Indonesian language

In [3]:
# Load the pre-trained fastText model for the Indonesian language.
# NOTE: `filepath` is reused here; from this point on it refers to the
# fastText binary, no longer to the dataset workbook.
filepath = "../../fasttext_pretrained_model_indonesia/cc.id.300.bin"

fasttext_model = fasttext.load_model(filepath)

# Text Preprocessing

In [4]:
def case_folding(df, column_name):
    """Return a copy of ``df`` in which the text of ``column_name`` is lower-cased.

    The input frame is not modified; all other columns are carried over as-is.
    """
    result = df.copy()
    lowered = result[column_name].str.lower()
    result[column_name] = lowered
    return result

def remove_punctuation(df, column_name):
    """Return a copy of ``df`` with punctuation stripped from ``column_name``.

    Every character that is neither a regex word character nor whitespace is
    deleted. The input frame is not modified.
    """
    not_word_or_space = r'[^\w\s]'
    result = df.copy()
    cleaned = result[column_name].str.replace(not_word_or_space, '', regex=True)
    result[column_name] = cleaned
    return result

def remove_symbols(df, column_name):
    """Return a copy of ``df`` with non-ASCII characters removed from ``column_name``.

    Runs of characters outside the 0x00-0x7F range are deleted from each value.
    The input frame is not modified.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')

    def _strip(value):
        # Same substitution for every cell of the column
        return non_ascii_run.sub('', value)

    result = df.copy()
    result[column_name] = result[column_name].apply(_strip)
    return result

def remove_punctuation_and_sc(df, column_name):
    """Return a copy of ``df`` keeping only ASCII letters, digits and whitespace.

    Everything else in ``column_name`` (punctuation, underscores, non-ASCII
    characters) is deleted. The input frame is not modified.
    """
    disallowed = r'[^a-zA-Z0-9\s]'
    result = df.copy()
    cleaned = result[column_name].str.replace(disallowed, '', regex=True)
    result[column_name] = cleaned
    return result

def remove_mentions_hashtags(df, column_name):
    """Return a copy of ``df`` with @mentions and #hashtags removed from ``column_name``.

    A mention/hashtag is ``@`` or ``#`` followed by one or more word characters.
    Surrounding whitespace is left in place. The input frame is not modified.
    """
    mention_or_hashtag = re.compile(r'@\w+|\#\w+')

    def _strip(value):
        # Same substitution for every cell of the column
        return mention_or_hashtag.sub('', value)

    result = df.copy()
    result[column_name] = result[column_name].apply(_strip)
    return result

def remove_hyperlink(df, column_name):
    """Return a copy of ``df`` with hyperlinks removed from ``column_name``.

    Anything starting with ``http`` up to the next whitespace is deleted.
    The input frame is not modified.
    """
    link = re.compile(r'http\S+')

    def _strip(value):
        # Same substitution for every cell of the column
        return link.sub('', value)

    result = df.copy()
    result[column_name] = result[column_name].apply(_strip)
    return result

def remove_redundant_whitespace(df, column_name):
    """Return a copy of ``df`` with whitespace in ``column_name`` normalised.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is stripped. The input frame is not
    modified.
    """
    new_df = df.copy()
    # The pattern must be interpreted as a regular expression; with regex=False
    # pandas would look for the literal text "\s+" and leave the runs untouched.
    new_df[column_name] = new_df[column_name].str.replace(r'\s+', ' ', regex=True).str.strip()

    return new_df

In [5]:
# Build the Sastrawi stop-word collection once at module level so that
# do_stopwords_sastrawi can reuse it for every tweet.
factory = StopWordRemoverFactory()
stopwords_sastrawi = factory.get_stop_words()

def do_stopwords_sastrawi(text):
    """Drop every whitespace-separated token of ``text`` found in ``stopwords_sastrawi``.

    The surviving tokens are re-joined with single spaces, in their original order.
    """
    kept = []
    for token in text.split():
        if token not in stopwords_sastrawi:
            kept.append(token)
    return " ".join(kept)

def remove_stopwords(df, column_name):
    """Return a copy of ``df`` with stop words removed from ``column_name``.

    Each value of the column is passed through ``do_stopwords_sastrawi``.
    The input frame is not modified.
    """
    result = df.copy()
    filtered = result[column_name].apply(do_stopwords_sastrawi)
    result[column_name] = filtered
    return result

In [6]:
# Load the Indonesian Hunspell dictionary (base name "id_ID"); it is used for
# stemming by word_hunspell below, not for stop-word removal.
current_directory = os.getcwd()
filepath = os.path.join(current_directory, "..", "hunspell-id-main", "id_ID")

# NOTE(review): the same path is passed for both positional arguments —
# confirm this matches the parameters the Hunspell constructor expects.
h = Hunspell(filepath,filepath)

def word_hunspell(word):
    """Return the first Hunspell stem of ``word``, or ``word`` itself as a fallback.

    The word is returned unchanged when Hunspell yields no stems or when
    stemming raises ``UnicodeEncodeError``.
    """
    try:
        candidates = h.stem(word)
    except UnicodeEncodeError:
        # Words Hunspell cannot encode are kept as they are
        return word
    if len(candidates) == 0:
        return word
    return candidates[0]

def stem_hunspell(text):
    """Stem every whitespace-separated word of ``text`` with ``word_hunspell``.

    The stemmed words are re-joined with single spaces.
    """
    return ' '.join(map(word_hunspell, text.split()))

def stemming(df, column_name):
    """Return a copy of ``df`` whose ``column_name`` text has been stemmed.

    Each value of the column is passed through ``stem_hunspell``.
    The input frame is not modified.
    """
    result = df.copy()
    stemmed = result[column_name].apply(stem_hunspell)
    result[column_name] = stemmed
    return result

In [7]:
def preprocessing(df, column_name):
    """Run the full text-cleaning pipeline on ``column_name`` and return a new frame.

    The steps are applied in a fixed order: mentions/hashtags, hyperlinks,
    punctuation and special characters, case folding, stop words, redundant
    whitespace, and finally stemming. The input frame is not modified.
    """
    # Order matters: e.g. links and mentions must go before punctuation is
    # stripped, otherwise their "@", "#" and "://" markers would already be gone.
    steps = (
        remove_mentions_hashtags,
        remove_hyperlink,
        remove_punctuation_and_sc,
        case_folding,
        remove_stopwords,
        remove_redundant_whitespace,
        stemming,
    )

    result = df.copy()
    for step in steps:
        result = step(result, column_name)
    return result

In [8]:
#df_cf = case_folding(df, 'tweet')
#df_rsw = remove_stopwords(df, 'tweet')
#df_stem = stemming(df, 'tweet')
#df_rpunc = remove_punctuation(df, 'tweet')
#df_rsym = remove_symbols(df, 'tweet')
#df_rpsc = remove_punctuation_and_sc(df, 'tweet')
#df_rrw = remove_redundant_whitespace(df, 'tweet')
#df_rmh = remove_mentions_hashtags(df, 'tweet')
#df_rhl = remove_hyperlink(df, 'tweet')
#df_preprocessed = preprocessing(df, 'tweet')

# LSTM

In [9]:
# Prepare the tweets and labels for each respective preprocessing step (to be compared using the evaluation metrics)

#X = df['tweet']
#y = df['label']

#X = df_cf['tweet']
#y = df_cf['label']

#X = df_rsw['tweet']
#y = df_rsw['label']

#X = df_stem['tweet']
#y = df_stem['label']

#X = df_rpunc['tweet']
#y = df_rpunc['label']

#X = df_rsym['tweet']
#y = df_rsym['label']

#X = df_rpsc['tweet']
#y = df_rpsc['label']

#X = df_rrw['tweet']
#y = df_rrw['label']

#X = df_rmh['tweet']
#y = df_rmh['label']

#X = df_rhl['tweet']
#y = df_rhl['label']

#X = df_preprocessed['tweet']
#y = df_preprocessed['label']

## No Imbalanced Class Treatment

In [10]:
# Split the data into training (80%) and validation sets, with a fixed seed
# so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)

In [11]:
# Tokenize the data: the tokenizer is fitted on the training texts only, then
# both splits are converted with that same tokenizer.
# NOTE: X_train / X_val are overwritten here (texts -> sequences), so re-running
# this cell on its own would feed sequences, not texts, back into the tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

In [12]:
# Pad the sequences to a common length.
# The length is taken from the longest tokenized *training* sequence rather
# than from whitespace word counts over the whole dataset: the tokenizer's
# token count can differ from len(text.split()), and validation data should
# not influence the model's input shape. Longer validation sequences are
# truncated by pad_sequences.
max_length = max(len(sequence) for sequence in X_train)
X_train = pad_sequences(X_train, maxlen=max_length)
X_val = pad_sequences(X_val, maxlen=max_length)

In [13]:
# Dimensionality of the fastText word vectors (number of matrix columns)
dim = fasttext_model.get_dimension()

# Create the embedding matrix: row i holds the fastText vector of the word
# whose tokenizer index is i. One row more than the vocabulary size is
# allocated; rows that are never assigned stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = fasttext_model.get_word_vector(word)
    # NOTE(review): presumably fastText always returns a vector (also for
    # out-of-vocabulary words), making this check always true — confirm.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
# Display the embedding matrix as a DataFrame for a visual sanity check
pd.DataFrame(embedding_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.002437,-0.043058,-0.018707,0.128495,-0.018542,-0.101053,-0.034554,0.143302,-0.066489,-0.113288,...,-0.069537,-0.042619,-0.024802,-0.008546,-0.021058,-0.034511,0.009614,-0.020076,-0.057329,0.047373
2,0.006226,-0.039082,-0.188137,0.121136,-0.001375,-0.017565,0.021986,0.010326,-0.003909,-0.072086,...,0.007077,-0.125213,0.051104,-0.055601,-0.034176,-0.043827,-0.025502,-0.045835,-0.027698,0.121965
3,0.051436,-0.028671,-0.022541,0.191665,-0.029133,-0.184469,0.045916,0.121244,0.051155,-0.261428,...,0.032695,-0.007114,0.101565,0.101271,0.069635,-0.078312,0.066119,-0.240654,-0.107772,0.228045
4,-0.039191,-0.041498,-0.063466,0.100154,-0.048755,-0.259327,-0.061933,-0.004503,0.026468,-0.137056,...,-0.026037,0.006308,0.079056,-0.005207,0.022232,-0.019059,-0.042259,-0.080854,-0.021680,0.169146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23680,0.011723,0.061613,-0.031122,0.046441,0.031312,-0.056365,-0.029583,0.016447,0.030902,0.011736,...,0.008631,-0.019576,-0.059615,0.000401,-0.001603,-0.054475,0.006014,0.052755,-0.002003,0.057023
23681,-0.025238,0.014615,0.002253,0.030075,0.011921,-0.003852,-0.050046,-0.008946,-0.045779,-0.030470,...,-0.029670,0.030701,-0.035793,-0.002349,0.016751,0.002381,0.027547,-0.018048,-0.048202,-0.011137
23682,-0.011442,-0.030182,0.025152,0.030010,-0.036651,0.007234,-0.007530,-0.041400,0.014827,-0.066604,...,-0.016342,0.007197,-0.002522,-0.029678,-0.007912,-0.007289,0.048508,-0.065911,0.002160,0.013772
23683,-0.024868,0.088420,0.243243,0.129497,-0.136866,0.021089,-0.103722,0.031331,0.137365,-0.115415,...,-0.090035,0.075169,-0.143936,0.024632,-0.108176,0.055342,-0.028520,-0.065496,0.151506,0.125149


In [26]:
# Baseline LSTM classifier (no class-imbalance treatment).
# The embedding layer starts from the fastText matrix and is kept frozen.
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: binary classification
model.add(Dense(1, activation='sigmoid'))
# Track accuracy during training, consistent with the class-weighted and
# oversampled model variants in this notebook.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# 10% of the training data is held out by Keras for per-epoch validation
model.fit(X_train, y_train, epochs=10, batch_size=8, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e63b430d00>

In [27]:
# Evaluate the baseline model on the validation set: the sigmoid outputs are
# rounded to hard 0/1 predictions before computing the metrics.
y_pred = np.round(model.predict(X_val))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1517  117]
 [  58  334]]
              precision    recall  f1-score   support

           0       0.96      0.93      0.95      1634
           1       0.74      0.85      0.79       392

    accuracy                           0.91      2026
   macro avg       0.85      0.89      0.87      2026
weighted avg       0.92      0.91      0.92      2026



In [35]:
# Save the baseline (no imbalance treatment) model to an .h5 file next to the notebook
model_name = './suicide_detection_no_treatment.h5'
model.save(model_name)

# Class Weighting

In [28]:
# Distinct class labels present in the training targets
class_labels = np.unique(y_train)
# With class_weight='balanced', compute_class_weight returns one weight per
# label (despite the variable name, these are weights, not raw frequencies).
class_freq = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)

# Map each label to its weight, in the form expected by model.fit(class_weight=...)
class_weight = {label: weight for label, weight in zip(class_labels, class_freq)}

In [29]:
# LSTM classifier trained with class weighting.
# Same architecture as the other variants: frozen fastText embeddings,
# one LSTM layer, one sigmoid output unit.
model = Sequential([
    Embedding(
        len(tokenizer.word_index) + 1,
        dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# class_weight rescales each sample's loss contribution by its label's weight
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=8,
    validation_split=0.1,
    class_weight=class_weight,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e640e46f70>

In [30]:
# Evaluate the class-weighted model on the validation set: the sigmoid outputs
# are rounded to hard 0/1 predictions before computing the metrics.
y_pred = np.round(model.predict(X_val))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1423  211]
 [  32  360]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1634
           1       0.63      0.92      0.75       392

    accuracy                           0.88      2026
   macro avg       0.80      0.89      0.83      2026
weighted avg       0.91      0.88      0.89      2026



In [35]:
# Save the class-weighted model to an .h5 file next to the notebook
model_name = './suicide_detection_cw_final.h5'
model.save(model_name)

# Oversampling

### ADASYN

In [31]:
# Oversample the minority class of the training data with ADASYN.
# A fixed random_state (same seed as the train/validation split) makes the
# synthetic samples reproducible across runs.
# NOTE(review): the resampler operates on the padded token-index sequences in
# X_train; confirm that synthesising samples in that space is intended.
adasyn = ADASYN(random_state=1)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train,y_train)

In [32]:
# LSTM classifier trained on the ADASYN-resampled training data.
# Same architecture as the other variants: frozen fastText embeddings,
# one LSTM layer, one sigmoid output unit.
model = Sequential([
    Embedding(
        len(tokenizer.word_index) + 1,
        dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): validation_split holds out a fraction of the arrays as passed;
# confirm the resampled data's ordering gives a representative hold-out.
model.fit(X_adasyn, y_adasyn, epochs=10, batch_size=8, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e62f0aaf40>

In [33]:
# Evaluate the ADASYN-trained model on the (untouched) validation set: the
# sigmoid outputs are rounded to hard 0/1 predictions before computing metrics.
y_pred = np.round(model.predict(X_val))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1376  258]
 [  46  346]]
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      1634
           1       0.57      0.88      0.69       392

    accuracy                           0.85      2026
   macro avg       0.77      0.86      0.80      2026
weighted avg       0.89      0.85      0.86      2026



In [34]:
# Save the ADASYN-trained model to an .h5 file next to the notebook
model_name = './suicide_detection_adasyn.h5'
model.save(model_name)

### SMOTEENN

In [36]:
# Resample the training data with SMOTEENN (combined over- and under-sampling).
# A fixed random_state (same seed as the train/validation split) makes the
# resampled set reproducible across runs.
# NOTE(review): the resampler operates on the padded token-index sequences in
# X_train; confirm that synthesising samples in that space is intended.
smoteenn = SMOTEENN(random_state=1)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train,y_train)

In [37]:
# LSTM classifier trained on the SMOTEENN-resampled training data.
# Same architecture as the other variants: frozen fastText embeddings,
# one LSTM layer, one sigmoid output unit.
model = Sequential([
    Embedding(
        len(tokenizer.word_index) + 1,
        dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): validation_split holds out a fraction of the arrays as passed;
# confirm the resampled data's ordering gives a representative hold-out.
model.fit(X_smoteenn, y_smoteenn, epochs=10, batch_size=8, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e659687d90>

In [38]:
# Evaluate the SMOTEENN-trained model on the (untouched) validation set: the
# sigmoid outputs are rounded to hard 0/1 predictions before computing metrics.
y_pred = np.round(model.predict(X_val))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1228  406]
 [  43  349]]
              precision    recall  f1-score   support

           0       0.97      0.75      0.85      1634
           1       0.46      0.89      0.61       392

    accuracy                           0.78      2026
   macro avg       0.71      0.82      0.73      2026
weighted avg       0.87      0.78      0.80      2026

