In [1]:
import pandas as pd
import pickle
import random
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import spacy
import numpy as np

In [2]:
dataset = pd.read_csv('Suicide_Data.csv')

dataset = dataset.dropna()
dataset = dataset.drop_duplicates()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1777 entries, 0 to 1786
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    1777 non-null   object
 1   Suicide  1777 non-null   object
dtypes: object(2)
memory usage: 41.6+ KB


In [3]:
category_counts = dataset['Suicide'].value_counts()
category_counts

Suicide
Not Suicide post           1124
Potential Suicide post      653
Name: count, dtype: int64

In [4]:
not_suicide = dataset[dataset['Suicide'] == 'Not Suicide post']
potential_suicide = dataset[dataset['Suicide'] == 'Potential Suicide post ']

not_suicide_downsampled = resample(
    not_suicide, 
    replace=False,             
    n_samples=653,             
    random_state=42            
)

balanced_dataset = pd.concat([not_suicide_downsampled, potential_suicide])

print("Distribusi sebelum balancing:")
print(dataset['Suicide'].value_counts())

print("Distribusi setelah balancing:")
print(balanced_dataset['Suicide'].value_counts())

Distribusi sebelum balancing:
Suicide
Not Suicide post           1124
Potential Suicide post      653
Name: count, dtype: int64
Distribusi setelah balancing:
Suicide
Not Suicide post           653
Potential Suicide post     653
Name: count, dtype: int64


In [5]:
tweetList = balanced_dataset['Tweet'].to_list()
labelList = balanced_dataset['Suicide'].to_list()

tfidfVectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english')
tfidfMatrix = tfidfVectorizer.fit_transform(tweetList)



In [6]:
unique_categories = pd.DataFrame(dataset['Suicide'].unique())
unique_categories

Unnamed: 0,0
0,Not Suicide post
1,Potential Suicide post


In [7]:
# Preprocessing

eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
stemming = SnowballStemmer('english')
wnl  = WordNetLemmatizer()

def removeStopwords(wordList):
    removed = []
    for word in wordList:
        if word not in eng_stopwords:
            removed.append(word)
            
    return removed 
            
def removePunctuation(wordList):
    removed = []
    for word in wordList:
        if word not in punctuation_list:
            removed.append(word)
            
    return removed

def removeNumber(wordList):
    removed = []
    for word in wordList:
        if word.isalpha(): 
            removed.append(word)
    return removed

def stemmingWord(wordList):
    removed = []
    for word in wordList:
        removed.append(stemming.stem(word))
        
    return removed

def getTag(tag):
    if tag =='jj':
        return 'a'
    elif tag in ['vb', 'nn', 'rb']:
        return tag[0]
    else:
        return None
    
def lemmatizingWord(wordList):
    lemmatizing = []
    tagging = pos_tag(wordList)
    for word, tag in tagging: 
        label = getTag(tag.lower())
        
        if label != None:
            lemmatizing.append(wnl.lemmatize(word, label))
        else:
            lemmatizing.append(wnl.lemmatize(word))
            
    return lemmatizing

In [8]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = removeStopwords(words)
    words = removePunctuation(words)
    words = removeNumber(words)
    words = stemmingWord(words)
    words = lemmatizingWord(words)
    return ' '.join(words)

In [9]:
preprocessed_tweets = [preprocess_text(tweet) if isinstance(tweet, str) else '' for tweet in tweetList]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_tweets)
y = labelList

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
### NAIVE BAYES MODEL

labeled_list = list(zip(preprocessed_tweets, y))
features_sets = []
for sentence, label in labeled_list:
    features = {}
    check_list = preprocess_text(sentence).split()
    for word in vectorizer.get_feature_names_out():
        features[word] = (word in check_list)
    features_sets.append((features, label))

random.shuffle(features_sets)
train_count = int(len(features_sets) * 0.7)
train_dataset = features_sets[:train_count]
test_dataset = features_sets[train_count:]

naive_bayes_classifier = NaiveBayesClassifier.train(train_dataset)
nb_accuracy = accuracy(naive_bayes_classifier, test_dataset)
true_labels = [label for _, label in test_dataset]
predicted_labels = [naive_bayes_classifier.classify(features) for features, _ in test_dataset]

print(f'Naive Bayes Accuracy: {accuracy(naive_bayes_classifier, test_dataset) * 100:.2f}%')

trueLabels = [label for _, label in test_dataset]
predictedLabels = [naive_bayes_classifier.classify(features) for features, _ in test_dataset]

print("Naive Bayes Classification Report:")
print(classification_report(trueLabels, predictedLabels, zero_division=0))

Naive Bayes Accuracy: 91.84%
Naive Bayes Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.90      0.94      0.92       193
Potential Suicide post        0.94      0.89      0.92       199

               accuracy                           0.92       392
              macro avg       0.92      0.92      0.92       392
           weighted avg       0.92      0.92      0.92       392



In [17]:
### SVM MODEL

svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm, zero_division=0))

SVM Accuracy: 92.35%
SVM Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.91      0.95      0.93       208
Potential Suicide post        0.94      0.89      0.92       184

               accuracy                           0.92       392
              macro avg       0.93      0.92      0.92       392
           weighted avg       0.92      0.92      0.92       392



In [18]:
### RANDOM FOREST MODEL

random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=0))


Random Forest Accuracy: 93.11%
Random Forest Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.91      0.97      0.94       208
Potential Suicide post        0.96      0.89      0.92       184

               accuracy                           0.93       392
              macro avg       0.94      0.93      0.93       392
           weighted avg       0.93      0.93      0.93       392



In [19]:
### LOGISTIC REGRESSION MODEL

log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, zero_division=0))


Logistic Regression Accuracy: 92.09%
Logistic Regression Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.90      0.95      0.93       208
Potential Suicide post        0.94      0.89      0.91       184

               accuracy                           0.92       392
              macro avg       0.92      0.92      0.92       392
           weighted avg       0.92      0.92      0.92       392



In [21]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

xgboost_model = XGBClassifier(random_state=42)
xgboost_model.fit(X_train, y_train_encoded)

y_pred_xgb = xgboost_model.predict(X_test)

xgb_accuracy = accuracy_score(y_test_encoded, y_pred_xgb)
print(f"XGBoost Accuracy: {xgb_accuracy * 100:.2f}%")
print("XGBoost Classification Report:")
print(classification_report(y_test_encoded, y_pred_xgb, zero_division=0))


XGBoost Accuracy: 91.33%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       208
           1       0.91      0.90      0.91       184

    accuracy                           0.91       392
   macro avg       0.91      0.91      0.91       392
weighted avg       0.91      0.91      0.91       392



In [23]:
### SAVE BEST MODEL

results = {
    "Naive Bayes": nb_accuracy,
    "SVM": svm_accuracy,
    "Random Forest": rf_accuracy,
    "Logistic Regression": lr_accuracy,
    "XGBoost": xgb_accuracy
}

best_model_name = max(results, key=results.get)
if best_model_name == "Naive Bayes":
    best_model = naive_bayes_classifier
else:
    best_model = eval(best_model_name.lower().replace(" ", "_"))

with open("best_model_rf.pickle", "wb") as file:
    pickle.dump(best_model, file)

with open("vectorize.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print(f"Best Model: {best_model_name} with Accuracy: {results[best_model_name] * 100:.2f}%")


Best Model: Random Forest with Accuracy: 93.11%
