In [1]:
import pandas as pd
import pickle
import random
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.utils import resample

In [2]:
dataset = pd.read_csv('Suicide_Data.csv')

dataset = dataset.dropna()
dataset = dataset.drop_duplicates()

In [6]:
category_counts = dataset['Suicide'].value_counts()
category_counts

Suicide
Not Suicide post           1124
Potential Suicide post      653
Name: count, dtype: int64

In [7]:
not_suicide = dataset[dataset['Suicide'] == 'Not Suicide post']
potential_suicide = dataset[dataset['Suicide'] == 'Potential Suicide post ']

not_suicide_downsampled = resample(
    not_suicide, 
    replace=False,             # Tidak mengganti sampel
    n_samples=653,             # Jumlah yang diinginkan
    random_state=42            # Untuk reproduktifitas
)

# Menggabungkan kembali dataset setelah undersampling
balanced_dataset = pd.concat([not_suicide_downsampled, potential_suicide])

# Memeriksa distribusi sebelum balancing
print("Distribusi sebelum balancing:")
print(dataset['Suicide'].value_counts())

# Setelah undersampling dan menggabungkan kembali, memeriksa distribusi lagi
print("Distribusi setelah balancing:")
print(balanced_dataset['Suicide'].value_counts())

Distribusi sebelum balancing:
Suicide
Not Suicide post           1124
Potential Suicide post      653
Name: count, dtype: int64
Distribusi setelah balancing:
Suicide
Not Suicide post           653
Potential Suicide post     653
Name: count, dtype: int64


In [8]:
tweetList = balanced_dataset['Tweet'].to_list()
labelList = balanced_dataset['Suicide'].to_list()

vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english')
tfidfMatrix = vectorizer.fit_transform(tweetList)



In [9]:
unique_categories = pd.DataFrame(dataset['Suicide'].unique())
unique_categories

Unnamed: 0,0
0,Not Suicide post
1,Potential Suicide post


In [10]:
# Preprocessing

eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
stemming = SnowballStemmer('english')
wnl  = WordNetLemmatizer()

def removeStopwords(wordList):
    # return [word for word in wordList if word not in eng_stopwords]
    removed = []
    for word in wordList:
        if word not in eng_stopwords:
            removed.append(word)
            
    return removed 
            
def removePunctuation(wordList):
    # return [word for word in wordList if word not in punctuation_list]
    removed = []
    for word in wordList:
        if word not in punctuation_list:
            removed.append(word)
            
    return removed

def removeNumber(wordList):
    # return [word for word in wordList if word.isalpha()]
    removed = []
    for word in wordList:
        if word.isalpha(): # alpha = alphabet
            removed.append(word)
    return removed

def stemmingWord(wordList):
    # return [stemming.stem(word) for word in wordList]
    removed = []
    for word in wordList:
        removed.append(stemming.stem(word))
        
    return removed


# JJ, NN, RB, VB
# a, n, r, v
# -> adjective, noun, adverb, verb

def getTag(tag):
    if tag =='jj':
        return 'a'
    elif tag in ['vb', 'nn', 'rb']:
        return tag[0]
    else:
        return None
    
def lemmatizingWord(wordList):
    lemmatizing = []
    tagging = pos_tag(wordList)
    for word, tag in tagging: # disini soalnya dia JJ jadi dibikin lower biar jj
        label = getTag(tag.lower())
        
        if label != None:
            lemmatizing.append(wnl.lemmatize(word, label))
        else:
            lemmatizing.append(wnl.lemmatize(word))
            
    return lemmatizing

In [20]:
def trainingModel():
    wordList = []
    
    for sentence in tweetList:
        if not isinstance(sentence, str):
            sentence = str(sentence)
        words = word_tokenize(sentence)
        for word in words:
            wordList.append(word.lower())
            
    wordList = removeStopwords(wordList)
    wordList = removePunctuation(wordList)
    wordList = removeNumber(wordList)
    wordList = stemmingWord(wordList)
    wordList = lemmatizingWord(wordList)
    
    fd = FreqDist(wordList)
    wordFeatures = [word for word, _ in fd.most_common()]
    
    labeledList = list(zip(tweetList, labelList))
    
    featuresSets = []
    
    for sentence, label in labeledList:
        features = {}
        
        if not isinstance(sentence, str):
            sentence = str(sentence)
            
        checkList = word_tokenize(sentence)
        checkList = removeStopwords(checkList)
        checkList = removePunctuation(checkList)
        checkList = removeNumber(checkList)
        checkList = stemmingWord(checkList)
        checkList = lemmatizingWord(checkList)
        
        for word in wordFeatures:
            features[word] = (word in checkList)
        featuresSets.append((features, label))
        
    random.shuffle(featuresSets)
    trainCount = int(len(featuresSets)*0.7)
    trainDataset = featuresSets[:trainCount]
    testDataset = featuresSets[trainCount:]
    
    classifier = NaiveBayesClassifier.train(trainDataset)
    
    print(f'Training accuracy: {accuracy(classifier, testDataset)}')

    trueLabels = [label for _, label in testDataset]
    predictedLabels = [classifier.classify(features) for features, _ in testDataset]
    
    print("Classification Report:")
    print(classification_report(trueLabels, predictedLabels))
    
    file = open("model2.pickle", "wb")
    pickle.dump(classifier, file)
    file.close()
    
    return classifier

In [17]:
try:
    print("Load model..")
    file = open("model2.pickle", "rb")
    classifier = pickle.load(file)
    file.close()
except:
    print("No model..")
    print("Training model..")
    classifier = trainingModel()

Load model..


In [10]:
tweets = "" 
category = ""  

def printMenu():
    if tweets != "":
        print(f'Your Tweet: {tweets}')
        print(f'Category: {category}')
    else:
        print("Your Tweet: - ")
        print("Category: - ")
        
def menu1():
    global tweets
    global category
    
    while True:
        inputTweet = input("Input your tweet: ")
        length = len(inputTweet.split(' '))
        if length < 5:
            print("Review length must be 5 words or more")
        else:
            tweets = inputTweet
            category = classifier.classify(FreqDist(word_tokenize(tweets)))
            break

In [11]:
while True:
    print("Suicidal Tweet Detection")
    printMenu()
    print("==========================")
    print("Do you want to detect whether a text is suicidal or not?")
    print("Y/N")
    choice = input("Input your choice: ")
    if choice == 'Y':
        print("Input your tweet to detect!")
        menu1()
    elif choice == 'N':
        break
    else:
        print("Invalid Choice!")

Suicidal Tweet Detection
Your Tweet: - 
Category: - 
Do you want to detect whether a text is suicidal or not?
Y/N
Invalid Choice!
Suicidal Tweet Detection
Your Tweet: - 
Category: - 
Do you want to detect whether a text is suicidal or not?
Y/N


In [12]:
# List kalimat test

# Not Suicide post
# =======================================================================
# I had such a great day today! Met up with old friends.
# Just finished watching a new movie, and it was amazing!
# I'm really excited for the weekend trip with my family.
# Feeling a bit tired, but I’m looking forward to a relaxing evening.
# I’m learning a new language, and it’s a lot of fun so far.


# Potential Suicide post
# =======================================================================
# I feel like there's no way out. Everything seems so hopeless.
# I can't do this anymore. I just want it all to end.
# No one cares about me. I'm just a burden to everyone.
# I’ve been thinking about ending it all, but I don’t know how.
# I just want the pain to stop. I don't want to be here anymore.

In [19]:
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def preprocess_text(text):
    words = word_tokenize(text)
    words = removeStopwords(words)
    words = removePunctuation(words)
    words = removeNumber(words)
    words = stemmingWord(words)
    words = lemmatizingWord(words)
    return ' '.join(words)

def trainingModel2(tweetList, labelList):
    # Preprocess the tweets
    preprocessed_tweets = [preprocess_text(tweet) if isinstance(tweet, str) else '' for tweet in tweetList]

    # Convert text to feature vectors using CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(preprocessed_tweets)
    y = labelList

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initialize classifiers
    classifiers = {
        "Naive Bayes": NaiveBayesClassifier,
        "SVM": SVC(kernel='linear', random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
    }

    # Store results
    results = {}

    # Train and evaluate Naive Bayes separately (since it requires a different format)
    if "Naive Bayes" in classifiers:
        labeled_list = list(zip(preprocessed_tweets, y))
        features_sets = []
        for sentence, label in labeled_list:
            features = {}
            check_list = preprocess_text(sentence).split()
            for word in vectorizer.get_feature_names_out():
                features[word] = (word in check_list)
            features_sets.append((features, label))

        random.shuffle(features_sets)
        train_count = int(len(features_sets) * 0.7)
        train_dataset = features_sets[:train_count]
        test_dataset = features_sets[train_count:]

        naive_bayes_classifier = NaiveBayesClassifier.train(train_dataset)
        nb_accuracy = accuracy(naive_bayes_classifier, test_dataset)
        true_labels = [label for _, label in test_dataset]
        predicted_labels = [naive_bayes_classifier.classify(features) for features, _ in test_dataset]

        results["Naive Bayes"] = {
            "Accuracy": nb_accuracy,
            "Report": classification_report(true_labels, predicted_labels, zero_division=0)
        }

    # Train and evaluate other classifiers
    for name, clf in classifiers.items():
        if name == "Naive Bayes":
            continue

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, zero_division=0)

        results[name] = {
            "Accuracy": acc,
            "Report": report
        }

    # Save the best model
    best_model = max(results, key=lambda k: results[k]["Accuracy"])
    with open("best_model_rf.pickle", "wb") as file:
        pickle.dump(classifiers[best_model], file)

    # Print results
    for name, metrics in results.items():
        print(f"Classifier: {name}")
        print(f"Accuracy: {metrics['Accuracy']}")
        print("Classification Report:")
        print(metrics["Report"])
        print("-" * 50)

    return results

trainingModel2(tweetList, labelList)

Classifier: Naive Bayes
Accuracy: 0.9107142857142857
Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.88      0.95      0.91       194
Potential Suicide post        0.95      0.87      0.91       198

               accuracy                           0.91       392
              macro avg       0.91      0.91      0.91       392
           weighted avg       0.91      0.91      0.91       392

--------------------------------------------------
Classifier: SVM
Accuracy: 0.923469387755102
Classification Report:
                         precision    recall  f1-score   support

       Not Suicide post       0.91      0.95      0.93       208
Potential Suicide post        0.94      0.89      0.92       184

               accuracy                           0.92       392
              macro avg       0.93      0.92      0.92       392
           weighted avg       0.92      0.92      0.92       392

--------------------

{'Naive Bayes': {'Accuracy': 0.9107142857142857,
  'Report': '                         precision    recall  f1-score   support\n\n       Not Suicide post       0.88      0.95      0.91       194\nPotential Suicide post        0.95      0.87      0.91       198\n\n               accuracy                           0.91       392\n              macro avg       0.91      0.91      0.91       392\n           weighted avg       0.91      0.91      0.91       392\n'},
 'SVM': {'Accuracy': 0.923469387755102,
  'Report': '                         precision    recall  f1-score   support\n\n       Not Suicide post       0.91      0.95      0.93       208\nPotential Suicide post        0.94      0.89      0.92       184\n\n               accuracy                           0.92       392\n              macro avg       0.93      0.92      0.92       392\n           weighted avg       0.92      0.92      0.92       392\n'},
 'Random Forest': {'Accuracy': 0.9336734693877551,
  'Report': '             