In [1]:
import pandas as pd
import re
from langdetect import detect
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from hunspell import Hunspell

In [2]:
# Read the "10k" worksheet of the dataset workbook into a DataFrame.
df = pd.read_excel(
    io="F:/Github/text_suicide_detection/data/final_dataset.xlsx",
    sheet_name="10k",
)

In [3]:
# Display the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,index,tweet,label
0,1,suasana batin ngerasa tertekan banget ðŸ˜”,0
1,2,@camareily__ KAMU MAU NGAPAIN WOI PLS JANGAN B...,0
2,3,Mau bunuh diri masih juga ngerepotin https://t...,0


# Preprocessing

In [4]:
def case_folding(df, column_name):
    """Lower-case every string in one column of ``df``.

    The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to lower-case.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    lowered = df[column_name].str.lower()
    df[column_name] = lowered
    return df

def remove_punctuation_and_sc(df, column_name):
    """Keep only ASCII letters, digits and whitespace in one text column.

    The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    # Any character outside a-z, A-Z, 0-9 and whitespace is deleted.
    disallowed = r'[^a-zA-Z0-9\s]'
    column = df[column_name]
    df[column_name] = column.str.replace(disallowed, '', regex=True)
    return df

def remove_punctuation(df, column_name):
    """Delete characters that are neither word characters nor whitespace.

    The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    # \w keeps letters, digits and underscore; \s keeps whitespace.
    not_word_or_space = r'[^\w\s]'
    column = df[column_name]
    df[column_name] = column.str.replace(not_word_or_space, '', regex=True)
    return df

def remove_specialchar(df, column_name):
    """Delete every run of non-ASCII characters from one text column.

    The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    def _ascii_only(text):
        # Code points above 0x7F are removed; ASCII is left untouched.
        return re.sub(r'[^\x00-\x7F]+', '', text)

    df[column_name] = df[column_name].apply(_ascii_only)
    return df

def remove_redundant_whitespace(df, column_name):
    """Collapse each run of whitespace to one space and trim both ends.

    The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    # regex=True is required here: with regex=False pandas searches for the
    # literal text `\s+`, so no whitespace run would ever be collapsed.
    collapsed = df[column_name].str.replace(r'\s+', ' ', regex=True)
    df[column_name] = collapsed.str.strip()
    return df

def remove_mentions_hashtags(df, column_name):
    """Delete ``@mention`` and ``#hashtag`` tokens from one text column.

    The column is overwritten on the frame that was passed in. Surrounding
    whitespace is left as it was.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    # "@" or "#" followed by one or more word characters.
    tag_pattern = re.compile(r'@\w+|\#\w+')
    df[column_name] = df[column_name].apply(lambda tweet: tag_pattern.sub('', tweet))
    return df

def remove_hyperlink(df, column_name):
    """Delete URL-like tokens from one text column.

    A token is removed from the point where ``http`` appears up to the next
    whitespace character. The column is overwritten on the frame that was
    passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    def _strip_links(text):
        # "http" plus at least one following non-whitespace character.
        return re.sub(r'http\S+', '', text)

    df[column_name] = df[column_name].apply(_strip_links)
    return df

In [5]:
# Stop-word collection returned by Sastrawi's StopWordRemoverFactory;
# remove_stopwords tests every token against it.
stopwords_sastrawi = StopWordRemoverFactory().get_stop_words()

def remove_stopwords(df, column_name):
    """Drop every token found in ``stopwords_sastrawi`` from one text column.

    Each value is split on whitespace, tokens that appear in the module-level
    ``stopwords_sastrawi`` collection are discarded, and the survivors are
    re-joined with single spaces. Matching is exact and case-sensitive. The
    column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    # Build the lookup once per call so each membership test is a hash
    # probe, whatever container type get_stop_words() returned.
    stopword_lookup = set(stopwords_sastrawi)

    def _drop_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stopword_lookup)

    df[column_name] = df[column_name].apply(_drop_stopwords)
    return df

In [7]:
# Shared Hunspell instance used by stem_word_hunspell, built from the id_ID
# files under hunspell-id-main.
# NOTE(review): the same path is passed as both positional arguments — confirm
# this matches the constructor signature of the installed hunspell binding.
hunspell = Hunspell('F:/Github/text_suicide_detection/hunspell-id-main/id_ID', 'F:/Github/text_suicide_detection/hunspell-id-main/id_ID')

def stem_word_hunspell(word):
    """Return the first stem the module-level ``hunspell`` gives for ``word``.

    Args:
        word: A single token to stem.

    Returns:
        The first entry of ``hunspell.stem(word)``. ``word`` itself is
        returned when the result is empty or when the call raises
        ``UnicodeEncodeError``.
    """
    try:
        candidates = hunspell.stem(word)
    except UnicodeEncodeError:
        # The token could not be encoded for the stemmer; keep it as is.
        return word
    if candidates:
        return candidates[0]
    return word

def stemming(df, column_name):
    """Replace every token in one text column with its Hunspell stem.

    Each value is split on whitespace, every token is passed through
    ``stem_word_hunspell``, and the results are re-joined with single
    spaces. The column is overwritten on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to stem.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    def _stem_text(text):
        stemmed_tokens = map(stem_word_hunspell, text.split())
        return ' '.join(stemmed_tokens)

    df[column_name] = df[column_name].apply(_stem_text)
    return df

In [8]:
def preprocessing(df, column_name):
    """Run the full text-cleaning pipeline on one column of ``df``.

    The steps run in a fixed order: mentions/hashtags, hyperlinks,
    punctuation, lower-casing, stop words, whitespace, stemming. Every step
    overwrites ``column_name`` on the frame that was passed in.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The cleaned column, i.e. ``df[column_name]`` after the last step.
    """
    # Order matters: tags and links are removed before punctuation stripping
    # would delete the "@", "#", ":" and "/" characters their patterns rely on.
    pipeline = (
        remove_mentions_hashtags,
        remove_hyperlink,
        remove_punctuation,
        case_folding,
        remove_stopwords,
        remove_redundant_whitespace,
        stemming,
    )
    for step in pipeline:
        df = step(df, column_name)

    return df[column_name]

In [9]:
# Clean the 'tweet' column. preprocessing() already overwrites the column on
# df step by step and returns it, so this assignment stores the same column back.
df['tweet'] = preprocessing(df, 'tweet')

In [10]:
# Write the preprocessed frame to a workbook in the current working
# directory, without the index column.
filename = 'preprocessed_10k.xlsx'
sheetname = '10k'
df.to_excel(
    excel_writer=filename,
    sheet_name=sheetname,
    index=False,
)