In [8]:
import time
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from hunspell import Hunspell
import stanza

In [9]:
# Load data
path = "C:/Code/Github/text_suicide_detection/data/final_dataset.xlsx"
df = pd.read_excel(path, sheet_name="10k")

In [11]:
# Load the Stemming tools and Indonesian stopwords from Sastrawi
stemmer_sastrawi = StemmerFactory().create_stemmer()
stopwords_sastrawi = StopWordRemoverFactory().get_stop_words()

# Load the Stemming tools and Indonesian stopwords from Hunspell
h = Hunspell('C:/Code/Github/text_suicide_detection/hunspell-id-main/id_ID','C:/Code/Github/text_suicide_detection/hunspell-id-main/id_ID')
hs_stopwords = Hunspell('C:/Code/Github/text_suicide_detection/hunspell-id-main/id_ID')
stopwords_hunspell = set([s.decode('utf-8') for s in hs_stopwords.suggest('')])

# Load the Stemming tools and Indonesian stopwords from Stanza
st = stanza.Pipeline('id', download_method=None, processors='tokenize,mwt,pos,lemma')
nlp = stanza.Pipeline('id', download_method=None)
doc = nlp('')
stopwords_stanza = set([word.text for sent in doc.sentences for word in sent.words])

2024-01-22 23:27:29 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2024-01-22 23:27:29 INFO: Using device: cpu
2024-01-22 23:27:29 INFO: Loading: tokenize
2024-01-22 23:27:29 INFO: Loading: mwt
2024-01-22 23:27:29 INFO: Loading: pos
2024-01-22 23:27:29 INFO: Loading: lemma
2024-01-22 23:27:29 INFO: Done loading processors!
2024-01-22 23:27:29 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2024-01-22 23:27:29 INFO: Using device: cpu
2024-01-22 23:27:29 INFO: Loading: tokenize
2024-01-22 23:27:29 INFO: Loading: mwt
2024-01-22 23:27:29 INFO: Loading: pos
2024-01-22 23:27:29 INFO: Loading: lemma
2024-01-22 23:27:29 INFO: Loading: depparse
2024-01-22 23:27:30 I

# Stemming

In [12]:
def stem_sastrawi(text):
    return stemmer_sastrawi.stem(text)

def word_hunspell(word):
    try:
        stems = h.stem(word)
    except UnicodeEncodeError:
        stems = [word]
    
    if len(stems) == 0:
        output = word
    else:
        output = stems[0]
    return output
    
def stem_hunspell(text):
    hs_stem = [word_hunspell(word) for word in text.split()]
    output = ' '.join(hs_stem) 
    return output

def stem_stanza(text):
    doc = st(text)
    lemmas = [word.lemma if word.lemma is not None else word.text for sent in doc.sentences for word in sent.words]
    return ' '.join(lemmas)

In [13]:
text = 'bermain berkebun bersama dia saya sedang menunggu perapian padam hingga senja menjemput'

print(f'Hasil Stemming Sastrawi: {stem_sastrawi(text)}')
print(f'Hasil Stemming Hunspell: {stem_hunspell(text)}')
print(f'Hasil Stemming Stanza: {stem_stanza(text)}')

Hasil Stemming Sastrawi: main kebun sama dia saya sedang tunggu api padam hingga senja jemput
Hasil Stemming Hunspell: main kebun sama dia saya sedang tunggu api padam hingga senja jemput
Hasil Stemming Stanza: main kebun bersama dia saya sedang tunggu perapian padam hingga senja menjemput


In [14]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_sastrawi'] = df['tweet'].apply(stem_sastrawi)
print("Elapsed time:", time.time()-t)

Elapsed time: 1535.472715139389


In [15]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_hunspell'] = df['tweet'].apply(stem_hunspell)
print("Elapsed time:", time.time()-t)

Elapsed time: 0.3992314338684082


In [16]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_stanza'] = df['tweet'].apply(stem_stanza)
print("Elapsed time:", time.time()-t)

Elapsed time: 741.537252664566


In [17]:
filename = 'stemmed.xlsx'  
sheetname = 'stemmed_result' 
df.to_excel(filename, sheet_name=sheetname, index=False)

# Stopwords Removal

In [18]:
def remove_stopwords_sastrawi(text):
    words = text.split()
    words_filtered = [word for word in words if not word in stopwords_sastrawi]
    return " ".join(words_filtered)

def remove_stopwords_hunspell(text):
    words = text.split()
    words_filtered = [word for word in words if not word in stopwords_hunspell]
    return " ".join(words_filtered)

def remove_stopwords_stanza(text):
    doc = nlp(text)
    words_filtered = []
    for sent in doc.sentences:
        for word in sent.words:
            if not word.text in stopwords_stanza:
                words_filtered.append(word.text)
    return " ".join(words_filtered)

In [19]:
text = "aku sama dia mau yang pergi ke pohon mangga sampai pulang pergi di ragunan"

print(f'Hasil Stopwords Removal Sastrawi: {remove_stopwords_sastrawi(text)}')
print(f'Hasil Stopwords Removal Hunspell: {remove_stopwords_hunspell(text)}')
print(f'Hasil Stopwords Removal Stanza: {remove_stopwords_stanza(text)}')

Hasil Stopwords Removal Sastrawi: aku sama mau pergi pohon mangga pulang pergi ragunan
Hasil Stopwords Removal Hunspell: aku sama dia mau yang pergi ke pohon mangga sampai pulang pergi di ragunan
Hasil Stopwords Removal Stanza: aku sama dia mau yang pergi ke pohon mangga sampai pulang pergi di ragunan


In [20]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_sastrawi'] = df['tweet'].apply(remove_stopwords_sastrawi)
print("Elapsed time:", time.time()-t)

Elapsed time: 0.31885337829589844


In [21]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_hunspell'] = df['tweet'].apply(remove_stopwords_hunspell)
print("Elapsed time:", time.time()-t)

Elapsed time: 0.03647923469543457


In [22]:
t = time.time()
# Apply the stopword removal functions to the DataFrame
df['text_stanza'] = df['tweet'].apply(remove_stopwords_stanza)
print("Elapsed time:", time.time()-t)

Elapsed time: 1373.5758798122406


In [23]:
filename = 'stopwords.xlsx'  
sheetname = 'stopwords_removal_result'      
df.to_excel(filename, sheet_name=sheetname, index=False)