### Open .csv file

In [1]:
import pandas as pd

In [55]:
df = pd.read_csv('./dataset/data-uji.csv', sep=';', encoding='utf-8')
df

Unnamed: 0,Comment
0,Buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya. sangat membantu
2,Jelas
3,TOLONG DI PIN
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...
...,...
1244,Dr indra bsa ngoding ga ?
1245,t
1246,2015
1247,Pohara


In [None]:
# 'ind' or 'en'
LANGUAGE='ind'

### Cleaning

In [12]:
import re
import os

In [41]:
def cleanComment(comment, language):
    try:
        comment = re.sub(r'@\w+|[^\w\s]', '', comment)
        comment = re.sub(r'\d+', '', comment)
       
        slang_dict = {}
        filename = f'slang-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                for line in f:
                    slang, formal = line.strip().split(',')
                    slang_dict[slang] = formal

        words = comment.split()
        for i in range(len(words)):
            if words[i] in slang_dict:
                words[i] = slang_dict[words[i]]
        comment = ' '.join(words)

    except:
        comment = ''
        print("Err: Failed to clean comments")
    return comment.strip()


In [46]:
df['cleanComment'] = df['Comment'].apply(lambda x: cleanComment(x, LANGUAGE))
df.head()

Unnamed: 0,Comment,cleanComment,resultCaseFolding,resultTokenize
0,Buat kawan-kawan yang masih agak bingung atau ...,Buat kawankawan yang masih agak bingung atau a...,buat kawankawan yang masih agak bingung atau a...,"[buat, kawankawan, yang, masih, agak, bingung,..."
1,terimakasih penjelasannya. sangat membantu,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu,"[terimakasih, penjelasannya, sangat, membantu]"
2,Jelas,Jelas,jelas,[jelas]
3,TOLONG DI PIN,TOLONG DI PIN,tolong di pin,"[tolong, di, pin]"
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...,Ralat bang poin ke chat gpt itu fine tuning su...,ralat bang poin ke chat gpt itu fine tuning su...,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ..."


### Case folding

In [17]:
def caseFolding(cleanComment):
    try:
        cleanComment = cleanComment.lower()
    except:
        cleanComment = ''
        print("Err: Failed to case folding")
    return cleanComment

In [44]:
df['resultCaseFolding'] = df['cleanComment'].apply(caseFolding)
df[['cleanComment', 'resultCaseFolding']].head()

Unnamed: 0,cleanComment,resultCaseFolding
0,Buat kawankawan yang masih agak bingung atau a...,buat kawankawan yang masih agak bingung atau a...
1,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu
2,Jelas,jelas
3,TOLONG DI PIN,tolong di pin
4,Ralat bang poin ke chat gpt itu fine tuning su...,ralat bang poin ke chat gpt itu fine tuning su...


### Tokenizing

In [20]:
def tokenize(comments):
    try:
        words = comments.split(' ')
    except:
        print("Err: Failed to tokenizing")
        words = []
  
    indexToRemove = []
    index = -1
    for word in words:
        index += 1
        if word == '':
            indexToRemove.append(index)
  
    prevIndex = 0
    currentIndex = 0
    for removeIndex in indexToRemove:
        currentIndex = removeIndex - prevIndex
        del words[prevIndex]
        prevIndex += 1
  
    return words

In [45]:
df['resultTokenize'] = df['resultCaseFolding'].apply(tokenize)
df[['resultCaseFolding', 'resultTokenize']].head()

Unnamed: 0,resultCaseFolding,resultTokenize
0,buat kawankawan yang masih agak bingung atau a...,"[buat, kawankawan, yang, masih, agak, bingung,..."
1,terimakasih penjelasannya sangat membantu,"[terimakasih, penjelasannya, sangat, membantu]"
2,jelas,[jelas]
3,tolong di pin,"[tolong, di, pin]"
4,ralat bang poin ke chat gpt itu fine tuning su...,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ..."


In [25]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
def stopwordRemoval(comments, language):
    language_mapping = {
        'en': 'english', 
        'ind': 'indonesian'
    }

    nltk_language = language_mapping.get(language, 'english') 

    stopWordRemoved = []
    try:
        stopList = stopwords.words(nltk_language)
        filename = f'stopwords-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                stopList.extend(f.read().split('\n')) 
        for word in comments:
            if word not in stopList:
                stopWordRemoved.append(word)
    except:
        print("Err: Failed to remove stopwords")
        
    return stopWordRemoved

In [50]:
df['resultStopword'] = df['resultTokenize'].apply(lambda x: stopwordRemoval(x, LANGUAGE))
df[['resultTokenize', 'resultStopword']].head()

Unnamed: 0,resultTokenize,resultStopword
0,"[buat, kawankawan, yang, masih, agak, bingung,...","[kawankawan, bingung, asingdengan, istilah, di..."
1,"[terimakasih, penjelasannya, sangat, membantu]","[terimakasih, penjelasannya, membantu]"
2,[jelas],[]
3,"[tolong, di, pin]","[tolong, pin]"
4,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."


### Hapus data kosong ([])

In [52]:
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)]
df['resultStopword']

0       [kawankawan, bingung, asingdengan, istilah, di...
1                  [terimakasih, penjelasannya, membantu]
3                                           [tolong, pin]
4       [ralat, bang, poin, chat, gpt, fine, tuning, s...
5                      [thanks, additional, infonya, mas]
                              ...                        
1243                                  [ngga, paham, blas]
1244                        [dr, indra, bsa, ngoding, ga]
1245                                                  [t]
1247                                             [pohara]
1248                                            [dahsyat]
Name: resultStopword, Length: 1241, dtype: object

### Stemming

In [33]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [34]:
def stemmed_wrapper(term, language):
    if language == 'ind':
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        return stemmer.stem(term)
    else: 
        stemmer = PorterStemmer()
        return stemmer.stem(term)

def stemming(document, language):
    return [stemmed_wrapper(term, language) for term in document]


In [53]:
df['resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))
df[['resultStopword', 'resultStemming']].head()

Unnamed: 0,resultStopword,resultStemming
0,"[kawankawan, bingung, asingdengan, istilah, di...","[kawankawan, bingung, asingdengan, istilah, pa..."
1,"[terimakasih, penjelasannya, membantu]","[terimakasih, jelas, bantu]"
3,"[tolong, pin]","[tolong, pin]"
4,"[ralat, bang, poin, chat, gpt, fine, tuning, s...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, additional, infonya, mas]","[thanks, additional, info, mas]"


### Export clean data

In [54]:
df.to_csv(
    './output/data-clean-ind.csv', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)