### Open .csv file

In [15]:
import pandas as pd

In [16]:
df = pd.read_csv('./dataset/data-uji.csv', sep=';', encoding='utf-8')
df

Unnamed: 0,Comment
0,Buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya. sangat membantu
2,Jelas
3,TOLONG DI PIN
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...
...,...
1244,Dr indra bsa ngoding ga ?
1245,t
1246,2015
1247,Pohara


In [17]:
# 'ind' or 'en'
LANGUAGE='ind'

### Cleaning

In [32]:
import re

In [51]:
def cleanComment(comment):
    try:
        # misal ada "kawan-kawan"
        comment = re.sub(r'(?<=\w)-(?=\w)', 'STRIP', comment)
        comment = re.sub(r'@\w+|[^\w\s-]|(?<!\w)-(?!\w)|\d+|(?<=\n)[IVXLCDM]+', '', comment)
        comment = comment.replace('STRIP', '-')
        comment = comment.replace('\n', ' ')
        return comment.strip()
    except Exception as e: 
        print(f"Err: Failed to clean comments due to {str(e)}")
        return comment

In [52]:
df['cleanComment'] = df['Comment'].apply(lambda x: cleanComment(x))
df[['Comment', 'cleanComment']].head(20)

Unnamed: 0,Comment,cleanComment
0,Buat kawan-kawan yang masih agak bingung atau ...,Buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya. sangat membantu,terimakasih penjelasannya sangat membantu
2,Jelas,Jelas
3,TOLONG DI PIN,TOLONG DI PIN
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...,Ralat bang poin ke chat gpt itu fine tuning s...
5,Thanks untuk additional infonya mas,Thanks untuk additional infonya mas
6,"@itsvan5791 thanks bro koreksinya, di salah ...",thanks bro koreksinya di salah satu literasi y...
7,@IndrawanNugroho My Pleasure Pak Indrawan :),My Pleasure Pak Indrawan
8,Saya sangat suka komentar komentar di sini. ti...,Saya sangat suka komentar komentar di sini tid...
9,@fadhlurrohmanfaqih480 kalau pakai fine tuni...,kalau pakai fine tuning dataset udah pasti sup...


### Case folding

In [53]:
def caseFolding(comment):
    try:
        cleanComment = comment.lower()
        return cleanComment
    except Exception as e: 
        print(f"Err: Failed to case folding due to {str(e)}")
        return comment

In [54]:
df['resultCaseFolding'] = df['cleanComment'].apply(caseFolding)
df[['cleanComment', 'resultCaseFolding']].head(10)

Unnamed: 0,cleanComment,resultCaseFolding
0,Buat kawan-kawan yang masih agak bingung atau ...,buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu
2,Jelas,jelas
3,TOLONG DI PIN,tolong di pin
4,Ralat bang poin ke chat gpt itu fine tuning s...,ralat bang poin ke chat gpt itu fine tuning s...
5,Thanks untuk additional infonya mas,thanks untuk additional infonya mas
6,thanks bro koreksinya di salah satu literasi y...,thanks bro koreksinya di salah satu literasi y...
7,My Pleasure Pak Indrawan,my pleasure pak indrawan
8,Saya sangat suka komentar komentar di sini tid...,saya sangat suka komentar komentar di sini tid...
9,kalau pakai fine tuning dataset udah pasti sup...,kalau pakai fine tuning dataset udah pasti sup...


### Slang Word

In [55]:
import os

In [56]:
def replaceSlangWords(comment, language):
    try:
        slang_dict = {}
        filename = f'./slang/slang-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                for line in f:
                    slang, formal = line.strip().split(',')
                    slang_dict[slang] = formal

        words = comment.split()
        for i in range(len(words)):
            if words[i] in slang_dict:
                words[i] = slang_dict[words[i]]
        return ' '.join(words)
    except Exception as e:
        print(f"Err: Failed to replace slang words due to {str(e)}")
        return comment 

In [57]:
df['resultReplaceSlang'] = df['resultCaseFolding'].apply(lambda x: replaceSlangWords(x, LANGUAGE))
df[['resultCaseFolding', 'resultReplaceSlang']].head(20)

Unnamed: 0,resultCaseFolding,resultReplaceSlang
0,buat kawan-kawan yang masih agak bingung atau ...,buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu
2,jelas,jelas
3,tolong di pin,tolong di pin
4,ralat bang poin ke chat gpt itu fine tuning s...,ralat bang poin ke chat gpt itu fine tuning su...
5,thanks untuk additional infonya mas,thanks untuk additional infonya mas
6,thanks bro koreksinya di salah satu literasi y...,thanks saudara koreksinya di salah satu litera...
7,my pleasure pak indrawan,my pleasure pak indrawan
8,saya sangat suka komentar komentar di sini tid...,saya sangat suka komentar komentar di sini tid...
9,kalau pakai fine tuning dataset udah pasti sup...,kalau pakai fine tuning dataset udah pasti sup...


### Tokenizing

In [59]:
def tokenize(comment):
    try:
        words = comment.split(' ')
        words = list(filter(None, words)) 

        return words
    except Exception as e:
        print("Err: Failed to tokenize due to", str(e))
        return comment

In [60]:
df['resultTokenize'] = df['resultReplaceSlang'].apply(tokenize)
df[['resultReplaceSlang', 'resultTokenize']].head(20)

Unnamed: 0,resultReplaceSlang,resultTokenize
0,buat kawan-kawan yang masih agak bingung atau ...,"[buat, kawan-kawan, yang, masih, agak, bingung..."
1,terimakasih penjelasannya sangat membantu,"[terimakasih, penjelasannya, sangat, membantu]"
2,jelas,[jelas]
3,tolong di pin,"[tolong, di, pin]"
4,ralat bang poin ke chat gpt itu fine tuning su...,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ..."
5,thanks untuk additional infonya mas,"[thanks, untuk, additional, infonya, mas]"
6,thanks saudara koreksinya di salah satu litera...,"[thanks, saudara, koreksinya, di, salah, satu,..."
7,my pleasure pak indrawan,"[my, pleasure, pak, indrawan]"
8,saya sangat suka komentar komentar di sini tid...,"[saya, sangat, suka, komentar, komentar, di, s..."
9,kalau pakai fine tuning dataset udah pasti sup...,"[kalau, pakai, fine, tuning, dataset, udah, pa..."


### Stop Word Removal

In [62]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
def stopwordRemoval(comments, language):
    language_mapping = {
        'en': 'english', 
        'ind': 'indonesian'
    }

    nltk_language = language_mapping.get(language, 'english') 
    stopWordRemoved = []
    
    try:
        stopList = stopwords.words(nltk_language)
        filename = f'./stopword/stopword-{language}.txt' 
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                stopList.extend(f.read().split('\n')) 
        for word in comments:
            if word not in stopList:
                stopWordRemoved.append(word)

        return stopWordRemoved  
    except Exception as e:  
        print(f"Err: Failed to remove stopwords due to {str(e)}")
        return comments 

In [67]:
df['resultStopword'] = df['resultTokenize'].apply(lambda x: stopwordRemoval(x, LANGUAGE))
df[['resultTokenize', 'resultStopword']].head(20)

Unnamed: 0,resultTokenize,resultStopword
0,"[buat, kawan-kawan, yang, masih, agak, bingung...","[kawan-kawan, bingung, asingdengan, istilah, d..."
1,"[terimakasih, penjelasannya, sangat, membantu]","[terimakasih, penjelasannya, membantu]"
2,[jelas],[]
3,"[tolong, di, pin]","[tolong, pin]"
4,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, untuk, additional, infonya, mas]","[thanks, additional, infonya, mas]"
6,"[thanks, saudara, koreksinya, di, salah, satu,...","[thanks, saudara, koreksinya, salah, literasi,..."
7,"[my, pleasure, pak, indrawan]","[my, pleasure, indrawan]"
8,"[saya, sangat, suka, komentar, komentar, di, s...","[suka, komentar, komentar, menggoblokan, komen..."
9,"[kalau, pakai, fine, tuning, dataset, udah, pa...","[pakai, fine, tuning, dataset, udah, supervise..."


### Hapus data kosong ([])

In [68]:
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)]
df['resultStopword']

0       [kawan-kawan, bingung, asingdengan, istilah, d...
1                  [terimakasih, penjelasannya, membantu]
3                                           [tolong, pin]
4       [ralat, bang, poin, chat, gpt, fine, tuning, s...
5                      [thanks, additional, infonya, mas]
                              ...                        
1241                                [kalah, mbah, google]
1243                                        [paham, blas]
1244                        [dr, indra, bsa, ngoding, ga]
1247                                             [pohara]
1248                                            [dahsyat]
Name: resultStopword, Length: 1239, dtype: object

### Stemming

In [69]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [71]:
def stemmed_wrapper(term, language):
    try:
        if language == 'ind':
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            return stemmer.stem(term)
        else: 
            stemmer = PorterStemmer()
            return stemmer.stem(term)
    except Exception as e:
        print(f"Err: Failed to stem term '{term}' due to {str(e)}")
        return term 

def stemming(document, language):
    return [stemmed_wrapper(term, language) for term in document]

In [72]:
df['resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))


Unnamed: 0,resultStopword,resultStemming
0,"[kawan-kawan, bingung, asingdengan, istilah, d...","[kawan, bingung, asingdengan, istilah, pakai, ..."
1,"[terimakasih, penjelasannya, membantu]","[terimakasih, jelas, bantu]"
3,"[tolong, pin]","[tolong, pin]"
4,"[ralat, bang, poin, chat, gpt, fine, tuning, s...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, additional, infonya, mas]","[thanks, additional, info, mas]"


In [79]:
df[['resultStopword', 'resultStemming']].head(200)

Unnamed: 0,resultStopword,resultStemming
0,"[kawan-kawan, bingung, asingdengan, istilah, d...","[kawan, bingung, asingdengan, istilah, pakai, ..."
1,"[terimakasih, penjelasannya, membantu]","[terimakasih, jelas, bantu]"
3,"[tolong, pin]","[tolong, pin]"
4,"[ralat, bang, poin, chat, gpt, fine, tuning, s...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, additional, infonya, mas]","[thanks, additional, info, mas]"
...,...,...
197,"[dlm, konteks, transformasi, ekonomi, pengelol...","[dlm, konteks, transformasi, ekonomi, kelola, ..."
198,"[menuntut, anak, juara, kelas, disekolah, mend...","[tuntut, anak, juara, kelas, seko, dukung, kre..."
199,"[problem, solving, komunikasi, hubungan, inter...","[problem, solving, komunikasi, hubung, interpe..."
200,"[ai, bikin, channel, youtube, kayak, ai, diplo...","[ai, bikin, channel, youtube, kayak, ai, diplo..."


### Export clean data

In [74]:
df.to_csv(
    './output/data-clean-ind-ril.csv', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)

In [76]:
df.to_excel(
    './output/data-clean-ind-ril.xlsx', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)