### Open .csv file

In [50]:
# !pip install pandas
import pandas as pd

In [53]:
df = pd.read_csv('../dataset/dataset_chatgpt_scrap.csv', sep=';', encoding='utf-8')
df

Unnamed: 0,Comment
0,Buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya. sangat membantu
2,Jelas
3,TOLONG DI PIN
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...
...,...
1244,Dr indra bsa ngoding ga ?
1245,t
1246,2015
1247,Pohara


In [54]:
# 'ind' or 'en'
LANGUAGE='ind'

In [55]:
df.dtypes

Comment    object
dtype: object

### Cleaning

In [56]:
import re

In [57]:
def cleanComment(comment):
    try:
        # misal ada "kawan-kawan"
        comment = re.sub(r'(?<=\w)-(?=\w)', 'STRIP', str(comment))
        comment = re.sub(r'http\S+|www\S+', '', str(comment))
        comment = re.sub(r'@\w+|[^\w\s-]|(?<!\w)-(?!\w)|\d+|(?<=\n)[IVXLCDM]+', ' ', str(comment))
        comment = re.sub(r'\s+', ' ', str(comment))
        comment = comment.replace('STRIP', '-')
        comment = comment.replace('\n', ' ')
        return comment.strip()
    except Exception as e: 
        print(f"Err: Failed to clean comments due to {str(e)}")
        return comment

In [58]:
df['cleanComment'] = df['Comment'].apply(lambda x: cleanComment(x))
df[['Comment', 'cleanComment']]

Unnamed: 0,Comment,cleanComment
0,Buat kawan-kawan yang masih agak bingung atau ...,Buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya. sangat membantu,terimakasih penjelasannya sangat membantu
2,Jelas,Jelas
3,TOLONG DI PIN,TOLONG DI PIN
4,Ralat bang poin ke 4 chat gpt itu fine tuning ...,Ralat bang poin ke chat gpt itu fine tuning su...
...,...,...
1244,Dr indra bsa ngoding ga ?,Dr indra bsa ngoding ga
1245,t,t
1246,2015,
1247,Pohara,Pohara


### Case folding

In [59]:
def caseFolding(comment):
    try:
        str(comment)
        cleanComment = comment.lower()
        return cleanComment     
    except Exception as e: 
        print(f"Err: Failed to case folding due to {str(e)}")
        return comment

In [60]:
df['resultCaseFolding'] = df['cleanComment'].apply(caseFolding)
df[['cleanComment', 'resultCaseFolding']]

Unnamed: 0,cleanComment,resultCaseFolding
0,Buat kawan-kawan yang masih agak bingung atau ...,buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu
2,Jelas,jelas
3,TOLONG DI PIN,tolong di pin
4,Ralat bang poin ke chat gpt itu fine tuning su...,ralat bang poin ke chat gpt itu fine tuning su...
...,...,...
1244,Dr indra bsa ngoding ga,dr indra bsa ngoding ga
1245,t,t
1246,,
1247,Pohara,pohara


### Slang Word

In [61]:
import os

In [62]:
def replaceSlangWords(comment, language):
    try:
        slang_dict = {}
        filename = f'../slang/slang-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                for line in f:
                    slang, formal = line.strip().split(',')
                    slang_dict[slang] = formal

        words = comment.split()
        for i in range(len(words)):
            if words[i] in slang_dict:
                words[i] = slang_dict[words[i]]
        return ' '.join(words)
    except Exception as e:
        print(f"Err: Failed to replace slang words due to {str(e)}")
        return comment 

In [22]:
# default remove None
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None) 

In [63]:
df['resultReplaceSlang'] = df['resultCaseFolding'].apply(lambda x: replaceSlangWords(x, LANGUAGE))
df[['resultCaseFolding', 'resultReplaceSlang']]

Unnamed: 0,resultCaseFolding,resultReplaceSlang
0,buat kawan-kawan yang masih agak bingung atau ...,buat kawan-kawan yang masih agak bingung atau ...
1,terimakasih penjelasannya sangat membantu,terimakasih penjelasannya sangat membantu
2,jelas,jelas
3,tolong di pin,tolong di pin
4,ralat bang poin ke chat gpt itu fine tuning su...,ralat bang poin ke chat gpt itu fine tuning su...
...,...,...
1244,dr indra bsa ngoding ga,dari indra bisa ngoding tidak
1245,t,t
1246,,
1247,pohara,terlalu


### Tokenizing

In [64]:
def tokenize(comment):
    try:
        words = comment.split(' ')
        words = list(filter(None, words)) 

        return words
    except Exception as e:
        print("Err: Failed to tokenize due to", str(e))
        return comment

In [65]:
df['resultTokenize'] = df['resultReplaceSlang'].apply(tokenize)
df[['resultReplaceSlang', 'resultTokenize']]

Unnamed: 0,resultReplaceSlang,resultTokenize
0,buat kawan-kawan yang masih agak bingung atau ...,"[buat, kawan-kawan, yang, masih, agak, bingung..."
1,terimakasih penjelasannya sangat membantu,"[terimakasih, penjelasannya, sangat, membantu]"
2,jelas,[jelas]
3,tolong di pin,"[tolong, di, pin]"
4,ralat bang poin ke chat gpt itu fine tuning su...,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ..."
...,...,...
1244,dari indra bisa ngoding tidak,"[dari, indra, bisa, ngoding, tidak]"
1245,t,[t]
1246,,[]
1247,terlalu,[terlalu]


### Stop Word Removal

In [66]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
def stopwordRemoval(comments, language):
    language_mapping = {
        'en': 'english', 
        'ind': 'indonesian'
    }

    nltk_language = language_mapping.get(language, 'indonesian') 
    stopWordRemoved = []
    
    try:
        stopList = stopwords.words(nltk_language)
        filename = f'../stopword/stopword-{language}.txt' 
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                stopList.extend(f.read().split('\n')) 
        for word in comments:
            if word not in stopList:
                stopWordRemoved.append(word)

        return stopWordRemoved  
    except Exception as e:  
        print(f"Err: Failed to remove stopwords due to {str(e)}")
        return comments 

In [68]:
df['resultStopword'] = df['resultTokenize'].apply(lambda x: stopwordRemoval(x, LANGUAGE))
df[['resultTokenize', 'resultStopword']]

Unnamed: 0,resultTokenize,resultStopword
0,"[buat, kawan-kawan, yang, masih, agak, bingung...","[kawan-kawan, bingung, asing, istilah, dipakai..."
1,"[terimakasih, penjelasannya, sangat, membantu]","[terimakasih, penjelasannya, membantu]"
2,[jelas],[]
3,"[tolong, di, pin]","[tolong, pin]"
4,"[ralat, bang, poin, ke, chat, gpt, itu, fine, ...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
...,...,...
1244,"[dari, indra, bisa, ngoding, tidak]","[indra, ngoding]"
1245,[t],[]
1246,[],[]
1247,[terlalu],[]


### Hapus data kosong ([])

In [69]:
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)]
df[['resultStopword']]

Unnamed: 0,resultStopword
0,"[kawan-kawan, bingung, asing, istilah, dipakai..."
1,"[terimakasih, penjelasannya, membantu]"
3,"[tolong, pin]"
4,"[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, additional, infonya, mas]"
...,...
1240,"[bang, mukanya, aja, bang, ah, emang, bawaan]"
1241,"[kalah, mbah, google]"
1243,[paham]
1244,"[indra, ngoding]"


### Stemming

In [70]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [71]:
def stemmed_wrapper(term, language):
    try:
        if language == 'ind':
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            return stemmer.stem(term)
        else: 
            stemmer = PorterStemmer()
            return stemmer.stem(term)
    except Exception as e:
        print(f"Err: Failed to stem term '{term}' due to {str(e)}")
        return term 

def stemming(document, language):
    return [stemmed_wrapper(term, language) for term in document]

In [72]:
df.loc[:, 'resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))


In [73]:
df[['resultStopword', 'resultStemming']]
# df

Unnamed: 0,resultStopword,resultStemming
0,"[kawan-kawan, bingung, asing, istilah, dipakai...","[kawan, bingung, asing, istilah, pakai, video,..."
1,"[terimakasih, penjelasannya, membantu]","[terimakasih, jelas, bantu]"
3,"[tolong, pin]","[tolong, pin]"
4,"[ralat, bang, poin, chat, gpt, fine, tuning, s...","[ralat, bang, poin, chat, gpt, fine, tuning, s..."
5,"[thanks, additional, infonya, mas]","[thanks, additional, info, mas]"
...,...,...
1240,"[bang, mukanya, aja, bang, ah, emang, bawaan]","[bang, muka, aja, bang, ah, emang, bawa]"
1241,"[kalah, mbah, google]","[kalah, mbah, google]"
1243,[paham],[paham]
1244,"[indra, ngoding]","[indra, ngoding]"


### Export clean data

In [74]:
df.to_csv(
    '../output/cleaning/dataset_chatgpt_scrap_clean.csv',
    columns=df.columns,
    index=False
)

In [130]:
df.to_csv(
    './output/CLEAN-dataset_tweet_sentiment_opini_film.csv',
    columns=df.columns,
    index=False
)

In [38]:
df.to_csv(
    '../output/CLEAN-dataset_mobil_listrik.csv',
    columns=df.columns,
    index=False
)

In [68]:
df.to_csv(
    './output/data-clean-ind-new.csv', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)

In [67]:
df.to_excel(
    './output/data-clean-ind-new.xlsx', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)