### Open .csv file

In [13]:
import pandas as pd

In [14]:
df = pd.read_csv('../dataset/dataset_mobil_listrik.csv', encoding='utf-8')
df

Unnamed: 0,id_komentar,nama_akun,tanggal,text_cleaning,sentimen
0,Ugzbll5eyrIy3-gdUUJ4AaABAg,Sqn Ldr,2023-08-06 12:54:49+00:00,saran sih bikin harga ionic sama kayak brio ...,positif
1,UgzEDUiV3OTrV943p8p4AaABAg,lushen ace,2023-08-04 12:16:23+00:00,problem subsidi kualitas diturunin harga dinai...,negatif
2,UgwqJqu6JMF4EH2CsVV4AaABAg,Fatih Al-Ayyubi,2023-08-04 10:17:57+00:00,baik kualitas kembang dulu baik kualitas motor...,positif
3,UgyYicCMR1rKwuOj2Y14AaABAg,yp office,2023-08-04 08:29:54+00:00,model jelek kwalitas buruk harga mahal croot,negatif
4,UgxKAcLuAwZOQK6es-x4AaABAg,Lembur Kuring,2023-08-04 07:55:37+00:00,syarat ngaco woy anak muda blom punya ruma...,negatif
...,...,...,...,...,...
1512,UgxJPeJSdLI9a7L_adx4AaABAg,Fish aquspace,2022-09-19 12:30:44+00:00,apa kabar padahal negri luar biasa dapat negara,negatif
1513,UgyHzUNyXtjxftnySNF4AaABAg,Elias Panai,2022-09-19 12:27:18+00:00,antar anak sekolah antar bantu pasar klw jau...,negatif
1514,UgxbIAGGO6Jt50q6wT94AaABAg,Mikhanalont Horassss,2022-09-19 12:23:06+00:00,esemka bangga solo,positif
1515,UgyRNiIV6k2t21V4iMN4AaABAg,ANI ANI SOFIE,2022-09-19 11:48:39+00:00,cerdas orang hny dn pasar itu jln x blom d...,netral


In [15]:
# 'ind' or 'en'
LANGUAGE='ind'

In [16]:
df['text_cleaning'] = df['text_cleaning'].astype(str)

### Cleaning

In [17]:
import re

In [18]:
def cleanComment(comment):
    try:
        # misal ada "kawan-kawan"
        comment = re.sub(r'(?<=\w)-(?=\w)', 'STRIP', comment)
        comment = re.sub(r'http\S+|www\S+', '', comment)
        comment = re.sub(r'@\w+|[^\w\s-]|(?<!\w)-(?!\w)|\d+|(?<=\n)[IVXLCDM]+', ' ', comment)
        comment = re.sub(r'\s+', ' ', comment)
        comment = comment.replace('STRIP', '-')
        comment = comment.replace('\n', ' ')
        return comment.strip()
    except Exception as e: 
        print(f"Err: Failed to clean comments due to {str(e)}")
        return comment

In [19]:
df['cleanComment'] = df['text_cleaning'].apply(lambda x: cleanComment(x))
df[['text_cleaning', 'cleanComment']].head(20)

Unnamed: 0,text_cleaning,cleanComment
0,saran sih bikin harga ionic sama kayak brio ...,saran sih bikin harga ionic sama kayak brio in...
1,problem subsidi kualitas diturunin harga dinai...,problem subsidi kualitas diturunin harga dinai...
2,baik kualitas kembang dulu baik kualitas motor...,baik kualitas kembang dulu baik kualitas motor...
3,model jelek kwalitas buruk harga mahal croot,model jelek kwalitas buruk harga mahal croot
4,syarat ngaco woy anak muda blom punya ruma...,syarat ngaco woy anak muda blom punya rumah bl...
5,harga motor mahal masa harga mirip motor beat ...,harga motor mahal masa harga mirip motor beat ...
6,mol keren yah berita plus padahal mol loka...,mol keren yah berita plus padahal mol lokal me...
7,proses kenal produk baru butuh waktu ganti ken...,proses kenal produk baru butuh waktu ganti ken...
8,subsidi tepat sasar,subsidi tepat sasar
9,adil rata terima subsidi jangan jangan pajak p...,adil rata terima subsidi jangan jangan pajak p...


### Case folding

In [21]:
def caseFolding(comment):
    try:
        cleanComment = comment.lower()
        return cleanComment     
    except Exception as e: 
        print(f"Err: Failed to case folding due to {str(e)}")
        return comment

In [22]:
df['resultCaseFolding'] = df['cleanComment'].apply(caseFolding)
df[['cleanComment', 'resultCaseFolding']].head(10)

Unnamed: 0,cleanComment,resultCaseFolding
0,saran sih bikin harga ionic sama kayak brio in...,saran sih bikin harga ionic sama kayak brio in...
1,problem subsidi kualitas diturunin harga dinai...,problem subsidi kualitas diturunin harga dinai...
2,baik kualitas kembang dulu baik kualitas motor...,baik kualitas kembang dulu baik kualitas motor...
3,model jelek kwalitas buruk harga mahal croot,model jelek kwalitas buruk harga mahal croot
4,syarat ngaco woy anak muda blom punya rumah bl...,syarat ngaco woy anak muda blom punya rumah bl...
5,harga motor mahal masa harga mirip motor beat ...,harga motor mahal masa harga mirip motor beat ...
6,mol keren yah berita plus padahal mol lokal me...,mol keren yah berita plus padahal mol lokal me...
7,proses kenal produk baru butuh waktu ganti ken...,proses kenal produk baru butuh waktu ganti ken...
8,subsidi tepat sasar,subsidi tepat sasar
9,adil rata terima subsidi jangan jangan pajak p...,adil rata terima subsidi jangan jangan pajak p...


### Slang Word

In [23]:
import os

In [24]:
def replaceSlangWords(comment, language):
    try:
        slang_dict = {}
        filename = f'./slang/slang-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                for line in f:
                    slang, formal = line.strip().split(',')
                    slang_dict[slang] = formal

        words = comment.split()
        for i in range(len(words)):
            if words[i] in slang_dict:
                words[i] = slang_dict[words[i]]
        return ' '.join(words)
    except Exception as e:
        print(f"Err: Failed to replace slang words due to {str(e)}")
        return comment 

In [26]:
df['resultReplaceSlang'] = df['resultCaseFolding'].apply(lambda x: replaceSlangWords(x, LANGUAGE))
df[['resultCaseFolding', 'resultReplaceSlang']].head(20)

Unnamed: 0,resultCaseFolding,resultReplaceSlang
0,saran sih bikin harga ionic sama kayak brio in...,saran sih bikin harga ionic sama kayak brio in...
1,problem subsidi kualitas diturunin harga dinai...,problem subsidi kualitas diturunin harga dinai...
2,baik kualitas kembang dulu baik kualitas motor...,baik kualitas kembang dulu baik kualitas motor...
3,model jelek kwalitas buruk harga mahal croot,model jelek kwalitas buruk harga mahal croot
4,syarat ngaco woy anak muda blom punya rumah bl...,syarat ngaco woy anak muda blom punya rumah bl...
5,harga motor mahal masa harga mirip motor beat ...,harga motor mahal masa harga mirip motor beat ...
6,mol keren yah berita plus padahal mol lokal me...,mol keren yah berita plus padahal mol lokal me...
7,proses kenal produk baru butuh waktu ganti ken...,proses kenal produk baru butuh waktu ganti ken...
8,subsidi tepat sasar,subsidi tepat sasar
9,adil rata terima subsidi jangan jangan pajak p...,adil rata terima subsidi jangan jangan pajak p...


### Tokenizing

In [27]:
def tokenize(comment):
    try:
        words = comment.split(' ')
        words = list(filter(None, words)) 

        return words
    except Exception as e:
        print("Err: Failed to tokenize due to", str(e))
        return comment

In [28]:
df['resultTokenize'] = df['resultReplaceSlang'].apply(tokenize)
df[['resultReplaceSlang', 'resultTokenize']].head(20)

Unnamed: 0,resultReplaceSlang,resultTokenize
0,saran sih bikin harga ionic sama kayak brio in...,"[saran, sih, bikin, harga, ionic, sama, kayak,..."
1,problem subsidi kualitas diturunin harga dinai...,"[problem, subsidi, kualitas, diturunin, harga,..."
2,baik kualitas kembang dulu baik kualitas motor...,"[baik, kualitas, kembang, dulu, baik, kualitas..."
3,model jelek kwalitas buruk harga mahal croot,"[model, jelek, kwalitas, buruk, harga, mahal, ..."
4,syarat ngaco woy anak muda blom punya rumah bl...,"[syarat, ngaco, woy, anak, muda, blom, punya, ..."
5,harga motor mahal masa harga mirip motor beat ...,"[harga, motor, mahal, masa, harga, mirip, moto..."
6,mol keren yah berita plus padahal mol lokal me...,"[mol, keren, yah, berita, plus, padahal, mol, ..."
7,proses kenal produk baru butuh waktu ganti ken...,"[proses, kenal, produk, baru, butuh, waktu, ga..."
8,subsidi tepat sasar,"[subsidi, tepat, sasar]"
9,adil rata terima subsidi jangan jangan pajak p...,"[adil, rata, terima, subsidi, jangan, jangan, ..."


### Stop Word Removal

In [29]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
def stopwordRemoval(comments, language):
    language_mapping = {
        'en': 'english', 
        'ind': 'indonesian'
    }

    nltk_language = language_mapping.get(language, 'english') 
    stopWordRemoved = []
    
    try:
        stopList = stopwords.words(nltk_language)
        filename = f'./stopword/stopword-{language}.txt' 
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, 'r') as f:
                stopList.extend(f.read().split('\n')) 
        for word in comments:
            if word not in stopList:
                stopWordRemoved.append(word)

        return stopWordRemoved  
    except Exception as e:  
        print(f"Err: Failed to remove stopwords due to {str(e)}")
        return comments 

In [31]:
df['resultStopword'] = df['resultTokenize'].apply(lambda x: stopwordRemoval(x, LANGUAGE))
df[['resultTokenize', 'resultStopword']].head(20)

Unnamed: 0,resultTokenize,resultStopword
0,"[saran, sih, bikin, harga, ionic, sama, kayak,...","[saran, sih, bikin, harga, ionic, kayak, brio,..."
1,"[problem, subsidi, kualitas, diturunin, harga,...","[problem, subsidi, kualitas, diturunin, harga,..."
2,"[baik, kualitas, kembang, dulu, baik, kualitas...","[kualitas, kembang, kualitas, motor, motor, pa..."
3,"[model, jelek, kwalitas, buruk, harga, mahal, ...","[model, jelek, kwalitas, buruk, harga, mahal, ..."
4,"[syarat, ngaco, woy, anak, muda, blom, punya, ...","[syarat, ngaco, woy, anak, muda, blom, rumah, ..."
5,"[harga, motor, mahal, masa, harga, mirip, moto...","[harga, motor, mahal, harga, motor, beat, kual..."
6,"[mol, keren, yah, berita, plus, padahal, mol, ...","[mol, keren, yah, berita, plus, mol, lokal, me..."
7,"[proses, kenal, produk, baru, butuh, waktu, ga...","[proses, kenal, produk, butuh, ganti, kendara,..."
8,"[subsidi, tepat, sasar]","[subsidi, sasar]"
9,"[adil, rata, terima, subsidi, jangan, jangan, ...","[adil, terima, subsidi, pajak, pph, msh, subsi..."


### Hapus data kosong ([])

In [32]:
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)]
df[['resultStopword']]

Unnamed: 0,resultStopword
0,"[saran, sih, bikin, harga, ionic, kayak, brio,..."
1,"[problem, subsidi, kualitas, diturunin, harga,..."
2,"[kualitas, kembang, kualitas, motor, motor, pa..."
3,"[model, jelek, kwalitas, buruk, harga, mahal, ..."
4,"[syarat, ngaco, woy, anak, muda, blom, rumah, ..."
...,...
1512,"[kabar, negri, negara]"
1513,"[anak, sekolah, bantu, pasar, klw, klw, jebak,..."
1514,"[esemka, bangga, solo]"
1515,"[cerdas, orang, hny, dn, pasar, jln, x, blom, ..."


### Stemming

In [33]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [34]:
def stemmed_wrapper(term, language):
    try:
        if language == 'ind':
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            return stemmer.stem(term)
        else: 
            stemmer = PorterStemmer()
            return stemmer.stem(term)
    except Exception as e:
        print(f"Err: Failed to stem term '{term}' due to {str(e)}")
        return term 

def stemming(document, language):
    return [stemmed_wrapper(term, language) for term in document]

In [35]:
df['resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['resultStemming'] = df['resultStopword'].apply(lambda x: stemming(x, LANGUAGE))


In [36]:
df[['resultStopword', 'resultStemming']]

Unnamed: 0,resultStopword,resultStemming
0,"[saran, sih, bikin, harga, ionic, kayak, brio,...","[saran, sih, bikin, harga, ionic, kayak, brio,..."
1,"[problem, subsidi, kualitas, diturunin, harga,...","[problem, subsidi, kualitas, diturunin, harga,..."
2,"[kualitas, kembang, kualitas, motor, motor, pa...","[kualitas, kembang, kualitas, motor, motor, pa..."
3,"[model, jelek, kwalitas, buruk, harga, mahal, ...","[model, jelek, kwalitas, buruk, harga, mahal, ..."
4,"[syarat, ngaco, woy, anak, muda, blom, rumah, ...","[syarat, ngaco, woy, anak, muda, blom, rumah, ..."
...,...,...
1512,"[kabar, negri, negara]","[kabar, negri, negara]"
1513,"[anak, sekolah, bantu, pasar, klw, klw, jebak,...","[anak, sekolah, bantu, pasar, klw, klw, jebak,..."
1514,"[esemka, bangga, solo]","[esemka, bangga, solo]"
1515,"[cerdas, orang, hny, dn, pasar, jln, x, blom, ...","[cerdas, orang, hny, dn, pasar, jln, x, blom, ..."


### Export clean data

In [55]:
df.to_csv(
    './output/CLEAN-dataset_tweet_sentimen_tayangan_tv.csv',
    columns=[
        'Id',
        'Sentiment',
        'Acara TV',
        'Jumlah Retweet',
        'Text Tweet',
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)

In [97]:
df.to_csv(
    './output/CLEAN-dataset_tweet_sentiment_cellular_service_provider.csv',
    columns=df.columns,
    index=False
)

In [130]:
df.to_csv(
    './output/CLEAN-dataset_tweet_sentiment_opini_film.csv',
    columns=df.columns,
    index=False
)

In [38]:
df.to_csv(
    '../output/CLEAN-dataset_mobil_listrik.csv',
    columns=df.columns,
    index=False
)

In [68]:
df.to_csv(
    './output/data-clean-ind-new.csv', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)

In [67]:
df.to_excel(
    './output/data-clean-ind-new.xlsx', 
    columns=[
        'Comment', 
        'cleanComment',
        'resultCaseFolding',
        'resultReplaceSlang',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)