### Open .csv file

In [1]:
import pandas as pd

In [98]:
# Load the film-opinion tweet dataset (UTF-8 encoded CSV) and display it.
df = pd.read_csv(
    './dataset/dataset_tweet_sentiment_opini_film.csv',
    encoding='utf-8',
)
df

Unnamed: 0,Id,Sentiment,Text Tweet
0,1,negative,Jelek filmnya... apalagi si ernest gak mutu bg...
1,2,negative,Film king Arthur ini film paling jelek dari se...
2,3,negative,@beexkuanlin Sepanjang film gwa berkata kasar ...
3,4,negative,Ane ga suka fast and furious..menurutku kok je...
4,5,negative,"@baekhyun36 kan gua ga tau film nya, lu bilang..."
...,...,...,...
195,196,positive,Fargo juga adaptasi dari film yang cukup berha...
196,197,positive,637.000 waw ini sangat keren flm horor dng jum...
197,198,positive,@filmziarah film yang tenang dan menghanyutkan...
198,199,positive,Film yg amat menarik. Kisah cinta & kesetiaan ...


In [99]:
# Language code used by the slang, stop-word and stemming steps:
# 'ind' (Indonesian) or 'en' (English).
LANGUAGE = 'ind'

### Cleaning

In [100]:
import re

In [113]:
def cleanComment(comment):
    """Remove URLs, mentions, punctuation, digits and line-leading Roman numerals.

    Hyphens that join two word characters (e.g. "kawan-kawan") are preserved;
    all runs of whitespace are collapsed to a single space.

    Args:
        comment: Raw text to clean.

    Returns:
        The cleaned, stripped text. If cleaning fails (for example because
        `comment` is not a string) the error is printed and `comment` is
        returned as it was at the point of failure.
    """
    try:
        # Protect intra-word hyphens (e.g. "kawan-kawan") behind a sentinel made
        # of word characters, so the removal pass below treats the compound as
        # one token. The sentinel is lengthened until it does not occur in the
        # input; otherwise text that literally contains "STRIP" would be turned
        # into "-" when the sentinel is swapped back.
        sentinel = 'STRIP'
        while sentinel in comment:
            sentinel += 'P'
        comment = re.sub(r'(?<=\w)-(?=\w)', sentinel, comment)

        # Drop URLs entirely.
        comment = re.sub(r'http\S+|www\S+', '', comment)

        # Blank out mentions, punctuation, stray hyphens, digits and Roman
        # numerals that open a line. The trailing \b restricts the last
        # alternative to a whole token, so capitalised words such as "Cinta"
        # or "Dia" at the start of a line keep their first letters.
        comment = re.sub(
            r'@\w+|[^\w\s-]|(?<!\w)-(?!\w)|\d+|(?<=\n)[IVXLCDM]+\b',
            ' ',
            comment,
        )

        # Collapsing whitespace also removes newlines, so no separate
        # newline replacement is needed afterwards.
        comment = re.sub(r'\s+', ' ', comment)
        return comment.replace(sentinel, '-').strip()
    except Exception as e:
        print(f"Err: Failed to clean comments due to {str(e)}")
        return comment

In [114]:
# Clean every raw tweet, then preview the original text next to the cleaned text.
df['cleanComment'] = df['Text Tweet'].apply(cleanComment)
df.loc[:, ['Text Tweet', 'cleanComment']].head(20)

Unnamed: 0,Text Tweet,cleanComment
0,Jelek filmnya... apalagi si ernest gak mutu bg...,Jelek filmnya apalagi si ernest gak mutu bgt a...
1,Film king Arthur ini film paling jelek dari se...,Film king Arthur ini film paling jelek dari se...
2,@beexkuanlin Sepanjang film gwa berkata kasar ...,Sepanjang film gwa berkata kasar terus pada ba...
3,Ane ga suka fast and furious..menurutku kok je...,Ane ga suka fast and furious menurutku kok jel...
4,"@baekhyun36 kan gua ga tau film nya, lu bilang...",kan gua ga tau film nya lu bilang perang peran...
5,tolong editingnya yg bagus ya. Saya sering kec...,tolong editingnya yg bagus ya Saya sering kece...
6,Kecewa dgn salah satu aktornya yg ternyata pen...,Kecewa dgn salah satu aktornya yg ternyata pen...
7,Kecewa parah sama film the guys. Dear @raditya...,Kecewa parah sama film the guys Dear sorry to ...
8,Banyak yg kecewa abis nonton film ini :(,Banyak yg kecewa abis nonton film ini
9,#TheMummy 2017 adalah film yang paling memgece...,TheMummy adalah film yang paling memgecewakan ...


### Case folding

In [115]:
def caseFolding(comment):
    """Return `comment` converted to lower case.

    If the conversion fails, the error is printed and `comment` is returned
    unchanged.
    """
    try:
        return comment.lower()
    except Exception as e:
        print(f"Err: Failed to case folding due to {str(e)}")
        return comment

In [116]:
# Lower-case the cleaned text and preview the first ten rows before/after.
df['resultCaseFolding'] = df['cleanComment'].map(caseFolding)
df.loc[:, ['cleanComment', 'resultCaseFolding']].head(10)

Unnamed: 0,cleanComment,resultCaseFolding
0,Jelek filmnya apalagi si ernest gak mutu bgt a...,jelek filmnya apalagi si ernest gak mutu bgt a...
1,Film king Arthur ini film paling jelek dari se...,film king arthur ini film paling jelek dari se...
2,Sepanjang film gwa berkata kasar terus pada ba...,sepanjang film gwa berkata kasar terus pada ba...
3,Ane ga suka fast and furious menurutku kok jel...,ane ga suka fast and furious menurutku kok jel...
4,kan gua ga tau film nya lu bilang perang peran...,kan gua ga tau film nya lu bilang perang peran...
5,tolong editingnya yg bagus ya Saya sering kece...,tolong editingnya yg bagus ya saya sering kece...
6,Kecewa dgn salah satu aktornya yg ternyata pen...,kecewa dgn salah satu aktornya yg ternyata pen...
7,Kecewa parah sama film the guys Dear sorry to ...,kecewa parah sama film the guys dear sorry to ...
8,Banyak yg kecewa abis nonton film ini,banyak yg kecewa abis nonton film ini
9,TheMummy adalah film yang paling memgecewakan ...,themummy adalah film yang paling memgecewakan ...


### Slang Word

In [117]:
import os

In [118]:
def _loadSlangDict(filename):
    """Read `slang,formal` pairs from `filename` into a dict.

    Returns an empty dict when the file is missing or empty. Lines without a
    comma (including blank lines) are skipped instead of aborting the load.
    Everything after the first comma is taken as the formal replacement.
    """
    slang_dict = {}
    if not (os.path.exists(filename) and os.path.getsize(filename) > 0):
        return slang_dict

    # Decode explicitly instead of relying on the platform default encoding;
    # 'utf-8-sig' also discards a leading byte-order mark, which would
    # otherwise become part of the first slang key.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line in f:
            slang, sep, formal = line.strip().partition(',')
            if not sep:
                continue
            slang_dict[slang.strip()] = formal.strip()
    return slang_dict


def replaceSlangWords(comment, language):
    """Replace slang words in `comment` using `./slang/slang-<language>.txt`.

    Args:
        comment: Text whose whitespace-separated words are looked up.
        language: Language code used to build the dictionary file name.

    Returns:
        The words joined by single spaces, with every word found in the slang
        dictionary replaced by its formal form. If anything fails, the error
        is printed and `comment` is returned unchanged.
    """
    try:
        slang_dict = _loadSlangDict(f'./slang/slang-{language}.txt')
        return ' '.join(slang_dict.get(word, word) for word in comment.split())
    except Exception as e:
        print(f"Err: Failed to replace slang words due to {str(e)}")
        return comment

In [119]:
# Normalise slang in the lower-cased text for the configured language,
# then show the text before and after replacement.
df['resultReplaceSlang'] = df['resultCaseFolding'].apply(replaceSlangWords, args=(LANGUAGE,))
df.loc[:, ['resultCaseFolding', 'resultReplaceSlang']]

Unnamed: 0,resultCaseFolding,resultReplaceSlang
0,jelek filmnya apalagi si ernest gak mutu bgt a...,jelek filmnya apalagi si ernest gak mutu bange...
1,film king arthur ini film paling jelek dari se...,film king arthur ini film paling jelek dari se...
2,sepanjang film gwa berkata kasar terus pada ba...,sepanjang film saya berkata kasar terus pada b...
3,ane ga suka fast and furious menurutku kok jel...,saya tidak suka fast and furious menurutku kok...
4,kan gua ga tau film nya lu bilang perang peran...,kan saya tidak tahu film nya kamu bilang peran...
...,...,...
195,fargo juga adaptasi dari film yang cukup berha...,fargo juga adaptasi dari film yang cukup berha...
196,waw ini sangat keren flm horor dng jumlah peno...,waw ini sangat keren flm horor dng jumlah peno...
197,film yang tenang dan menghanyutkan salut denga...,film yang tenang dan menghanyutkan salut denga...
198,film yg amat menarik kisah cinta kesetiaan yg ...,film yang amat menarik kisah cinta kesetiaan y...


### Tokenizing

In [120]:
def tokenize(comment):
    """Split `comment` on single spaces and discard empty pieces.

    If splitting fails, the error is printed and `comment` is returned
    unchanged.
    """
    try:
        return [piece for piece in comment.split(' ') if piece]
    except Exception as e:
        print("Err: Failed to tokenize due to", str(e))
        return comment

In [121]:
# Turn each slang-normalised sentence into a list of tokens and preview the result.
df['resultTokenize'] = df['resultReplaceSlang'].map(tokenize)
df.loc[:, ['resultReplaceSlang', 'resultTokenize']].head(20)

Unnamed: 0,resultReplaceSlang,resultTokenize
0,jelek filmnya apalagi si ernest gak mutu bange...,"[jelek, filmnya, apalagi, si, ernest, gak, mut..."
1,film king arthur ini film paling jelek dari se...,"[film, king, arthur, ini, film, paling, jelek,..."
2,sepanjang film saya berkata kasar terus pada b...,"[sepanjang, film, saya, berkata, kasar, terus,..."
3,saya tidak suka fast and furious menurutku kok...,"[saya, tidak, suka, fast, and, furious, menuru..."
4,kan saya tidak tahu film nya kamu bilang peran...,"[kan, saya, tidak, tahu, film, nya, kamu, bila..."
5,tolong editingnya yang bagus ya saya sering ke...,"[tolong, editingnya, yang, bagus, ya, saya, se..."
6,kecewa dgn salah satu aktornya yang ternyata p...,"[kecewa, dgn, salah, satu, aktornya, yang, ter..."
7,kecewa parah sama film the guys dear sorry to ...,"[kecewa, parah, sama, film, the, guys, dear, s..."
8,banyak yang kecewa abis nonton film ini,"[banyak, yang, kecewa, abis, nonton, film, ini]"
9,themummy adalah film yang paling memgecewakan ...,"[themummy, adalah, film, yang, paling, memgece..."


### Stop Word Removal

In [122]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that the stop-word removal step reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [123]:
def stopwordRemoval(comments, language):
    """Remove stop words from a list of tokens.

    The stop-word list is the NLTK list for the language ('en' maps to
    'english', 'ind' to 'indonesian', anything else falls back to 'english'),
    extended with the entries of `./stopword/stopword-<language>.txt` when
    that file exists and is not empty (one entry per line).

    Args:
        comments: Iterable of tokens.
        language: Language code, 'en' or 'ind'.

    Returns:
        A new list with the tokens that are not stop words, in their original
        order. If anything fails, the error is printed and `comments` is
        returned unchanged.
    """
    language_mapping = {
        'en': 'english',
        'ind': 'indonesian'
    }
    nltk_language = language_mapping.get(language, 'english')

    try:
        # A set makes each membership test constant-time; the original list
        # was scanned in full for every token.
        stopSet = set(stopwords.words(nltk_language))

        filename = f'./stopword/stopword-{language}.txt'
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            # Decode explicitly instead of relying on the platform default;
            # 'utf-8-sig' also discards a leading byte-order mark.
            with open(filename, 'r', encoding='utf-8-sig') as f:
                # Strip each entry so trailing spaces do not prevent a match,
                # and ignore blank lines.
                stopSet.update(line.strip() for line in f if line.strip())

        return [word for word in comments if word not in stopSet]
    except Exception as e:
        print(f"Err: Failed to remove stopwords due to {str(e)}")
        return comments

In [124]:
# Filter stop words out of every token list for the configured language,
# then compare the token lists before and after.
df['resultStopword'] = df['resultTokenize'].apply(stopwordRemoval, args=(LANGUAGE,))
df.loc[:, ['resultTokenize', 'resultStopword']].head(20)

Unnamed: 0,resultTokenize,resultStopword
0,"[jelek, filmnya, apalagi, si, ernest, gak, mut...","[jelek, filmnya, si, ernest, gak, mutu, banget..."
1,"[film, king, arthur, ini, film, paling, jelek,...","[film, king, arthur, film, jelek, cerita, king..."
2,"[sepanjang, film, saya, berkata, kasar, terus,...","[film, kasar, bapaknya]"
3,"[saya, tidak, suka, fast, and, furious, menuru...","[suka, fast, and, furious, menurutku, jelek, y..."
4,"[kan, saya, tidak, tahu, film, nya, kamu, bila...","[film, nya, bilang, perang, perangan, perang, ..."
5,"[tolong, editingnya, yang, bagus, ya, saya, se...","[tolong, editingnya, bagus, ya, kecewa, dgn, f..."
6,"[kecewa, dgn, salah, satu, aktornya, yang, ter...","[kecewa, dgn, salah, aktornya, pendukung, peni..."
7,"[kecewa, parah, sama, film, the, guys, dear, s...","[kecewa, parah, film, the, guys, dear, sorry, ..."
8,"[banyak, yang, kecewa, abis, nonton, film, ini]","[kecewa, abis, nonton, film]"
9,"[themummy, adalah, film, yang, paling, memgece...","[themummy, film, memgecewakan, hidup, yah, sy,..."


### Hapus data kosong ([])

In [125]:
# Keep only the rows that still have at least one token after stop-word removal.
# .copy() makes the filtered result an independent frame, so assigning a new
# column to it in a later cell does not trigger SettingWithCopyWarning on
# pandas versions that emit it.
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)].copy()
df[['resultStopword']]

Unnamed: 0,resultStopword
0,"[jelek, filmnya, si, ernest, gak, mutu, banget..."
1,"[film, king, arthur, film, jelek, cerita, king..."
2,"[film, kasar, bapaknya]"
3,"[suka, fast, and, furious, menurutku, jelek, y..."
4,"[film, nya, bilang, perang, perangan, perang, ..."
...,...
195,"[fargo, adaptasi, film, berhasil, season, -nya..."
196,"[waw, keren, flm, horor, dng, penonton, segini]"
197,"[film, tenang, menghanyutkan, salut, mbah, pon..."
198,"[film, menarik, kisah, cinta, kesetiaan, disaj..."


### Stemming

In [126]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [127]:
# Stemmer instances built so far, keyed by 'ind' / 'default'; filled lazily.
_STEMMER_CACHE = {}


def _getStemmer(language):
    """Return the stemmer for `language`, constructing it only on first use.

    'ind' gets a Sastrawi stemmer; every other value gets NLTK's PorterStemmer.
    """
    key = 'ind' if language == 'ind' else 'default'
    if key not in _STEMMER_CACHE:
        if key == 'ind':
            _STEMMER_CACHE[key] = StemmerFactory().create_stemmer()
        else:
            _STEMMER_CACHE[key] = PorterStemmer()
    return _STEMMER_CACHE[key]


def stemmed_wrapper(term, language):
    """Stem a single term.

    Args:
        term: The word to stem.
        language: 'ind' selects the Sastrawi stemmer; any other value selects
            the Porter stemmer.

    Returns:
        The stemmed term. If stemming fails, the error is printed and `term`
        is returned unchanged.
    """
    try:
        # Reuse one stemmer per language instead of constructing a new factory
        # and stemmer for every single term.
        return _getStemmer(language).stem(term)
    except Exception as e:
        print(f"Err: Failed to stem term '{term}' due to {str(e)}")
        return term

def stemming(document, language):
    """Stem every term of `document` and return the results as a list, in order."""
    stemmed_terms = []
    for term in document:
        stemmed_terms.append(stemmed_wrapper(term, language))
    return stemmed_terms

In [128]:
# Stem the remaining tokens of every row for the configured language.
df['resultStemming'] = df['resultStopword'].apply(stemming, args=(LANGUAGE,))

In [129]:
# Show the token lists before and after stemming.
df.loc[:, ['resultStopword', 'resultStemming']]

Unnamed: 0,resultStopword,resultStemming
0,"[jelek, filmnya, si, ernest, gak, mutu, banget...","[jelek, film, si, ernest, gak, mutu, banget, a..."
1,"[film, king, arthur, film, jelek, cerita, king...","[film, king, arthur, film, jelek, cerita, king..."
2,"[film, kasar, bapaknya]","[film, kasar, bapak]"
3,"[suka, fast, and, furious, menurutku, jelek, y...","[suka, fast, and, furious, turut, jelek, ya, t..."
4,"[film, nya, bilang, perang, perangan, perang, ...","[film, nya, bilang, perang, perang, perang, an..."
...,...,...
195,"[fargo, adaptasi, film, berhasil, season, -nya...","[fargo, adaptasi, film, hasil, season, -nya, t..."
196,"[waw, keren, flm, horor, dng, penonton, segini]","[waw, keren, flm, horor, dng, tonton, gin]"
197,"[film, tenang, menghanyutkan, salut, mbah, pon...","[film, tenang, hanyut, salut, mbah, ponco, baw..."
198,"[film, menarik, kisah, cinta, kesetiaan, disaj...","[film, tarik, kisah, cinta, setia, saji, bikin..."


### Export clean data

In [55]:
# Export selected columns to the TV-programme output file, without the index.
# NOTE(review): 'Acara TV' and 'Jumlah Retweet' are not created by any visible
# cell and are not among the displayed columns of the frame loaded at the top;
# presumably this cell is left over from a run on a different dataset —
# confirm before executing it against the current frame.
df.to_csv(
    './output/CLEAN-dataset_tweet_sentimen_tayangan_tv.csv',
    index=False,
    columns=[
        'Id', 'Sentiment', 'Acara TV', 'Jumlah Retweet', 'Text Tweet',
        'cleanComment', 'resultCaseFolding', 'resultReplaceSlang',
        'resultTokenize', 'resultStopword', 'resultStemming',
    ],
)

In [97]:
# Export every column of the current frame to the cellular-service-provider
# output file, without the index.
# NOTE(review): the file name refers to a different dataset than the one loaded
# at the top, and this cell's execution count (97) precedes that load (98);
# presumably a leftover from an earlier run — confirm before re-running.
df.to_csv('./output/CLEAN-dataset_tweet_sentiment_cellular_service_provider.csv', index=False)

In [130]:
# Export every column of the processed frame to the film-opinion output file,
# without the index. The output directory is created first so the write does
# not fail when ./output does not exist yet.
os.makedirs('./output', exist_ok=True)
df.to_csv('./output/CLEAN-dataset_tweet_sentiment_opini_film.csv', index=False)

In [68]:
# Export the raw comment and every processing stage to CSV, without the index.
# NOTE(review): no visible cell creates a 'Comment' column and it is not among
# the displayed columns of the loaded frame; presumably this cell belongs to a
# run on a different dataset — confirm before executing.
df.to_csv(
    './output/data-clean-ind-new.csv',
    index=False,
    columns=[
        'Comment', 'cleanComment', 'resultCaseFolding', 'resultReplaceSlang',
        'resultTokenize', 'resultStopword', 'resultStemming',
    ],
)

In [67]:
# Export the raw comment and every processing stage to Excel, without the index.
# NOTE(review): no visible cell creates a 'Comment' column and it is not among
# the displayed columns of the loaded frame; presumably this cell belongs to a
# run on a different dataset — confirm before executing.
df.to_excel(
    './output/data-clean-ind-new.xlsx',
    index=False,
    columns=[
        'Comment', 'cleanComment', 'resultCaseFolding', 'resultReplaceSlang',
        'resultTokenize', 'resultStopword', 'resultStemming',
    ],
)